1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/magic.h>
7 #include <pthread.h>
8 #include <stddef.h>
9 #include <sys/mman.h>
10 #include <sys/statvfs.h>
11 #include <sys/uio.h>
12 #include <unistd.h>
13 
14 #include "sd-event.h"
15 
16 #include "alloc-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "env-util.h"
20 #include "fd-util.h"
21 #include "format-util.h"
22 #include "fs-util.h"
23 #include "journal-authenticate.h"
24 #include "journal-def.h"
25 #include "journal-file.h"
26 #include "lookup3.h"
27 #include "memory-util.h"
28 #include "path-util.h"
29 #include "random-util.h"
30 #include "set.h"
31 #include "sort-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "sync-util.h"
37 #include "xattr-util.h"
38 
/* Default byte sizes of the data and field hash tables, expressed as a number of HashItem slots. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Default and minimum compression threshold.
 * NOTE(review): presumably a payload size in bytes; the users are outside this excerpt, confirm there. */
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL)             /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define MAX_USE_LOWER (1 * 1024 * 1024ULL)                /* 1 MiB */
#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL)         /* 4 GiB */

/* These are the lower and upper bounds for the minimal use limit,
 * i.e. how much we'll use even if keep_free suggests otherwise. */
#define MIN_USE_LOW (1 * 1024 * 1024ULL)                  /* 1 MiB */
#define MIN_USE_HIGH (16 * 1024 * 1024ULL)                /* 16 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL)             /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL)       /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024 * 1024ULL)                /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES 100

/* n_data was the first entry we added after the initial file format design, hence a valid header
 * extends at least up to (but not necessarily including) that field. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL)          /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header; we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* Longest hash chain to rotate after */
#define HASH_CHAIN_DEPTH_MAX 100

/* Under clang, silence the diagnostic about taking the address of a member of a packed struct for the
 * remainder of this file. */
#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
93 
/* Determines the file offset right behind the last object of the journal file and stores it in
 * *ret_offset. Twin of journal_file_tail_end_by_mmap(), except that the tail object's header is fetched
 * with journal_file_read_object_header() so that the mmap cache is not involved (and the call is hence
 * thread safe).
 *
 * Returns 0 on success, -EBADMSG if the recorded object size would make the offset arithmetic wrap
 * around, or whatever error journal_file_read_object_header() reports. *ret_offset is written on
 * success only. */
int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset) {
        uint64_t tail_offset, tail_size;
        Object tail;
        int r;

        assert(f);
        assert(f->header);
        assert(ret_offset);

        tail_offset = le64toh(f->header->tail_object_offset);
        if (tail_offset == 0) {
                /* No tail object recorded, hence the end is where the header ends. */
                *ret_offset = le64toh(f->header->header_size);
                return 0;
        }

        r = journal_file_read_object_header(f, OBJECT_UNUSED, tail_offset, &tail);
        if (r < 0)
                return r;

        /* Refuse sizes for which ALIGN64() below would overflow. */
        tail_size = le64toh(tail.object.size);
        if (tail_size > UINT64_MAX - sizeof(uint64_t) + 1)
                return -EBADMSG;
        tail_size = ALIGN64(tail_size);

        /* Likewise refuse if offset + aligned size does not fit into 64 bits. */
        if (tail_size > UINT64_MAX - tail_offset)
                return -EBADMSG;

        *ret_offset = tail_offset + tail_size;
        return 0;
}
131 
/* Determines the file offset right behind the last object of the journal file and stores it in
 * *ret_offset. Twin of journal_file_tail_end_by_pread(), but the tail object is looked up through the
 * regular mmap-based journal_file_move_to_object() path.
 *
 * Returns 0 on success, -EBADMSG if the recorded object size would make the offset arithmetic wrap
 * around, or whatever error journal_file_move_to_object() reports. *ret_offset is written on success
 * only. */
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset) {
        uint64_t tail_offset, tail_size;
        Object *tail;
        int r;

        assert(f);
        assert(f->header);
        assert(ret_offset);

        tail_offset = le64toh(f->header->tail_object_offset);
        if (tail_offset == 0) {
                /* No tail object recorded, hence the end is where the header ends. */
                *ret_offset = le64toh(f->header->header_size);
                return 0;
        }

        r = journal_file_move_to_object(f, OBJECT_UNUSED, tail_offset, &tail);
        if (r < 0)
                return r;

        /* Take a single snapshot of the size field (READ_NOW), then refuse values for which ALIGN64()
         * below would overflow. */
        tail_size = le64toh(READ_NOW(tail->object.size));
        if (tail_size > UINT64_MAX - sizeof(uint64_t) + 1)
                return -EBADMSG;
        tail_size = ALIGN64(tail_size);

        /* Likewise refuse if offset + aligned size does not fit into 64 bits. */
        if (tail_size > UINT64_MAX - tail_offset)
                return -EBADMSG;

        *ret_offset = tail_offset + tail_size;
        return 0;
}
168 
/* Joins the offline thread of the journal file, unless it is already marked as joined.
 *
 * Returns 0 if there was nothing to join or the join succeeded, the negated pthread_join() error code
 * if joining failed, or -EIO if the mmap cache saw a SIGBUS on this file. Note that in the -EIO case
 * the thread has been joined and the state is already OFFLINE_JOINED. */
int journal_file_set_offline_thread_join(JournalFile *f) {
        int err;

        assert(f);

        if (f->offline_state != OFFLINE_JOINED) {
                /* pthread_join() reports failures as positive error numbers */
                err = pthread_join(f->offline_thread, NULL);
                if (err != 0)
                        return -err;

                f->offline_state = OFFLINE_JOINED;

                if (mmap_cache_fd_got_sigbus(f->cache_fd))
                        return -EIO;
        }

        return 0;
}
188 
/* Brings a writable journal file into the online state.
 *
 * First any offlining that is in flight is cancelled (or waited for, where cancelling is not possible
 * anymore), then the header state is switched from STATE_OFFLINE to STATE_ONLINE and the file is
 * fsync()ed (best effort, the result is ignored).
 *
 * Returns 0 on success (including when the file was online already), -EPERM if the file is not
 * writable, -EINVAL if there is no fd/header or the header carries any other state, -EIO if the mmap
 * cache saw a SIGBUS, or the error from joining the offline thread. */
static int journal_file_set_online(JournalFile *f) {
        bool done = false;

        assert(f);

        if (!journal_file_writable(f))
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        /* The offline state may be changed behind our back, hence the transitions are done with
         * compare-and-swap. Whenever a swap fails we go around once more and look at the new state. */
        do {
                switch (f->offline_state) {

                case OFFLINE_JOINED:
                        /* There is no offline thread, nothing to wait for. */
                        done = true;
                        break;

                case OFFLINE_SYNCING:
                        /* If the swap succeeds, syncing was cancelled before offlining began: no wait. */
                        done = __sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL);
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        /* If the swap succeeds, the restart from syncing was cancelled: no wait. */
                        done = __sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL);
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                break; /* state changed meanwhile, re-evaluate */

                        /* The restart is cancelled, but the offlining in progress has to finish first. */
                        _fallthrough_;
                default: {
                        int r;

                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        done = true;
                        break;
                }
                }
        } while (!done);

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        switch (f->header->state) {

        case STATE_OFFLINE:
                f->header->state = STATE_ONLINE;
                (void) fsync(f->fd);
                return 0;

        case STATE_ONLINE:
                /* Already there */
                return 0;

        default:
                return -EINVAL;
        }
}
255 
/* Releases all resources attached to a JournalFile object and frees the object itself. Accepts NULL,
 * in which case nothing happens. The file descriptor is closed only if f->close_fd is set.
 *
 * Always returns NULL, so that callers can write "f = journal_file_close(f);". */
JournalFile* journal_file_close(JournalFile *f) {
        if (f == NULL)
                return NULL;

        /* Detach from the mmap cache, if we were ever attached */
        if (f->cache_fd != NULL)
                mmap_cache_fd_free(f->cache_fd);

        if (f->close_fd)
                safe_close(f->fd);

        free(f->path);
        ordered_hashmap_free_free(f->chain_cache);

#if HAVE_COMPRESSION
        free(f->compress_buffer);
#endif

#if HAVE_GCRYPT
        /* If an FSS file mapping exists it is unmapped; otherwise the FSPRG state is released with
         * free(). */
        if (f->fss_file != NULL)
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
        else
                free(f->fsprg_state);

        free(f->fsprg_seed);

        if (f->hmac != NULL)
                gcry_md_close(f->hmac);
#endif

        return mfree(f);
}
287 
/* Writes a freshly initialized header to the beginning of the journal file f.
 *
 * file_flags: JOURNAL_COMPRESS turns on the incompatible flag of the default compression, JOURNAL_SEAL
 *             requests sealing (honoured only when built with gcrypt and the FSS state can be loaded).
 * template:   optional; if given, the sequence number ID and the tail entry sequence number are
 *             inherited from its header, otherwise the sequence number ID equals the new file ID.
 *
 * Returns 0 on success, a negative errno-style error if generating the file ID or writing fails, and
 * -EIO on a short write. */
static int journal_file_init_header(JournalFile *f, JournalFileFlags file_flags, JournalFile *template) {
        Header h = {};
        uint32_t incompatible = 0;
        bool keyed_hash = true, seal = false;
        ssize_t n_written;
        int r;

        assert(f);

        /* Keyed hashes are on by default; $SYSTEMD_JOURNAL_KEYED_HASH allows turning them off. An
         * unparsable value is logged and ignored, -ENXIO is ignored silently. */
        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
        if (r >= 0)
                keyed_hash = r;
        else if (r != -ENXIO)
                log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring: %m");

#if HAVE_GCRYPT
        /* Sealing requires the FSPRG state; if it cannot be loaded we quietly go without sealing. */
        seal = FLAGS_SET(file_flags, JOURNAL_SEAL) && journal_file_fss_load(f) >= 0;
#endif

        memcpy(h.signature, HEADER_SIGNATURE, 8);
        h.header_size = htole64(ALIGN64(sizeof(h)));

        if (FLAGS_SET(file_flags, JOURNAL_COMPRESS))
                incompatible |= COMPRESSION_TO_HEADER_INCOMPATIBLE_FLAG(DEFAULT_COMPRESSION);
        if (keyed_hash)
                incompatible |= HEADER_INCOMPATIBLE_KEYED_HASH;
        h.incompatible_flags |= htole32(incompatible);

        h.compatible_flags = htole32(seal ? HEADER_COMPATIBLE_SEALED : 0);

        r = sd_id128_randomize(&h.file_id);
        if (r < 0)
                return r;

        if (!template)
                h.seqnum_id = h.file_id;
        else {
                h.seqnum_id = template->header->seqnum_id;
                h.tail_entry_seqnum = template->header->tail_entry_seqnum;
        }

        n_written = pwrite(f->fd, &h, sizeof(h), 0);
        if (n_written < 0)
                return -errno;
        if ((size_t) n_written != sizeof(h))
                return -EIO;

        return 0;
}
340 
/* Stamps the current machine ID and boot ID into the header, switches the file online and syncs it.
 *
 * If no machine ID is available (-ENOENT or -ENOMEDIUM) the field is zeroed and we carry on.
 *
 * Returns the result of journal_file_set_online(), or an earlier error from querying the IDs. */
static int journal_file_refresh_header(JournalFile *f) {
        int r;

        assert(f);
        assert(f->header);

        r = sd_id128_get_machine(&f->header->machine_id);
        if (r < 0) {
                if (!IN_SET(r, -ENOENT, -ENOMEDIUM))
                        return r;

                /* No machine ID around, go on with an all-zero one */
                zero(f->header->machine_id);
        }

        r = sd_id128_get_boot(&f->header->boot_id);
        if (r < 0)
                return r;

        r = journal_file_set_online(f);

        /* Regardless of the outcome above, push things to disk. Since the file has likely just been
         * created, fsync_full() is used so that the directory the file is located in is synced too.
         * Failures here are deliberately ignored. */
        (void) fsync_full(f->fd);

        return r;
}
366 
warn_wrong_flags(const JournalFile * f,bool compatible)367 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
368         const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
369                 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
370         const char *type = compatible ? "compatible" : "incompatible";
371         uint32_t flags;
372 
373         flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
374 
375         if (flags & ~supported) {
376                 if (flags & ~any)
377                         log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
378                                   f->path, type, flags & ~any);
379                 flags = (flags & any) & ~supported;
380                 if (flags) {
381                         const char* strv[5];
382                         size_t n = 0;
383                         _cleanup_free_ char *t = NULL;
384 
385                         if (compatible) {
386                                 if (flags & HEADER_COMPATIBLE_SEALED)
387                                         strv[n++] = "sealed";
388                         } else {
389                                 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
390                                         strv[n++] = "xz-compressed";
391                                 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
392                                         strv[n++] = "lz4-compressed";
393                                 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
394                                         strv[n++] = "zstd-compressed";
395                                 if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
396                                         strv[n++] = "keyed-hash";
397                         }
398                         strv[n] = NULL;
399                         assert(n < ELEMENTSOF(strv));
400 
401                         t = strv_join((char**) strv, ", ");
402                         log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
403                                   f->path, type, n > 1 ? "flags" : "flag", strnull(t));
404                 }
405                 return true;
406         }
407 
408         return false;
409 }
410 
/* Validates the header of a journal file before the file is used.
 *
 * Returns 0 if the header is acceptable, otherwise a negative errno-style error:
 *   -EBADMSG          wrong signature, state out of range, header too small, sealed file whose header
 *                     lacks n_entry_arrays, or (writable only) an empty data/field hash table
 *   -EPROTONOSUPPORT  unsupported incompatible flags, or (writable only) unsupported compatible flags
 *   -ENODATA          header + arena exceed the file size, the tail object lies beyond the arena, or
 *                     one of the header offsets fails VALID64()
 *   -EHOSTDOWN        (writable only) the file belongs to a different machine
 *   -ESHUTDOWN        (writable only) the file is archived
 *   -EBUSY            (writable only) the file is marked online or carries an unexpected state
 *   -ETXTBSY          (writable only) the newest entry is dated in the future
 * plus any other error sd_id128_get_machine() may return. */
static int journal_file_verify_header(JournalFile *f) {
        uint64_t arena_size, header_size;

        assert(f);
        assert(f->header);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with incompatible
         * flags we don't know. */
        if (warn_wrong_flags(f, false))
                return -EPROTONOSUPPORT;

        /* When open for writing we refuse to open files with compatible flags, too. */
        if (journal_file_writable(f) && warn_wrong_flags(f, true))
                return -EPROTONOSUPPORT;

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        header_size = le64toh(READ_NOW(f->header->header_size));

        /* The first addition was n_data, so check that we are at least this large */
        if (header_size < HEADER_SIZE_MIN)
                return -EBADMSG;

        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        arena_size = le64toh(READ_NOW(f->header->arena_size));

        /* Header plus arena must neither overflow nor reach beyond the end of the file */
        if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
                return -ENODATA;

        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        if (journal_file_writable(f)) {
                sd_id128_t machine_id;
                uint8_t state;
                int r;

                r = sd_id128_get_machine(&machine_id);
                if (IN_SET(r, -ENOENT, -ENOMEDIUM))
                        /* We don't have a machine ID (yet). journal_file_refresh_header() stores an
                         * all-zero ID in exactly this situation, hence compare against that instead of
                         * refusing the file outright, so that such files can be reopened for writing. */
                        zero(machine_id);
                else if (r < 0)
                        return r;

                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN; /* Already archived */
                else if (state == STATE_ONLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s is already online. Assuming unclean closing.",
                                               f->path);
                else if (state != STATE_OFFLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s has unknown state %i.",
                                               f->path, state);

                if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
                        return -EBADMSG;

                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
                 * bisection. */
                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
                        return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
                                               "Journal file %s is from the future, refusing to append new data to it that'd be older.",
                                               f->path);
        }

        return 0;
}
494 
/* Refreshes f->last_stat via fstat() and records when that happened in f->last_stat_usec.
 *
 * Returns 0 on success, -errno if fstat() fails, the error from stat_verify_regular() if the file is
 * not a regular file, or -EIDRM if the file has no links left, i.e. was deleted. Note that last_stat
 * and last_stat_usec are updated even when one of the latter two checks fails. */
int journal_file_fstat(JournalFile *f) {
        int r;

        assert(f);
        assert(f->fd >= 0);

        if (fstat(f->fd, &f->last_stat) < 0)
                return -errno;

        f->last_stat_usec = now(CLOCK_MONOTONIC);

        /* We only deal with regular files */
        r = stat_verify_regular(&f->last_stat);
        if (r < 0)
                return r;

        /* A link count of zero means the file is gone already, don't append to it */
        return f->last_stat.st_nlink <= 0 ? -EIDRM : 0;
}
517 
/* Makes sure the journal file is large enough to hold "size" bytes at "offset", growing the file and
 * the arena size recorded in the header if necessary.
 *
 * Returns 0 (or the result of journal_file_fstat()) on success, -EINVAL if offset + size is out of
 * range, -EIO if the mmap cache saw a SIGBUS, -EBADMSG if the sizes in the header are bogus, -E2BIG if
 * growing would violate the max_size or keep_free metrics, or the error from posix_fallocate_loop(). */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t header_size, arena_size, current_size, wanted_size;
        int r;

        assert(f);
        assert(f->header);

        /* The file is assumed not to be sparse, which holds because all space is always allocated
         * through posix_fallocate() by us. */

        if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
                return -EINVAL;

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        header_size = le64toh(READ_NOW(f->header->header_size));
        arena_size = le64toh(READ_NOW(f->header->arena_size));
        if (arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - header_size)
                return -EBADMSG;

        current_size = header_size + arena_size;
        wanted_size = MAX(PAGE_ALIGN(offset + size), header_size);

        if (wanted_size <= current_size) {
                /* Enough space is pre-allocated already. Before it gets written to, verify with fstat()
                 * that the file has not been deleted meanwhile, so that data is not thrown away right
                 * away. This is not done for every write though, but only if the last fstat() is at
                 * least LAST_STAT_REFRESH_USEC old. */
                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* The file has to grow. First, honour the size limit... */
        if (f->metrics.max_size > 0 && wanted_size > f->metrics.max_size)
                return -E2BIG;

        /* ...then, once beyond min_size, the amount of disk space to keep free. If fstatvfs() fails
         * this check is skipped. */
        if (wanted_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t usable;

                        usable = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (wanted_size - current_size > usable)
                                return -E2BIG;
                }
        }

        /* Grow in multiples of FILE_SIZE_INCREASE, but never beyond max_size */
        wanted_size = DIV_ROUND_UP(wanted_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && wanted_size > f->metrics.max_size)
                wanted_size = f->metrics.max_size;

        /* The glibc fallback for fallocate() is very inefficient, hence only the area that is actually
         * new is allocated. */
        r = posix_fallocate_loop(f->fd, current_size, wanted_size - current_size);
        if (r < 0)
                return r;

        f->header->arena_size = htole64(wanted_size - header_size);

        return journal_file_fstat(f);
}
591 
/* Maps an object type to the mmap cache context to use for it: each real object type gets a context of
 * its own, everything else (OBJECT_UNUSED and out-of-range values) shares the catch-all context 0. */
static unsigned type_to_context(ObjectType type) {
        /* Compile-time guarantee that all object types and the header context fit */
        assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
        assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);

        if (type <= OBJECT_UNUSED || type >= _OBJECT_TYPE_MAX)
                return 0;

        return type;
}
598 
/* Obtains a pointer to "size" bytes of the file at "offset" from the mmap cache, using the cache
 * context belonging to "type". The pointer is returned in *ret.
 *
 * Returns the result of mmap_cache_fd_get() on success, -EINVAL for an empty range, -EBADMSG if
 * offset + size overflows, -EADDRNOTAVAIL if the range reaches beyond the end of the file, or the
 * error from journal_file_fstat(). */
static int journal_file_move_to(
                JournalFile *f,
                ObjectType type,
                bool keep_always,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        uint64_t end;
        int r;

        assert(f);
        assert(ret);

        if (size == 0)
                return -EINVAL;

        if (offset > UINT64_MAX - size)
                return -EBADMSG;

        end = offset + size;

        /* Accessing a mapping beyond the end of the file would raise SIGBUS, hence check first. If the
         * cached file size says the range is out of bounds, refresh the fstat() data once before
         * believing it. */
        if (end > (uint64_t) f->last_stat.st_size) {
                r = journal_file_fstat(f);
                if (r < 0)
                        return r;

                if (end > (uint64_t) f->last_stat.st_size)
                        return -EADDRNOTAVAIL;
        }

        return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
}
633 
/* Returns the size of the fixed part of the object o according to its type. Types that are unknown or
 * have no entry in the table yield the size of the generic ObjectHeader. */
static uint64_t minimum_header_size(Object *o) {

        static const uint64_t size_by_type[] = {
                [OBJECT_DATA]             = sizeof(DataObject),
                [OBJECT_FIELD]            = sizeof(FieldObject),
                [OBJECT_ENTRY]            = sizeof(EntryObject),
                [OBJECT_DATA_HASH_TABLE]  = sizeof(HashTableObject),
                [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
                [OBJECT_ENTRY_ARRAY]      = sizeof(EntryArrayObject),
                [OBJECT_TAG]              = sizeof(TagObject),
        };

        /* Slots not listed above are zero-initialized, which marks them as "no specific size" */
        if (o->object.type < ELEMENTSOF(size_by_type) && size_by_type[o->object.type] > 0)
                return size_by_type[o->object.type];

        return sizeof(ObjectHeader);
}
651 
652 /* Lightweight object checks. We want this to be fast, so that we won't
653  * slowdown every journal_file_move_to_object() call too much. */
journal_file_check_object(JournalFile * f,uint64_t offset,Object * o)654 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
655         assert(f);
656         assert(o);
657 
658         switch (o->object.type) {
659 
660         case OBJECT_DATA:
661                 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
662                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
663                                                "Bad n_entries: %" PRIu64 ": %" PRIu64,
664                                                le64toh(o->data.n_entries),
665                                                offset);
666 
667                 if (le64toh(o->object.size) <= offsetof(Object, data.payload))
668                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
669                                                "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
670                                                offsetof(Object, data.payload),
671                                                le64toh(o->object.size),
672                                                offset);
673 
674                 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
675                     !VALID64(le64toh(o->data.next_field_offset)) ||
676                     !VALID64(le64toh(o->data.entry_offset)) ||
677                     !VALID64(le64toh(o->data.entry_array_offset)))
678                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
679                                                "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
680                                                le64toh(o->data.next_hash_offset),
681                                                le64toh(o->data.next_field_offset),
682                                                le64toh(o->data.entry_offset),
683                                                le64toh(o->data.entry_array_offset),
684                                                offset);
685 
686                 break;
687 
688         case OBJECT_FIELD:
689                 if (le64toh(o->object.size) <= offsetof(Object, field.payload))
690                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
691                                                "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
692                                                offsetof(Object, field.payload),
693                                                le64toh(o->object.size),
694                                                offset);
695 
696                 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
697                     !VALID64(le64toh(o->field.head_data_offset)))
698                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
699                                                "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
700                                                le64toh(o->field.next_hash_offset),
701                                                le64toh(o->field.head_data_offset),
702                                                offset);
703                 break;
704 
705         case OBJECT_ENTRY: {
706                 uint64_t sz;
707 
708                 sz = le64toh(READ_NOW(o->object.size));
709                 if (sz < offsetof(Object, entry.items) ||
710                     (sz - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0)
711                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
712                                                "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
713                                                offsetof(Object, entry.items),
714                                                sz,
715                                                offset);
716 
717                 if ((sz - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0)
718                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
719                                                "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
720                                                (sz - offsetof(Object, entry.items)) / sizeof(EntryItem),
721                                                offset);
722 
723                 if (le64toh(o->entry.seqnum) <= 0)
724                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
725                                                "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
726                                                le64toh(o->entry.seqnum),
727                                                offset);
728 
729                 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
730                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
731                                                "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
732                                                le64toh(o->entry.realtime),
733                                                offset);
734 
735                 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
736                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
737                                                "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
738                                                le64toh(o->entry.monotonic),
739                                                offset);
740 
741                 break;
742         }
743 
744         case OBJECT_DATA_HASH_TABLE:
745         case OBJECT_FIELD_HASH_TABLE: {
746                 uint64_t sz;
747 
748                 sz = le64toh(READ_NOW(o->object.size));
749                 if (sz < offsetof(Object, hash_table.items) ||
750                     (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 ||
751                     (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0)
752                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
753                                                "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
754                                                o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
755                                                sz,
756                                                offset);
757 
758                 break;
759         }
760 
761         case OBJECT_ENTRY_ARRAY: {
762                 uint64_t sz;
763 
764                 sz = le64toh(READ_NOW(o->object.size));
765                 if (sz < offsetof(Object, entry_array.items) ||
766                     (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
767                     (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
768                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
769                                                "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
770                                                sz,
771                                                offset);
772 
773                 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
774                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
775                                                "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
776                                                le64toh(o->entry_array.next_entry_array_offset),
777                                                offset);
778 
779                 break;
780         }
781 
782         case OBJECT_TAG:
783                 if (le64toh(o->object.size) != sizeof(TagObject))
784                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
785                                                "Invalid object tag size: %" PRIu64 ": %" PRIu64,
786                                                le64toh(o->object.size),
787                                                offset);
788 
789                 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
790                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
791                                                "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
792                                                le64toh(o->tag.epoch), offset);
793 
794                 break;
795         }
796 
797         return 0;
798 }
799 
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
        uint64_t object_size;
        Object *object;
        void *window;
        int r;

        assert(f);

        /* Locates the object stored at 'offset', validates it and, on success, hands a pointer to it back
         * through 'ret' (which may be NULL if the caller only wants the validation). If 'type' is greater
         * than OBJECT_UNUSED the object found must be of exactly that type, otherwise any valid type is
         * accepted. Returns 0 on success. Errors from journal_file_move_to() and
         * journal_file_check_object() are passed through unchanged; every structural problem detected
         * here is reported as EBADMSG via log_debug_errno(). */

        /* Objects always start at a multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* The area covered by the file header never contains objects */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object located in file header: %" PRIu64,
                                       offset);

        /* First pass: we do not know the object's size yet, hence only ask for the generic header. */
        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &window);
        if (r < 0)
                return r;

        object = window;
        object_size = le64toh(READ_NOW(object->object.size));

        if (object_size == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to uninitialized object: %" PRIu64,
                                       offset);

        if (object_size < sizeof(ObjectHeader))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to overly short object: %" PRIu64,
                                       offset);

        if (object->object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object with invalid type: %" PRIu64,
                                       offset);

        /* The recorded size must at least cover the type-specific header */
        if (object_size < minimum_header_size(object))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to truncated object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && object->object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object of unexpected type: %" PRIu64,
                                       offset);

        /* Second pass: the size looks plausible, now ask for the complete object. */
        r = journal_file_move_to(f, type, false, offset, object_size, &window);
        if (r < 0)
                return r;

        object = window;

        r = journal_file_check_object(f, offset, object);
        if (r < 0)
                return r;

        if (ret)
                *ret = object;

        return 0;
}
866 
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) {
        uint64_t object_size;
        Object buffer;
        ssize_t n_read;
        int r;

        assert(f);

        /* Reads the header of the object at 'offset' with pread() into a local buffer, validates it and
         * copies it to '*ret' (if 'ret' is non-NULL). If 'type' is greater than OBJECT_UNUSED the object
         * must be of exactly that type. Returns 0 on success. A failing pread() is reported with its
         * errno, short reads as EIO, structural problems as EBADMSG, and errors from
         * journal_file_check_object() are passed through. */

        /* Objects always start at a multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* The area covered by the file header never contains objects */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object located in file header: %" PRIu64,
                                       offset);

        /* We ask for a full Object's worth of bytes. That is usually more than needed, but it saves us a
         * second pread() once the object type is known. */
        n_read = pread(f->fd, &buffer, sizeof(buffer), offset);
        if (n_read < 0)
                return log_debug_errno(errno, "Failed to read journal file at offset: %" PRIu64,
                                       offset);

        /* Not even the generic header arrived */
        if ((size_t) n_read < sizeof(buffer.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Failed to read short object at offset: %" PRIu64,
                                       offset);

        object_size = le64toh(buffer.object.size);

        if (object_size == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read uninitialized object: %" PRIu64,
                                       offset);

        if (object_size < sizeof(buffer.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read overly short object: %" PRIu64,
                                       offset);

        if (buffer.object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object with invalid type: %" PRIu64,
                                       offset);

        /* The recorded size must at least cover the type-specific header ... */
        if (object_size < minimum_header_size(&buffer))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read truncated object: %" PRIu64,
                                       offset);

        /* ... and so must the number of bytes we actually got from pread() */
        if ((size_t) n_read < minimum_header_size(&buffer))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short read while reading object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && buffer.object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object of unexpected type: %" PRIu64,
                                       offset);

        r = journal_file_check_object(f, offset, &buffer);
        if (r < 0)
                return r;

        if (ret)
                *ret = buffer;

        return 0;
}
937 
static uint64_t journal_file_entry_seqnum(
                JournalFile *f,
                uint64_t *seqnum) {

        uint64_t next;

        assert(f);
        assert(f->header);

        /* Determines the sequence number for the entry that is about to be added, records it in the file
         * header (and in '*seqnum', if an external counter was passed) and returns it. */

        next = le64toh(f->header->tail_entry_seqnum) + 1;

        if (seqnum) {
                /* With an external counter both counters advance together: the new number is the larger
                 * of the two successors, and the external counter is updated to it. */
                uint64_t external_next = *seqnum + 1;

                if (external_next > next)
                        next = external_next;

                *seqnum = next;
        }

        f->header->tail_entry_seqnum = htole64(next);

        /* A head seqnum of zero means none was recorded yet, so this one becomes the head, too */
        if (f->header->head_entry_seqnum == 0)
                f->header->head_entry_seqnum = htole64(next);

        return next;
}
968 
int journal_file_append_object(
                JournalFile *f,
                ObjectType type,
                uint64_t size,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t new_offset;
        Object *new_object;
        void *window;
        int r;

        assert(f);
        assert(f->header);
        assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
        assert(size >= sizeof(ObjectHeader));

        /* Adds a new object of the given type and size to the file, at the position reported by
         * journal_file_tail_end_by_mmap(). Only the generic object header is initialized here; filling in
         * the type-specific part is up to the caller. On success returns 0 and passes back the object
         * pointer in 'ret' and its file offset in 'ret_offset' (either may be NULL). On failure the
         * negative error of the failing helper is returned. */

        r = journal_file_set_online(f);
        if (r < 0)
                return r;

        r = journal_file_tail_end_by_mmap(f, &new_offset);
        if (r < 0)
                return r;

        r = journal_file_allocate(f, new_offset, size);
        if (r < 0)
                return r;

        r = journal_file_move_to(f, type, false, new_offset, size, &window);
        if (r < 0)
                return r;

        new_object = window;

        /* The compound literal zeroes all header fields we do not name explicitly */
        new_object->object = (ObjectHeader) {
                .type = type,
                .size = htole64(size),
        };

        /* Account for the new object in the file header */
        f->header->tail_object_offset = htole64(new_offset);
        f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);

        if (ret_offset)
                *ret_offset = new_offset;

        if (ret)
                *ret = new_object;

        return 0;
}
1019 
static int journal_file_setup_data_hash_table(JournalFile *f) {
        uint64_t table_bytes, object_offset;
        Object *table;
        int r;

        assert(f);
        assert(f->header);

        /* Creates the data hash table object, zeroes its items and records its location and size in the
         * file header. Returns 0 on success, or the error from journal_file_append_object().
         *
         * Sizing: we assume one hash table entry is needed per 768 bytes of journal file, and we never
         * want to exceed a fill level of 75%. The table is dimensioned for the maximum file size on that
         * basis, but never smaller than DEFAULT_DATA_HASH_TABLE_SIZE. */

        table_bytes = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
        if (table_bytes < DEFAULT_DATA_HASH_TABLE_SIZE)
                table_bytes = DEFAULT_DATA_HASH_TABLE_SIZE;

        log_debug("Reserving %"PRIu64" entries in data hash table.", table_bytes / sizeof(HashItem));

        r = journal_file_append_object(
                        f,
                        OBJECT_DATA_HASH_TABLE,
                        offsetof(Object, hash_table.items) + table_bytes,
                        &table,
                        &object_offset);
        if (r < 0)
                return r;

        memzero(table->hash_table.items, table_bytes);

        /* The header points at the item array itself, not at the start of the enclosing object */
        f->header->data_hash_table_size = htole64(table_bytes);
        f->header->data_hash_table_offset = htole64(object_offset + offsetof(Object, hash_table.items));

        return 0;
}
1053 
static int journal_file_setup_field_hash_table(JournalFile *f) {
        uint64_t table_bytes, object_offset;
        Object *table;
        int r;

        assert(f);
        assert(f->header);

        /* Creates the field hash table object, zeroes its items and records its location and size in the
         * file header. Returns 0 on success, or the error from journal_file_append_object().
         *
         * Unlike the data hash table this one has a fixed size, since the number of distinct fields is
         * expected to grow only very slowly. */

        table_bytes = DEFAULT_FIELD_HASH_TABLE_SIZE;

        log_debug("Reserving %"PRIu64" entries in field hash table.", table_bytes / sizeof(HashItem));

        r = journal_file_append_object(
                        f,
                        OBJECT_FIELD_HASH_TABLE,
                        offsetof(Object, hash_table.items) + table_bytes,
                        &table,
                        &object_offset);
        if (r < 0)
                return r;

        memzero(table->hash_table.items, table_bytes);

        /* The header points at the item array itself, not at the start of the enclosing object */
        f->header->field_hash_table_size = htole64(table_bytes);
        f->header->field_hash_table_offset = htole64(object_offset + offsetof(Object, hash_table.items));

        return 0;
}
1082 
int journal_file_map_data_hash_table(JournalFile *f) {
        void *mapping;
        int r;

        assert(f);
        assert(f->header);

        /* Makes the data hash table accessible through f->data_hash_table, using the offset and size
         * recorded in the file header. Does nothing if the pointer is already set. Returns 0 on success,
         * or the error from journal_file_move_to(). */

        if (f->data_hash_table)
                return 0;

        /* NOTE(review): the third argument is 'true' here while ordinary object lookups in this file
         * pass 'false'. Presumably this requests a mapping that is kept around; confirm against
         * journal_file_move_to(). */
        r = journal_file_move_to(
                        f,
                        OBJECT_DATA_HASH_TABLE,
                        true,
                        le64toh(f->header->data_hash_table_offset),
                        le64toh(f->header->data_hash_table_size),
                        &mapping);
        if (r < 0)
                return r;

        f->data_hash_table = mapping;
        return 0;
}
1108 
int journal_file_map_field_hash_table(JournalFile *f) {
        void *mapping;
        int r;

        assert(f);
        assert(f->header);

        /* Makes the field hash table accessible through f->field_hash_table, using the offset and size
         * recorded in the file header. Does nothing if the pointer is already set. Returns 0 on success,
         * or the error from journal_file_move_to(). */

        if (f->field_hash_table)
                return 0;

        /* NOTE(review): the third argument is 'true' here while ordinary object lookups in this file
         * pass 'false'. Presumably this requests a mapping that is kept around; confirm against
         * journal_file_move_to(). */
        r = journal_file_move_to(
                        f,
                        OBJECT_FIELD_HASH_TABLE,
                        true,
                        le64toh(f->header->field_hash_table_offset),
                        le64toh(f->header->field_hash_table_size),
                        &mapping);
        if (r < 0)
                return r;

        f->field_hash_table = mapping;
        return 0;
}
1134 
static int journal_file_link_field(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t n_buckets, bucket, tail;
        int r;

        assert(f);
        assert(f->header);
        assert(f->field_hash_table);
        assert(o);
        assert(offset > 0);

        /* Hooks the field object 'o', located at 'offset', into the field hash table: it becomes the new
         * tail of the chain of bucket 'hash % n_buckets'. Returns 0 on success, -EINVAL if 'o' is not a
         * field object, -EBADMSG if the hash table has no buckets, or the error from
         * journal_file_move_to_object(). */

        if (o->object.type != OBJECT_FIELD)
                return -EINVAL;

        n_buckets = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        /* Reset the link fields of the new object right away: moving to the previous tail object below
         * might alter the window we are looking at. */
        o->field.head_data_offset = 0;
        o->field.next_hash_offset = 0;

        bucket = hash % n_buckets;
        tail = le64toh(f->field_hash_table[bucket].tail_hash_offset);

        if (tail != 0) {
                Object *previous;

                /* The chain is not empty: make the old tail object point to the new one */
                r = journal_file_move_to_object(f, OBJECT_FIELD, tail, &previous);
                if (r < 0)
                        return r;

                previous->field.next_hash_offset = htole64(offset);
        } else
                /* First object in this bucket, hence it is the head as well */
                f->field_hash_table[bucket].head_hash_offset = htole64(offset);

        f->field_hash_table[bucket].tail_hash_offset = htole64(offset);

        /* Only bump the counter if this file's header actually has the field */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);

        return 0;
}
1179 
static int journal_file_link_data(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t n_buckets, bucket, tail;
        int r;

        assert(f);
        assert(f->header);
        assert(f->data_hash_table);
        assert(o);
        assert(offset > 0);

        /* Hooks the data object 'o', located at 'offset', into the data hash table: it becomes the new
         * tail of the chain of bucket 'hash % n_buckets'. Returns 0 on success, -EINVAL if 'o' is not a
         * data object, -EBADMSG if the hash table has no buckets, or the error from
         * journal_file_move_to_object(). */

        if (o->object.type != OBJECT_DATA)
                return -EINVAL;

        n_buckets = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        /* Reset the link fields of the new object right away: moving to the previous tail object below
         * might alter the window we are looking at. */
        o->data.next_field_offset = 0;
        o->data.next_hash_offset = 0;
        o->data.entry_array_offset = 0;
        o->data.entry_offset = 0;
        o->data.n_entries = 0;

        bucket = hash % n_buckets;
        tail = le64toh(f->data_hash_table[bucket].tail_hash_offset);

        if (tail != 0) {
                Object *previous;

                /* The chain is not empty: go back to the old tail object and patch in the pointer to
                 * the new one */
                r = journal_file_move_to_object(f, OBJECT_DATA, tail, &previous);
                if (r < 0)
                        return r;

                previous->data.next_hash_offset = htole64(offset);
        } else
                /* The easy case: first object in this bucket, hence it is the head as well */
                f->data_hash_table[bucket].head_hash_offset = htole64(offset);

        f->data_hash_table[bucket].tail_hash_offset = htole64(offset);

        /* Only bump the counter if this file's header actually has the field */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                f->header->n_data = htole64(le64toh(f->header->n_data) + 1);

        return 0;
}
1230 
static int next_hash_offset(
                JournalFile *f,
                uint64_t *p,
                le64_t *next_hash_offset,
                uint64_t *depth,
                le64_t *header_max_depth) {

        uint64_t following;

        /* Advances the hash chain cursor '*p' to the offset stored in '*next_hash_offset'. A value of 0
         * marks the end of the chain and is stored in '*p' as-is. Otherwise '*depth' is incremented and,
         * if 'header_max_depth' is non-NULL and the file is writable, the maximum chain depth recorded
         * there is raised as needed. Returns 0 on success, or EBADMSG (via log_debug_errno()) if the next
         * offset does not lie beyond the current one; '*p' is left untouched in that case. */

        following = le64toh(READ_NOW(*next_hash_offset));

        if (following == 0) {
                /* End of chain */
                *p = 0;
                return 0;
        }

        /* Offsets must strictly increase along a chain, otherwise we might end up going in loops */
        if (following <= *p)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Detected hash item loop in %s, refusing.", f->path);

        (*depth)++;

        /* Remember the depth of this chain if it is deeper than anything recorded so far */
        if (header_max_depth && journal_file_writable(f))
                *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));

        *p = following;
        return 0;
}
1256 
int journal_file_find_field_object_with_hash(
                JournalFile *f,
                const void *field, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t cursor, expected_size, bucket, n_buckets, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(field && size > 0);

        /* Searches the field hash table for a field object whose payload is exactly the 'size' bytes at
         * 'field'; 'hash' must be the hash of those bytes. Returns 1 if found, in which case the object
         * and its offset are passed back in 'ret' and 'ret_offset' (either may be NULL). Returns 0 if no
         * such object exists, and a negative error otherwise. */

        /* Without a field hash table there is nothing to find */
        if (le64toh(f->header->field_hash_table_size) <= 0)
                return 0;

        /* Make sure the hash table is mapped before we access it */
        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                return r;

        /* A matching object consists of the field object header plus our payload, nothing more */
        expected_size = offsetof(Object, field.payload) + size;

        n_buckets = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        bucket = hash % n_buckets;

        /* Walk the chain of the bucket; next_hash_offset() advances the cursor at the end of each
         * iteration and sets it to 0 once the chain ends. */
        for (cursor = le64toh(f->field_hash_table[bucket].head_hash_offset); cursor > 0;) {
                Object *candidate;
                bool matches;

                r = journal_file_move_to_object(f, OBJECT_FIELD, cursor, &candidate);
                if (r < 0)
                        return r;

                matches = le64toh(candidate->field.hash) == hash &&
                          le64toh(candidate->object.size) == expected_size &&
                          memcmp(candidate->field.payload, field, size) == 0;
                if (matches) {
                        if (ret_offset)
                                *ret_offset = cursor;
                        if (ret)
                                *ret = candidate;

                        return 1;
                }

                r = next_hash_offset(
                                f,
                                &cursor,
                                &candidate->field.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1317 
uint64_t journal_file_hash_data(
                JournalFile *f,
                const void *data,
                size_t sz) {

        assert(f);
        assert(data || sz == 0);

        /* Hashes the 'sz' bytes at 'data' with the hash function this journal file uses. Old journal
         * files use the Jenkins hash. New-style files that have the keyed hash enabled use siphash
         * instead, keyed with the file ID from the header, as we try to unify our codebase on siphash. */

        if (!JOURNAL_HEADER_KEYED_HASH(f->header))
                return jenkins_hash64(data, sz);

        return siphash24(data, sz, f->header->file_id.bytes);
}
1334 
int journal_file_find_field_object(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash;

        assert(f);
        assert(field && size > 0);

        /* Convenience wrapper around journal_file_find_field_object_with_hash() for callers that have
         * not calculated the hash of the field name yet. The return value is passed through. */

        hash = journal_file_hash_data(f, field, size);

        return journal_file_find_field_object_with_hash(f, field, size, hash, ret, ret_offset);
}
1349 
int journal_file_find_data_object_with_hash(
                JournalFile *f,
                const void *data, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(data || size == 0);

        /* Searches the data hash table for a data object whose (uncompressed) payload is exactly the
         * 'size' bytes at 'data'; 'hash' must be the hash of those bytes. Compressed objects are
         * decompressed into f->compress_buffer for the comparison.
         *
         * Returns 1 if found, in which case the object and its offset are passed back in 'ret' and
         * 'ret_offset' (either may be NULL). Returns 0 if no such object exists. Returns a negative
         * error otherwise: -EBADMSG for a hash table without buckets or a compressed object without
         * payload, -EPROTONOSUPPORT for an object whose compression cannot be handled, or whatever
         * the helpers called here return. */

        /* If there's no data hash table, then there's no entry. */
        if (le64toh(f->header->data_hash_table_size) <= 0)
                return 0;

        /* Map the data hash table, if it isn't mapped yet. */
        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                return r;

        /* An uncompressed match consists of the data object header plus our payload, nothing more */
        osize = offsetof(Object, data.payload) + size;

        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].head_hash_offset);

        while (p > 0) {
                Compression c;
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Cheap check first: different hash means different payload */
                if (le64toh(o->data.hash) != hash)
                        goto next;

                c = COMPRESSION_FROM_OBJECT(o);
                if (c < 0)
                        return -EPROTONOSUPPORT;
                if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
                        uint64_t l;
                        size_t rsize = 0;

                        l = le64toh(READ_NOW(o->object.size));
                        if (l <= offsetof(Object, data.payload))
                                return -EBADMSG;

                        l -= offsetof(Object, data.payload);

                        r = decompress_blob(c, o->data.payload, l, &f->compress_buffer, &rsize, 0);
                        if (r < 0)
                                return r;

                        /* 'data' may be NULL if 'size' is zero (see the assert above), and passing a NULL
                         * pointer to memcmp() is undefined behaviour even for a length of zero. Hence
                         * treat the empty payload explicitly. */
                        if (rsize == size &&
                            (size == 0 || memcmp(f->compress_buffer, data, size) == 0)) {

                                if (ret)
                                        *ret = o;

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
#else
                        return -EPROTONOSUPPORT;
#endif
                } else if (le64toh(o->object.size) == osize &&
                           (size == 0 || memcmp(o->data.payload, data, size) == 0)) {

                        /* Same NULL/zero-length consideration as for the compressed case applies here */

                        if (ret)
                                *ret = o;

                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

        next:
                /* Advance to the next object in the chain; this sets p to 0 at the end of the chain */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->data.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1448 
int journal_file_find_data_object(
                JournalFile *f,
                const void *data, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash;

        assert(f);
        assert(data || size == 0);

        /* Convenience wrapper around journal_file_find_data_object_with_hash() for callers that have
         * not calculated the hash of the data yet. The return value is passed through. */

        hash = journal_file_hash_data(f, data, size);

        return journal_file_find_data_object_with_hash(f, data, size, hash, ret, ret_offset);
}
1463 
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {
        size_t len;

        /* Checks whether the 'l' bytes at 'p' form an acceptable journal field name. If 'l' is SIZE_MAX,
         * 'p' is taken to be a NUL-terminated string and its length is determined with strlen().
         *
         * The rules roughly follow the POSIX syntax recommendations for environment variable names, with
         * a couple of additional requirements:
         *
         * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
         *
         *   - the name is between 1 and 64 characters long
         *   - it consists only of A-Z, 0-9 and '_'
         *   - it does not begin with a digit
         *   - it begins with '_' (i.e. is a protected field) only if 'allow_protected' is true */

        len = l == SIZE_MAX ? strlen(p) : l;

        /* Neither empty nor overly long names */
        if (len == 0 || len > 64)
                return false;

        /* A leading underscore marks a protected field */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* No digit up front */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        for (size_t i = 0; i < len; i++) {
                char c = p[i];
                bool is_upper = c >= 'A' && c <= 'Z';
                bool is_digit = c >= '0' && c <= '9';

                if (!is_upper && !is_digit && c != '_')
                        return false;
        }

        return true;
}
1499 
static int journal_file_append_field(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, new_offset, object_size;
        Object *new_object;
        int r;

        assert(f);
        assert(field && size > 0);

        /* Makes sure a field object for the field name given by the 'size' bytes at 'field' exists in the
         * file: an existing object is reused, otherwise a new one is appended and linked into the field
         * hash table. The object and its offset are passed back in 'ret' and 'ret_offset' (either may be
         * NULL). Returns 0 on success, -EBADMSG if the name is not a valid field name, or the negative
         * error of the failing helper. */

        if (!journal_field_valid(field, size, true))
                return -EBADMSG;

        hash = journal_file_hash_data(f, field, size);

        /* If the field is already known the lookup fills in 'ret' and 'ret_offset' and we are done */
        r = journal_file_find_field_object_with_hash(f, field, size, hash, ret, ret_offset);
        if (r != 0)
                return r < 0 ? r : 0;

        object_size = offsetof(Object, field.payload) + size;

        r = journal_file_append_object(f, OBJECT_FIELD, object_size, &new_object, &new_offset);
        if (r < 0)
                return r;

        new_object->field.hash = htole64(hash);
        memcpy(new_object->field.payload, field, size);

        r = journal_file_link_field(f, new_object, new_offset, hash);
        if (r < 0)
                return r;

        /* Linking might have altered the window, so 'new_object' must not be used anymore from here on.
         * Hence only the offset is handed to the HMAC code, which will move to the object again if
         * needed. */

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_FIELD, NULL, new_offset);
        if (r < 0)
                return r;
#endif

        /* For the same reason the caller gets a freshly looked-up pointer */
        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_FIELD, new_offset, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = new_offset;

        return 0;
}
1556 
/* Makes sure a DATA object for the "FIELD=value" blob in data/size exists in the journal file.
 *
 * The blob is hashed and looked up first; if journal_file_find_data_object_with_hash() reports a match
 * it has already been handed ret/ret_offset and we return right away. Otherwise a new DATA object is
 * appended (compressed if enabled and the blob reaches the compression threshold), linked into the data
 * hash table, and chained onto the FIELD object for the part of the blob before the first '='.
 *
 * Returns 0 on success, -EINVAL if the blob is empty or contains no '=', or the negative error returned
 * by one of the helpers. On success *ret (if non-NULL) receives the object and *ret_offset (if non-NULL)
 * its offset in the file. */
static int journal_file_append_data(
                JournalFile *f,
                const void *data, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, p, fp, osize;
        Object *o, *fo;
        int r, compression = 0;
        const void *eq;

        assert(f);

        if (!data || size == 0)
                return -EINVAL;

        hash = journal_file_hash_data(f, data, size);

        r = journal_file_find_data_object_with_hash(f, data, size, hash, ret, ret_offset);
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        /* Every data blob must carry a field name terminated by '='; refuse it before allocating
         * anything in the file. */
        eq = memchr(data, '=', size);
        if (!eq)
                return -EINVAL;

        osize = offsetof(Object, data.payload) + size;
        r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
        if (r < 0)
                return r;

        o->data.hash = htole64(hash);

#if HAVE_COMPRESSION
        if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
                size_t rsize = 0;

                /* Only accept the compressed form if it is strictly smaller than the original, hence
                 * the destination limit of size - 1. */
                compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
                if (compression > COMPRESSION_NONE) {
                        o->object.size = htole64(offsetof(Object, data.payload) + rsize);
                        o->object.flags |= COMPRESSION_TO_OBJECT_FLAG(compression);

                        log_debug("Compressed data object %"PRIu64" -> %zu using %s",
                                  size, rsize, compression_to_string(compression));
                } else
                        /* Compression didn't work, we don't really care why, let's continue without compression */
                        compression = COMPRESSION_NONE;
        }
#endif

        if (compression == 0)
                memcpy_safe(o->data.payload, data, size);

        r = journal_file_link_data(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's refresh our pointer. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
        if (r < 0)
                return r;
#endif

        /* Create field object ... */
        r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
        if (r < 0)
                return r;

        /* ... and link it in: the new data object becomes the head of the field's data list. 'p' is
         * in host byte order while the on-disk field is little-endian, hence htole64(). */
        o->data.next_field_offset = fo->field.head_data_offset;
        fo->field.head_data_offset = htole64(p);

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1643 
/* Returns the number of EntryItem slots covered by the size recorded in the given object. Yields 0 if
 * the object is not of type OBJECT_ENTRY or if its size is smaller than the fixed entry header. */
uint64_t journal_file_entry_n_items(Object *o) {
        const uint64_t fixed = offsetof(Object, entry.items);
        uint64_t total;

        assert(o);

        if (o->object.type != OBJECT_ENTRY)
                return 0;

        /* Read the size exactly once, then validate it before subtracting. */
        total = le64toh(READ_NOW(o->object.size));

        return total < fixed ? 0 : (total - fixed) / sizeof(EntryItem);
}
1657 
/* Returns the number of 64-bit offset slots covered by the size recorded in the given object. Yields 0
 * if the object is not of type OBJECT_ENTRY_ARRAY or if its size is smaller than the fixed header. */
uint64_t journal_file_entry_array_n_items(Object *o) {
        const uint64_t fixed = offsetof(Object, entry_array.items);
        uint64_t total;

        assert(o);

        if (o->object.type != OBJECT_ENTRY_ARRAY)
                return 0;

        /* Read the size exactly once, then validate it before subtracting. */
        total = le64toh(READ_NOW(o->object.size));

        return total < fixed ? 0 : (total - fixed) / sizeof(uint64_t);
}
1672 
/* Returns the number of HashItem buckets covered by the size recorded in the given object. Yields 0 if
 * the object is neither a data nor a field hash table, or if its size is smaller than the fixed header. */
uint64_t journal_file_hash_table_n_items(Object *o) {
        const uint64_t fixed = offsetof(Object, hash_table.items);
        uint64_t total;

        assert(o);

        if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
                return 0;

        /* Read the size exactly once, then validate it before subtracting. */
        total = le64toh(READ_NOW(o->object.size));

        return total < fixed ? 0 : (total - fixed) / sizeof(HashItem);
}
1687 
/* Stores the entry offset 'p' at position *idx of the entry array chain that starts at *first.
 *
 * If the position falls into an existing array of the chain the slot is simply written. Otherwise a new
 * entry array is appended to the file and hooked to the end of the chain (or installed as *first when
 * the chain is still empty). In both cases *idx is incremented by one on success.
 *
 * Returns 0 on success or the negative error of the failing helper. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        uint64_t capacity = 0, tail = 0, cur, slot, total, new_offset;
        Object *array;
        int r;

        assert(f);
        assert(f->header);
        assert(first);
        assert(idx);
        assert(p > 0);

        cur = le64toh(*first);
        total = le64toh(READ_NOW(*idx));
        slot = total;

        /* Walk the chain, turning the chain-wide index into an index local to the current array. */
        while (cur > 0) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, cur, &array);
                if (r < 0)
                        return r;

                capacity = journal_file_entry_array_n_items(array);
                if (slot < capacity) {
                        array->entry_array.items[slot] = htole64(p);
                        *idx = htole64(total + 1);
                        return 0;
                }

                slot -= capacity;
                tail = cur;
                cur = le64toh(array->entry_array.next_entry_array_offset);
        }

        /* No room left: size the new array at twice the larger of "entries so far plus one" and the
         * capacity of the last array seen, but never below four slots. */
        capacity = (total > capacity ? total + 1 : capacity) * 2;
        if (capacity < 4)
                capacity = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + capacity * sizeof(uint64_t),
                                       &array, &new_offset);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, array, new_offset);
        if (r < 0)
                return r;
#endif

        array->entry_array.items[slot] = htole64(p);

        if (tail > 0) {
                /* Hook the new array behind the previous last one. */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, tail, &array);
                if (r < 0)
                        return r;

                array->entry_array.next_entry_array_offset = htole64(new_offset);
        } else
                /* The chain was empty, the new array becomes its head. */
                *first = htole64(new_offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(total + 1);

        return 0;
}
1761 
/* Like link_entry_into_array(), but for lists whose very first element lives in a dedicated 'extra'
 * slot rather than in the array chain: element 0 is written to *extra, element N > 0 goes to position
 * N - 1 of the chain starting at *first. *idx is incremented on success.
 *
 * Returns 0 on success, -EBADMSG if the counter is already UINT64_MAX, or the error returned by
 * link_entry_into_array(). */
static int link_entry_into_array_plus_one(JournalFile *f,
                                          le64_t *extra,
                                          le64_t *first,
                                          le64_t *idx,
                                          uint64_t p) {

        uint64_t count;
        int r;

        assert(f);
        assert(extra);
        assert(first);
        assert(idx);
        assert(p > 0);

        count = le64toh(READ_NOW(*idx));
        if (count == UINT64_MAX)
                return -EBADMSG;

        if (count > 0) {
                /* The chain index is handed over via a scratch copy; the real counter is updated
                 * below. */
                le64_t chain_idx = htole64(count - 1);

                r = link_entry_into_array(f, first, &chain_idx, p);
                if (r < 0)
                        return r;
        } else
                *extra = htole64(p);

        *idx = htole64(count + 1);
        return 0;
}
1794 
/* Registers the entry located at 'offset' with the data object referenced by item 'i' of entry object
 * 'o', i.e. adds the entry to that data object's list of entries. Returns 0 on success or a negative
 * error from the helpers. */
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
        uint64_t data_offset;
        Object *d;
        int r;

        assert(f);
        assert(o);
        assert(offset > 0);

        data_offset = le64toh(o->entry.items[i].object_offset);

        r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
        if (r < 0)
                return r;

        return link_entry_into_array_plus_one(f,
                                              &d->data.entry_offset,
                                              &d->data.entry_array_offset,
                                              &d->data.n_entries,
                                              offset);
}
1814 
/* Links the entry object 'o' stored at 'offset' into the file: first into the global entry array chain
 * referenced by the header, then into the entry list of every data object the entry refers to. The
 * header's head/tail timestamps are updated along the way.
 *
 * Returns -EINVAL if 'o' is not an entry object, -E2BIG if at least one item could not be linked for
 * that reason (the remaining items are still attempted), another negative error on hard failure, and
 * otherwise the result of linking the entry itself. */
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
        uint64_t n_items;
        int result;

        assert(f);
        assert(f->header);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_ENTRY)
                return -EINVAL;

        __sync_synchronize();

        /* Link up the entry itself */
        result = link_entry_into_array(f,
                                       &f->header->entry_array_offset,
                                       &f->header->n_entries,
                                       offset);
        if (result < 0)
                return result;

        /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */

        if (f->header->head_entry_realtime == 0)
                f->header->head_entry_realtime = o->entry.realtime;

        f->header->tail_entry_realtime = o->entry.realtime;
        f->header->tail_entry_monotonic = o->entry.monotonic;

        /* Link up the items */
        n_items = journal_file_entry_n_items(o);
        for (uint64_t i = 0; i < n_items; i++) {
                int k;

                /* Running out of space for a new entry array (-E2BIG) is remembered but does not stop
                 * us: other items may still fit into arrays that already exist. Any other failure
                 * aborts immediately. */
                k = journal_file_link_entry_item(f, o, offset, i);
                if (k >= 0)
                        continue;
                if (k != -E2BIG)
                        return k;

                result = k;
        }

        return result;
}
1863 
/* Appends a new ENTRY object referencing the given, already prepared items and links it into the file.
 *
 * The sequence number is obtained from journal_file_entry_seqnum(). If 'boot_id' is given it is stored
 * in the header first; the entry always carries the header's boot ID. On success *ret / *ret_offset
 * (if non-NULL) receive the new object and its offset.
 *
 * Returns a negative error on failure, otherwise the result of journal_file_link_entry(). */
static int journal_file_append_entry_internal(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                uint64_t xor_hash,
                const EntryItem items[], unsigned n_items,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {

        uint64_t entry_offset, entry_size;
        Object *entry;
        int r;

        assert(f);
        assert(f->header);
        assert(items || n_items == 0);
        assert(ts);

        entry_size = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

        r = journal_file_append_object(f, OBJECT_ENTRY, entry_size, &entry, &entry_offset);
        if (r < 0)
                return r;

        /* Fill in the entry payload. */
        entry->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
        memcpy_safe(entry->entry.items, items, n_items * sizeof(EntryItem));
        entry->entry.realtime = htole64(ts->realtime);
        entry->entry.monotonic = htole64(ts->monotonic);
        entry->entry.xor_hash = htole64(xor_hash);

        /* An explicitly passed boot ID replaces the one in the header before it is copied over. */
        if (boot_id)
                f->header->boot_id = *boot_id;
        entry->entry.boot_id = f->header->boot_id;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY, entry, entry_offset);
        if (r < 0)
                return r;
#endif

        r = journal_file_link_entry(f, entry, entry_offset);
        if (r < 0)
                return r;

        if (ret)
                *ret = entry;

        if (ret_offset)
                *ret_offset = entry_offset;

        return r;
}
1915 
/* Notifies watchers that the journal file changed. Does nothing if the file has no valid fd. */
void journal_file_post_change(JournalFile *f) {
        assert(f);

        if (f->fd >= 0) {
                /* Writes done through mmap() do not generate IN_MODIFY for inotify watchers. We
                 * hence provoke one explicitly by truncating the file to the size we last recorded
                 * for it. */

                __sync_synchronize();

                if (ftruncate(f->fd, f->last_stat.st_size) < 0)
                        log_debug_errno(errno, "Failed to truncate file to its own size: %m");
        }
}
1932 
/* Timer callback (see journal_file_enable_post_change_timer()): 'userdata' is the JournalFile whose
 * change shall be posted. Always returns 1. */
static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
        JournalFile *f;

        assert(userdata);

        f = userdata;
        journal_file_post_change(f);

        return 1;
}
1940 
/* Arms the post-change timer of the file so that the change notification is delivered after
 * f->post_change_timer_period. If the timer is already pending nothing is done. If the event loop is
 * shutting down, or anything goes wrong while arming, the change is posted right away instead. */
static void schedule_post_change(JournalFile *f) {
        sd_event *event;
        int k;

        assert(f);
        assert(f->post_change_timer);

        assert_se(event = sd_event_source_get_event(f->post_change_timer));

        /* No point in deferring anything if the loop is on its way out. */
        if (IN_SET(sd_event_get_state(event), SD_EVENT_EXITING, SD_EVENT_FINISHED))
                goto post_now;

        k = sd_event_source_get_enabled(f->post_change_timer, NULL);
        if (k < 0) {
                log_debug_errno(k, "Failed to get ftruncate timer state: %m");
                goto post_now;
        }
        if (k > 0)
                /* Already scheduled, the pending timer covers this change too. */
                return;

        k = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
        if (k < 0) {
                log_debug_errno(k, "Failed to set time for scheduling ftruncate: %m");
                goto post_now;
        }

        k = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
        if (k < 0) {
                log_debug_errno(k, "Failed to enable scheduled ftruncate: %m");
                goto post_now;
        }

        return;

post_now:
        /* Fallback: deliver the notification synchronously. */
        journal_file_post_change(f);
}
1980 
1981 /* Enable coalesced change posting in a timer on the provided sd_event instance */
journal_file_enable_post_change_timer(JournalFile * f,sd_event * e,usec_t t)1982 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1983         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1984         int r;
1985 
1986         assert(f);
1987         assert_return(!f->post_change_timer, -EINVAL);
1988         assert(e);
1989         assert(t);
1990 
1991         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1992         if (r < 0)
1993                 return r;
1994 
1995         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1996         if (r < 0)
1997                 return r;
1998 
1999         f->post_change_timer = TAKE_PTR(timer);
2000         f->post_change_timer_period = t;
2001 
2002         return r;
2003 }
2004 
entry_item_cmp(const EntryItem * a,const EntryItem * b)2005 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
2006         return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
2007 }
2008 
/* Compacts the array in place so that runs of items with the same object offset collapse into a single
 * item, and returns the new number of items. The array must already be sorted by object offset. */
static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) {
        size_t n_kept;

        if (n <= 1)
                return n;

        /* The first item is always kept; every further item is kept only if it differs from the most
         * recently kept one. */
        n_kept = 1;
        for (size_t i = 1; i < n; i++) {
                if (items[i].object_offset == items[n_kept - 1].object_offset)
                        continue;

                items[n_kept++] = items[i];
        }

        return n_kept;
}
2023 
/* Appends a complete journal entry made up of the "FIELD=value" blobs in iovec[] to the file.
 *
 * 'ts' may be NULL, in which case the current time is obtained with dual_timestamp_get(); an explicitly
 * passed timestamp that fails VALID_REALTIME()/VALID_MONOTONIC() is refused with EBADMSG. 'boot_id' and
 * 'seqnum' are passed through to journal_file_append_entry_internal(), as are 'ret' and 'ret_offset'.
 *
 * Every blob is turned into a data object via journal_file_append_data(); the resulting items are sorted
 * by file offset and de-duplicated before the entry object is written. Afterwards the change is posted,
 * either via the coalescing timer (if one is installed) or immediately.
 *
 * Returns the result of journal_file_append_entry_internal(), -EIO if the mmap cache saw a SIGBUS for
 * this file, or a negative error if an earlier step failed. */
int journal_file_append_entry(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                const struct iovec iovec[], unsigned n_iovec,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {

        EntryItem *items;
        int r;
        uint64_t xor_hash = 0;
        struct dual_timestamp _ts;

        assert(f);
        assert(f->header);
        assert(iovec && n_iovec > 0);

        if (ts) {
                if (!VALID_REALTIME(ts->realtime))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
                                               ts->realtime);
                if (!VALID_MONOTONIC(ts->monotonic))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid monotonic timestamp %" PRIu64 ", refusing entry.",
                                               ts->monotonic);
        } else {
                dual_timestamp_get(&_ts);
                ts = &_ts;
        }

#if HAVE_GCRYPT
        r = journal_file_maybe_append_tag(f, ts->realtime);
        if (r < 0)
                return r;
#endif

        items = newa(EntryItem, n_iovec);

        for (size_t i = 0; i < n_iovec; i++) {
                uint64_t p;
                Object *o;

                r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
                if (r < 0)
                        return r;

                /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
                 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
                 * specific record, and give records with otherwise identical position (i.e. match in seqno,
                 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
                 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
                 * hash here for that. This also has the benefit that cursors for old and new journal files
                 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
                 * files things are easier, we can just take the value from the stored record directly. */

                if (JOURNAL_HEADER_KEYED_HASH(f->header))
                        xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
                else
                        xor_hash ^= le64toh(o->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(p),
                        .hash = o->data.hash,
                };
        }

        /* Order by the position on disk, in order to improve seek
         * times for rotating media. */
        typesafe_qsort(items, n_iovec, entry_item_cmp);
        n_iovec = remove_duplicate_entry_items(items, n_iovec);

        r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);

        /* If the memory mapping triggered a SIGBUS then we return an
         * IO error and ignore the error code passed down to us, since
         * it is very likely just an effect of a nullified replacement
         * mapping page */

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        if (f->post_change_timer)
                schedule_post_change(f);
        else
                journal_file_post_change(f);

        return r;
}
2113 
/* A remembered position inside an entry array chain. chain_cache_put() keeps these in an OrderedHashmap
 * using the address of the 'first' member as key, so there is at most one item per chain; the chain
 * lookup functions consult it to avoid walking a chain from its head again. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2121 
/* Records (array, begin, total, last_index) as the cached position for the chain starting at 'first'.
 *
 * If 'ci' is non-NULL it must be the existing cache item of that chain and is updated in place.
 * Otherwise a new item is inserted into 'h' – recycling the oldest item once CHAIN_CACHE_MAX items are
 * cached – unless the position to cache is the head of the chain, in which case nothing is stored.
 * Allocation and insertion failures are silently ignored, the cache simply stays as it was. */
static void chain_cache_put(
                OrderedHashmap *h,
                ChainCacheItem *ci,
                uint64_t first,
                uint64_t array,
                uint64_t begin,
                uint64_t total,
                uint64_t last_index) {

        if (ci)
                assert(ci->first == first);
        else {
                /* Caching the head of a chain buys nothing, starting there is free anyway. */
                if (array == first)
                        return;

                if (ordered_hashmap_size(h) < CHAIN_CACHE_MAX) {
                        ci = new(ChainCacheItem, 1);
                        if (!ci)
                                return;
                } else {
                        /* Cache is full: take the first item out of the map and reuse it. */
                        ci = ordered_hashmap_steal_first(h);
                        assert(ci);
                }

                ci->first = first;

                if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
                        free(ci);
                        return;
                }
        }

        ci->last_index = last_index;
        ci->total = total;
        ci->begin = begin;
        ci->array = array;
}
2160 
/* Moves *i one step in the given direction within an array of 'n' items: up by one for DIRECTION_DOWN,
 * down by one otherwise. Returns 1 if the index was changed and 0 if it already sits at the respective
 * end of the array (in which case *i is left untouched). */
static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
        assert(i);

        if (direction != DIRECTION_DOWN) {
                /* Going towards the start of the array. */
                if (*i <= 0)
                        return 0;

                --(*i);
                return 1;
        }

        /* Going towards the end of the array. */
        if (*i >= n - 1)
                return 0;

        ++(*i);
        return 1;
}
2180 
/* Determines the entry array that follows (DIRECTION_DOWN) or precedes (otherwise) the entry array at
 * 'offset' within the chain starting at 'first', and stores its offset in *ret. A stored offset of 0
 * means there is no such array.
 *
 * 'o' must be the already mapped entry array object at 'offset' when going down; it is not looked at
 * when going up and may be NULL then.
 *
 * Returns 0 on success, -EBADMSG if 'offset' cannot be found in the chain when going up, or the error
 * returned by journal_file_move_to_object(). */
static int bump_entry_array(JournalFile *f, Object *o, uint64_t offset, uint64_t first, direction_t direction, uint64_t *ret) {
        uint64_t p, q = 0;
        int r;

        assert(f);
        assert(offset);
        assert(ret);

        if (direction == DIRECTION_DOWN) {
                assert(o);

                /* The successor is recorded in the array itself. Hand it back through *ret like the
                 * upward path does; returning the 64-bit offset as the int status would truncate it
                 * and leave *ret untouched, so the caller could never advance. */
                *ret = le64toh(o->entry_array.next_entry_array_offset);
                return 0;
        }

        /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have
         * to start iterating from the top. */

        p = first;

        while (p > 0 && p != offset) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, p, &o);
                if (r < 0)
                        return r;

                q = p;
                p = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a
         * corrupted journal file. */
        if (p == 0)
                return -EBADMSG;

        *ret = q;

        return 0;
}
2215 
/* Looks up item 'i' of the entry array chain starting at offset 'first' and maps the entry object it
 * refers to. If that entry cannot be mapped because of -EBADMSG or -EADDRNOTAVAIL, neighbouring items
 * are tried instead, moving in 'direction' (and across array boundaries via bump_entry_array()).
 *
 * Returns 1 if an entry was found (the object is passed to journal_file_move_to_object() through 'ret',
 * and *ret_offset, if non-NULL, receives its offset), 0 if none was found, and a negative error
 * otherwise. A successful lookup is remembered in f->chain_cache. */
static int generic_array_get(
                JournalFile *f,
                uint64_t first,
                uint64_t i,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        /* a: offset of the array currently looked at; t: number of items in the arrays skipped so far;
         * k: number of items in the current array; p: offset of the candidate entry. */
        uint64_t p = 0, a, t = 0, k;
        int r;
        ChainCacheItem *ci;

        assert(f);

        a = first;

        /* Try the chain cache first: if the wanted index lies beyond the cached array's start, begin
         * the walk there and make 'i' relative to it. */
        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && i > ci->total) {
                a = ci->array;
                i -= ci->total;
                t = ci->total;
        }

        /* First loop: find the array that contains item 'i', reducing 'i' to an index local to it. */
        while (a > 0) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
                        /* If there's corruption and we're going downwards, let's pretend we reached the
                         * final entry in the entry array chain. */

                        if (direction == DIRECTION_DOWN)
                                return 0;

                        /* If there's corruption and we're going upwards, move back to the previous entry
                         * array and start iterating entries from there. */

                        r = bump_entry_array(f, NULL, a, first, DIRECTION_UP, &a);
                        if (r < 0)
                                return r;

                        /* UINT64_MAX is the marker for "array not loaded yet, pick the starting index
                         * yourself" understood by the second loop. */
                        i = UINT64_MAX;

                        break;
                }
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(o);
                if (i < k)
                        break;

                i -= k;
                t += k;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* If we've found the right location, now look for the first non-corrupt entry object (in the right
         * direction). */

        while (a > 0) {
                /* In the first iteration of the while loop, we reuse i, k and o from the previous while
                 * loop. */
                if (i == UINT64_MAX) {
                        r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                        if (r < 0)
                                return r;

                        k = journal_file_entry_array_n_items(o);
                        if (k == 0)
                                break;

                        /* Start at the end of the array that faces the direction we came from. */
                        i = direction == DIRECTION_DOWN ? 0 : k - 1;
                }

                do {
                        p = le64toh(o->entry_array.items[i]);

                        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                        if (r >= 0) {
                                /* Let's cache this item for the next invocation */
                                chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
                        if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG))
                                return r;

                        /* OK, so this entry is borked. Most likely some entry didn't get synced to
                         * disk properly, let's see if the next one might work for us instead. */
                        log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
                } while (bump_array_index(&i, direction, k) > 0);

                /* All candidates in this array are exhausted, continue with the neighbouring array. */
                r = bump_entry_array(f, o, a, first, direction, &a);
                if (r < 0)
                        return r;

                /* NOTE(review): 't' is increased by 'k' regardless of the direction, although when
                 * going upwards the new array lies before the current one – confirm the value handed
                 * to chain_cache_put() is still the intended "items before this array" count. */
                t += k;
                i = UINT64_MAX;
        }

        return 0;
}
2321 
/* Variant of generic_array_get() for lists whose first element is stored separately: index 0 refers to
 * the entry at offset 'extra', index N > 0 to item N - 1 of the chain starting at 'first'.
 *
 * If the 'extra' entry cannot be mapped because of -EADDRNOTAVAIL or -EBADMSG, the lookup falls back
 * to the first item of the chain. Return values are those of generic_array_get(); a successfully mapped
 * 'extra' entry yields 1. */
static int generic_array_get_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t i,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        int r;

        assert(f);

        /* Everything but the very first element lives in the array chain. */
        if (i > 0)
                return generic_array_get(f, first, i - 1, direction, ret, ret_offset);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
        if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG))
                return generic_array_get(f, first, 0, direction, ret, ret_offset);
        if (r < 0)
                return r;

        if (ret_offset)
                *ret_offset = extra;

        return 1;
}
2349 
/* Results a test_object() callback passed to generic_array_bisect() reports for a tested object.
 *
 * NOTE(review): the description inside generic_array_bisect() says TEST_LEFT means the needle is located
 * earlier in the chain, yet that function treats TEST_LEFT for an array's first item as "what we are
 * looking for is right of it". Confirm the intended meaning against the test_object() implementations
 * before relying on either reading. */
enum {
        TEST_FOUND, /* the tested object matches the needle */
        TEST_LEFT,  /* see note above */
        TEST_RIGHT  /* the opposite of TEST_LEFT */
};
2355 
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        /* Given an entry array chain, this function finds the object "closest" to the given needle in the
         * chain, taking into account the provided direction. A function can be provided to determine how
         * an object is matched against the given needle.
         *
         * Parameters:
         *   first       offset of the first entry array object of the chain
         *   n           number of items of the chain that shall be considered
         *   needle      opaque value handed through to test_object()
         *   test_object comparison callback, see below
         *   direction   DIRECTION_DOWN or DIRECTION_UP, selects which neighbor is used if nothing matches
         *   ret, ret_offset, ret_idx
         *               optional outputs (each may be NULL): the entry object, its offset and its index
         *               counted from the start of the chain
         *
         * Given a journal file, the offset of an object and the needle, the test_object() function should
         * return TEST_LEFT if the tested object is located before the needle (i.e. the needle is to be
         * looked for later in the entry array chain), TEST_RIGHT if the tested object is located after the
         * needle (i.e. the needle is to be looked for earlier in the chain) and TEST_FOUND if the object
         * matches the needle. A negative return value is treated as an error.
         *
         * If test_object() returns TEST_FOUND for a specific object, that object's information will be used
         * to populate the return values of this function. If test_object() never returns TEST_FOUND, the
         * return values are populated with the details of one of the objects closest to the needle. If the
         * direction is DIRECTION_UP, the earlier object is used. Otherwise, the later object is used.
         *
         * Returns 1 if an object was selected and the outputs were populated, 0 if no suitable object
         * exists, and a negative errno-style value on failure.
         */

        /* a          offset of the entry array currently being examined
         * p          offset of the entry currently being probed
         * t          number of items contained in the arrays already skipped
         * i          index inside the current array
         * last_p     last item of the array examined before the current one
         * last_index index inside the cached array that was selected by a previous invocation, or
         *            UINT64_MAX if there is none */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = UINT64_MAX;
        bool subtract_one = false;
        Object *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        /* Walk the chain array by array. From here on n counts the items still to be considered, starting
         * at the beginning of the current array. */
        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last item of this array that is still in range first: if even that one is
                 * located before the needle, the whole array can be skipped. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        /* Drop the invalid item and everything after it, then retry the same array with
                         * the reduced range. */
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                /* A match is folded into the side that keeps the bisection going towards the first match
                 * (DIRECTION_DOWN) or past the last match (DIRECTION_UP, compensated by subtract_one). */
                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The item we are looking for is within [0, right - 1] of this array. */
                        left = 0;
                        right -= 1;

                        if (last_index != UINT64_MAX) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                /* NOTE(review): unlike the two other probes in this function, an invalid
                                 * item found here makes the whole call fail with -EBADMSG instead of
                                 * shortening the range; confirm whether this is intended. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection of [left, right] inside the current array. */
                        for (;;) {
                                if (left == right) {
                                        /* Converged on the first item that tested TEST_RIGHT. When going
                                         * up we want the item right before it instead. */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        /* Treat the invalid item as the new end of the range. */
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* We get here only if the last in-range item of this array is located before the needle.
                 * If this array also holds the last of the n items there is nothing at or after the
                 * needle: when going up, pick that last item (index n - 1), otherwise report no match. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case the match turns out to be the first item
                 * of the next array and we need to step back by one. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = UINT64_MAX;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Nothing precedes the very first item of the chain, hence there is nothing to step back to. */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        /* (presumably stored as the ci->array, ci->begin, ci->total and ci->last_index values read back at
         * the top of this function; verify against chain_cache_put()) */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

        /* NOTE(review): last_p is only assigned when this invocation advanced past an array itself; right
         * after a jump via the chain cache it is still 0. Presumably that combination cannot be reached
         * because the first item of the cached array tested TEST_LEFT; confirm. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = p;

        /* The index is relative to the start of the whole chain, hence add the skipped items. */
        if (ret_idx)
                *ret_idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2584 
static int generic_array_bisect_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        /* Like generic_array_bisect(), but for a list that consists of one extra entry offset ('extra',
         * logically at index 0) followed by the entry array chain starting at 'first'. 'n' counts the
         * extra entry too, hence only n-1 items are looked for in the chain and any index found there is
         * shifted by one.
         *
         * Returns 1 if an entry was selected, 0 if there is none, negative on error. */

        bool extra_is_fallback;
        int verdict;

        assert(f);
        assert(test_object);

        if (n <= 0)
                return 0;

        /* Test the extra entry before looking at the chain at all. */
        verdict = test_object(f, extra, needle);
        if (verdict < 0)
                return verdict;

        if (verdict == TEST_FOUND)
                verdict = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

        if (verdict == TEST_RIGHT) {
                /* The extra entry is the first entry of the list. If it is already located after the
                 * needle it is the answer when going down, and there is no answer when going up. */
                if (direction != DIRECTION_DOWN)
                        return 0;

                goto use_extra;
        }

        /* When going up and the extra entry is located before the needle, the chain might still contain
         * a later entry that qualifies, and that one has to win. Only if the chain yields nothing we fall
         * back to the extra entry, hence remember that this is an option. */
        extra_is_fallback = verdict == TEST_LEFT && direction == DIRECTION_UP;

        verdict = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);
        if (verdict > 0) {
                /* Account for the extra entry occupying index 0. */
                if (ret_idx)
                        (*ret_idx)++;

                return verdict;
        }

        if (verdict < 0 || !extra_is_fallback)
                return verdict;

use_extra:
        if (ret) {
                verdict = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
                if (verdict < 0)
                        return verdict;
        }

        if (ret_offset)
                *ret_offset = extra;

        if (ret_idx)
                *ret_idx = 0;

        return 1;
}
2655 
test_object_offset(JournalFile * f,uint64_t p,uint64_t needle)2656 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2657         assert(f);
2658         assert(p > 0);
2659 
2660         if (p == needle)
2661                 return TEST_FOUND;
2662         else if (p < needle)
2663                 return TEST_LEFT;
2664         else
2665                 return TEST_RIGHT;
2666 }
2667 
/* Bisects the file's main entry array chain for the entry whose offset is closest to 'p' in the given
 * direction. The result of generic_array_bisect() is passed through unmodified. */
int journal_file_move_to_entry_by_offset(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain_start, n_total;

        assert(f);
        assert(f->header);

        chain_start = le64toh(f->header->entry_array_offset);
        n_total = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain_start, n_total, p, test_object_offset, direction, ret, ret_offset, NULL);
}
2687 
/* Bisection callback that orders entries by sequence number: loads the entry object at offset 'p' and
 * compares its seqnum against the needle. Fails with a negative value if the entry cannot be accessed. */
static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t seqnum;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        seqnum = le64toh(READ_NOW(entry->entry.seqnum));

        if (seqnum < needle)
                return TEST_LEFT;

        if (seqnum > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2708 
/* Bisects the file's main entry array chain for the entry whose sequence number is closest to 'seqnum'
 * in the given direction. The result of generic_array_bisect() is passed through unmodified. */
int journal_file_move_to_entry_by_seqnum(
                JournalFile *f,
                uint64_t seqnum,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain_start, n_total;

        assert(f);
        assert(f->header);

        chain_start = le64toh(f->header->entry_array_offset);
        n_total = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain_start, n_total, seqnum, test_object_seqnum, direction, ret, ret_offset, NULL);
}
2727 
/* Bisection callback that orders entries by realtime timestamp: loads the entry object at offset 'p' and
 * compares its realtime field against the needle. Fails with a negative value if the entry cannot be
 * accessed. */
static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t realtime;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        realtime = le64toh(READ_NOW(entry->entry.realtime));

        if (realtime < needle)
                return TEST_LEFT;

        if (realtime > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2748 
/* Bisects the file's main entry array chain for the entry whose realtime timestamp is closest to
 * 'realtime' in the given direction. The result of generic_array_bisect() is passed through unmodified. */
int journal_file_move_to_entry_by_realtime(
                JournalFile *f,
                uint64_t realtime,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain_start, n_total;

        assert(f);
        assert(f->header);

        chain_start = le64toh(f->header->entry_array_offset);
        n_total = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain_start, n_total, realtime, test_object_realtime, direction, ret, ret_offset, NULL);
}
2767 
/* Bisection callback that orders entries by monotonic timestamp: loads the entry object at offset 'p'
 * and compares its monotonic field against the needle. Fails with a negative value if the entry cannot
 * be accessed. */
static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t monotonic;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        monotonic = le64toh(READ_NOW(entry->entry.monotonic));

        if (monotonic < needle)
                return TEST_LEFT;

        if (monotonic > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2788 
/* Looks up the data object for the field "_BOOT_ID=<boot_id>". The 128-bit ID is formatted right behind
 * the field prefix (32 characters plus NUL, as the buffer is sized for), and the lookup is done without
 * the trailing NUL. 'o' and 'b' as well as the return value are those of journal_file_find_data_object(). */
static int find_data_object_by_boot_id(
                JournalFile *f,
                sd_id128_t boot_id,
                Object **o,
                uint64_t *b) {

        char field[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";

        sd_id128_to_string(boot_id, field + STRLEN("_BOOT_ID="));

        return journal_file_find_data_object(f, field, sizeof(field) - 1, o, b);
}
2800 
/* Seeks to the entry of the given boot whose monotonic timestamp is closest to 'monotonic' in the given
 * direction. The search is done on the entry list of the boot's _BOOT_ID= data object.
 *
 * Returns -ENOENT if the file has no data object for this boot ID; otherwise the result of the lookup
 * respectively of generic_array_bisect_plus_one() is returned. */
int journal_file_move_to_entry_by_monotonic(
                JournalFile *f,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t extra, chain_start, n_total;
        Object *boot_data;
        int r;

        assert(f);

        r = find_data_object_by_boot_id(f, boot_id, &boot_data, NULL);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        extra = le64toh(boot_data->data.entry_offset);
        chain_start = le64toh(boot_data->data.entry_array_offset);
        n_total = le64toh(boot_data->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain_start, n_total, monotonic, test_object_monotonic, direction, ret, ret_offset, NULL);
}
2830 
/* Forgets the current read position of the file: the location type is set back to LOCATION_HEAD and all
 * cached properties of the current entry are cleared. */
void journal_file_reset_location(JournalFile *f) {
        /* Passing NULL is a programming error; catch it like the other functions in this file do. */
        assert(f);

        f->location_type = LOCATION_HEAD;
        f->current_offset = 0;
        f->current_seqnum = 0;
        f->current_realtime = 0;
        f->current_monotonic = 0;
        zero(f->current_boot_id);
        f->current_xor_hash = 0;
}
2840 
/* Records the entry object 'o' located at 'offset' as the current read position of the file: the
 * location type becomes LOCATION_SEEK and the entry's seqnum, timestamps, boot ID and XOR hash are copied
 * (converted to host byte order where applicable) into the JournalFile object.
 *
 * 'o' is read through its 'entry' member, i.e. the caller has to pass an entry object. */
void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
        /* Passing NULL is a programming error; catch it like the other functions in this file do. */
        assert(f);
        assert(o);

        f->location_type = LOCATION_SEEK;
        f->current_offset = offset;
        f->current_seqnum = le64toh(o->entry.seqnum);
        f->current_realtime = le64toh(o->entry.realtime);
        f->current_monotonic = le64toh(o->entry.monotonic);
        f->current_boot_id = o->entry.boot_id;
        f->current_xor_hash = le64toh(o->entry.xor_hash);
}
2850 
/* Orders the current locations of two journal files. Both files must have a location of type
 * LOCATION_SEEK. Returns 0 if both locations refer to the same entry, and otherwise the first non-zero
 * CMP() result of the comparisons below. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        bool same_boot, same_seqnum_source;
        int r;

        assert(af);
        assert(af->header);
        assert(bf);
        assert(bf->header);
        assert(af->location_type == LOCATION_SEEK);
        assert(bf->location_type == LOCATION_SEEK);

        same_boot = sd_id128_equal(af->current_boot_id, bf->current_boot_id);
        same_seqnum_source = sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id);

        /* Same contents, same timestamps and same seqnum from the same source: this is the very same
         * entry. */
        if (same_boot &&
            af->current_monotonic == bf->current_monotonic &&
            af->current_realtime == bf->current_realtime &&
            af->current_xor_hash == bf->current_xor_hash &&
            same_seqnum_source &&
            af->current_seqnum == bf->current_seqnum)
                return 0;

        if (same_seqnum_source) {
                /* Sequence numbers from the same source are directly comparable. */
                r = CMP(af->current_seqnum, bf->current_seqnum);
                if (r != 0)
                        return r;

                /* Different entries carrying the same seqnum should not happen. Make the best of it
                 * and order them by time below. */
        }

        if (same_boot) {
                /* Monotonic timestamps are only comparable within the same boot. */
                r = CMP(af->current_monotonic, bf->current_monotonic);
                if (r != 0)
                        return r;
        }

        /* Next, order by wallclock time. */
        r = CMP(af->current_realtime, bf->current_realtime);
        if (r != 0)
                return r;

        /* As last resort, order by contents. */
        return CMP(af->current_xor_hash, bf->current_xor_hash);
}
2900 
/* Checks that stepping from 'old_offset' to 'new_offset' moved into the requested direction: strictly
 * forward for DIRECTION_DOWN, strictly backward otherwise. An offset of 0 means "uninitialized" and
 * always fails the check. */
static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {

        if (new_offset == 0 || old_offset == 0)
                return false;

        if (direction == DIRECTION_DOWN)
                return new_offset > old_offset;

        return new_offset < old_offset;
}
2912 
/* Moves to the entry following (DIRECTION_DOWN) or preceding (DIRECTION_UP) the entry at offset 'p' in
 * the file's main entry array chain. If 'p' is 0, the first respectively last entry of the file is
 * selected instead.
 *
 * Returns 1 if an entry was selected ('ret' and 'ret_offset' are optional outputs), 0 if there is none,
 * -EBADMSG if the selected entry is not located on the expected side of 'p', or another negative value
 * on failure. */
int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t idx, n_total, found_offset;
        int r;

        assert(f);
        assert(f->header);

        n_total = le64toh(READ_NOW(f->header->n_entries));
        if (n_total <= 0)
                return 0;

        if (p != 0) {
                /* Determine the index of the entry we start from ... */
                r = generic_array_bisect(f,
                                         le64toh(f->header->entry_array_offset),
                                         le64toh(f->header->n_entries),
                                         p,
                                         test_object_offset,
                                         DIRECTION_DOWN,
                                         NULL, NULL,
                                         &idx);
                if (r <= 0)
                        return r;

                /* ... and step one further in the requested direction. */
                r = bump_array_index(&idx, direction, n_total);
                if (r <= 0)
                        return r;
        } else if (direction == DIRECTION_DOWN)
                idx = 0;
        else
                idx = n_total - 1;

        /* Now load the entry at that index. */
        r = generic_array_get(f, le64toh(f->header->entry_array_offset), idx, direction, ret, &found_offset);
        if (r <= 0)
                return r;

        /* Refuse arrays that are not sorted by offset. */
        if (p > 0 && !check_properly_ordered(found_offset, p, direction))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "%s: entry array not properly ordered at entry %" PRIu64,
                                       f->path, idx);

        if (ret_offset)
                *ret_offset = found_offset;

        return 1;
}
2964 
/* Selects the first (DIRECTION_DOWN) or last (DIRECTION_UP) entry that references the data object 'd'.
 *
 * Returns 1 if an entry was selected ('ret' and 'ret_offset' are optional outputs), 0 if the data object
 * has no entries or none could be selected, negative on failure. */
int journal_file_next_entry_for_data(
                JournalFile *f,
                Object *d,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t n_total, found_offset;
        int r;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        n_total = le64toh(READ_NOW(d->data.n_entries));
        if (n_total == 0)
                return 0;

        r = generic_array_get_plus_one(f,
                                       le64toh(d->data.entry_offset),
                                       le64toh(d->data.entry_array_offset),
                                       direction == DIRECTION_DOWN ? 0 : n_total - 1,
                                       direction,
                                       ret, &found_offset);
        if (r <= 0)
                return r;

        if (ret_offset)
                *ret_offset = found_offset;

        return 1;
}
2998 
/* Among the entries referencing the data object 'd', bisects for the one whose offset is closest to 'p'
 * in the given direction. The result of generic_array_bisect_plus_one() is passed through unmodified. */
int journal_file_move_to_entry_by_offset_for_data(
                JournalFile *f,
                Object *d,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain_start, n_total;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain_start = le64toh(d->data.entry_array_offset);
        n_total = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain_start, n_total, p, test_object_offset, direction, ret, ret_offset, NULL);
}
3020 
/* Among the entries referencing the data object 'd', seeks to the one of the given boot whose monotonic
 * timestamp is closest to 'monotonic' in the given direction.
 *
 * This first seeks by time in the entry list of the boot's _BOOT_ID= data object, and then alternates
 * between the entry list of 'd' and the one of the boot until both agree on the same entry offset.
 *
 * Returns 1 if an entry was selected ('ret' and 'ret_offset' are optional outputs), 0 if there is none,
 * -ENOENT if the file has no data object for this boot ID, or another negative value on failure. */
int journal_file_move_to_entry_by_monotonic_for_data(
                JournalFile *f,
                Object *d,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t boot_data_offset, cursor, d_extra, d_chain_start, d_n_total;
        Object *boot_data;
        int r;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        /* Copy out everything we need from 'd' right away, as the object gets invalidated by the
         * operations below. */
        d_extra = le64toh(READ_NOW(d->data.entry_offset));
        d_chain_start = le64toh(READ_NOW(d->data.entry_array_offset));
        d_n_total = le64toh(READ_NOW(d->data.n_entries));

        /* Step 1: seek by time within the entries of the boot. */
        r = find_data_object_by_boot_id(f, boot_id, &boot_data, &boot_data_offset);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        r = generic_array_bisect_plus_one(f,
                                          le64toh(boot_data->data.entry_offset),
                                          le64toh(boot_data->data.entry_array_offset),
                                          le64toh(boot_data->data.n_entries),
                                          monotonic,
                                          test_object_monotonic,
                                          direction,
                                          NULL, &cursor, NULL);
        if (r <= 0)
                return r;

        /* Step 2: keep seeking until we hit an entry that is listed for both data objects. */

        r = journal_file_move_to_object(f, OBJECT_DATA, boot_data_offset, &boot_data);
        if (r < 0)
                return r;

        while (true) {
                uint64_t in_data, in_boot;

                /* Closest entry referencing 'd', starting from the cursor ... */
                r = generic_array_bisect_plus_one(f,
                                                  d_extra,
                                                  d_chain_start,
                                                  d_n_total,
                                                  cursor,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &in_data, NULL);
                if (r <= 0)
                        return r;

                /* ... and closest entry of the boot, starting from there. */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(boot_data->data.entry_offset),
                                                  le64toh(boot_data->data.entry_array_offset),
                                                  le64toh(boot_data->data.n_entries),
                                                  in_data,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &in_boot, NULL);
                if (r <= 0)
                        return r;

                if (in_data != in_boot) {
                        /* No agreement yet, continue from where the boot's list took us. */
                        cursor = in_boot;
                        continue;
                }

                if (ret) {
                        r = journal_file_move_to_object(f, OBJECT_ENTRY, in_boot, ret);
                        if (r < 0)
                                return r;
                }

                if (ret_offset)
                        *ret_offset = in_boot;

                return 1;
        }
}
3109 
/* Among the entries referencing the data object 'd', bisects for the one whose sequence number is
 * closest to 'seqnum' in the given direction. The result of generic_array_bisect_plus_one() is passed
 * through unmodified. */
int journal_file_move_to_entry_by_seqnum_for_data(
                JournalFile *f,
                Object *d,
                uint64_t seqnum,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain_start, n_total;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain_start = le64toh(d->data.entry_array_offset);
        n_total = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain_start, n_total, seqnum, test_object_seqnum, direction, ret, ret_offset, NULL);
}
3131 
/* Among the entries referencing the data object 'd', bisects for the one whose realtime timestamp is
 * closest to 'realtime' in the given direction. The result of generic_array_bisect_plus_one() is passed
 * through unmodified. */
int journal_file_move_to_entry_by_realtime_for_data(
                JournalFile *f,
                Object *d,
                uint64_t realtime,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain_start, n_total;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain_start = le64toh(d->data.entry_array_offset);
        n_total = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain_start, n_total, realtime, test_object_realtime, direction, ret, ret_offset, NULL);
}
3153 
/* Debugging helper: prints the file header (via journal_file_print_header()) and then one "Type: ..."
 * line per object to stdout.
 *
 * The walk starts at the offset given by the header's header_size field and advances by the 64-bit
 * aligned size of each object, stopping after the object located at the header's tail_object_offset.
 * Entry and tag objects get their key fields printed as well. For objects whose compression (as
 * reported by COMPRESSION_FROM_OBJECT()) is greater than COMPRESSION_NONE an additional "Flags: ..."
 * line is emitted. If an object cannot be mapped the walk is aborted and "File corrupt" is logged. */
void journal_file_dump(JournalFile *f) {
        Object *o;
        int r;
        uint64_t p;

        assert(f);
        assert(f->header);

        journal_file_print_header(f);

        p = le64toh(READ_NOW(f->header->header_size));
        while (p != 0) {
                const char *s;
                Compression c;

                r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
                if (r < 0) {
                        log_error("File corrupt");
                        return;
                }

                /* NULL for object types that have no string representation */
                s = journal_object_type_to_string(o->object.type);

                switch (o->object.type) {

                case OBJECT_ENTRY:
                        assert(s);

                        printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
                               s,
                               le64toh(o->entry.seqnum),
                               le64toh(o->entry.monotonic),
                               le64toh(o->entry.realtime));
                        break;

                case OBJECT_TAG:
                        assert(s);

                        printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n",
                               s,
                               le64toh(o->tag.seqnum),
                               le64toh(o->tag.epoch));
                        break;

                default:
                        if (s)
                                printf("Type: %s \n", s);
                        else
                                /* Terminate the line here too, so that the output for the next
                                 * object (or the "Flags:" line below) starts on a line of its own. */
                                printf("Type: unknown (%i)\n", o->object.type);

                        break;
                }

                c = COMPRESSION_FROM_OBJECT(o);
                if (c > COMPRESSION_NONE)
                        printf("Flags: %s\n",
                               compression_to_string(c));

                /* The tail object is the last one: end the loop once it has been printed. */
                if (p == le64toh(f->header->tail_object_offset))
                        p = 0;
                else
                        p += ALIGN64(le64toh(o->object.size));
        }
}
3220 
3221 /* Note: the lifetime of the compound literal is the immediately surrounding block. */
3222 #define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
3223 
journal_file_print_header(JournalFile * f)3224 void journal_file_print_header(JournalFile *f) {
3225         struct stat st;
3226 
3227         assert(f);
3228         assert(f->header);
3229 
3230         printf("File path: %s\n"
3231                "File ID: %s\n"
3232                "Machine ID: %s\n"
3233                "Boot ID: %s\n"
3234                "Sequential number ID: %s\n"
3235                "State: %s\n"
3236                "Compatible flags:%s%s\n"
3237                "Incompatible flags:%s%s%s%s%s\n"
3238                "Header size: %"PRIu64"\n"
3239                "Arena size: %"PRIu64"\n"
3240                "Data hash table size: %"PRIu64"\n"
3241                "Field hash table size: %"PRIu64"\n"
3242                "Rotate suggested: %s\n"
3243                "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3244                "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3245                "Head realtime timestamp: %s (%"PRIx64")\n"
3246                "Tail realtime timestamp: %s (%"PRIx64")\n"
3247                "Tail monotonic timestamp: %s (%"PRIx64")\n"
3248                "Objects: %"PRIu64"\n"
3249                "Entry objects: %"PRIu64"\n",
3250                f->path,
3251                SD_ID128_TO_STRING(f->header->file_id),
3252                SD_ID128_TO_STRING(f->header->machine_id),
3253                SD_ID128_TO_STRING(f->header->boot_id),
3254                SD_ID128_TO_STRING(f->header->seqnum_id),
3255                f->header->state == STATE_OFFLINE ? "OFFLINE" :
3256                f->header->state == STATE_ONLINE ? "ONLINE" :
3257                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3258                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3259                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3260                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3261                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3262                JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
3263                JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
3264                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3265                le64toh(f->header->header_size),
3266                le64toh(f->header->arena_size),
3267                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3268                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3269                yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
3270                le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3271                le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3272                FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3273                FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3274                FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3275                le64toh(f->header->n_objects),
3276                le64toh(f->header->n_entries));
3277 
3278         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3279                 printf("Data objects: %"PRIu64"\n"
3280                        "Data hash table fill: %.1f%%\n",
3281                        le64toh(f->header->n_data),
3282                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3283 
3284         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3285                 printf("Field objects: %"PRIu64"\n"
3286                        "Field hash table fill: %.1f%%\n",
3287                        le64toh(f->header->n_fields),
3288                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3289 
3290         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3291                 printf("Tag objects: %"PRIu64"\n",
3292                        le64toh(f->header->n_tags));
3293         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3294                 printf("Entry array objects: %"PRIu64"\n",
3295                        le64toh(f->header->n_entry_arrays));
3296 
3297         if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
3298                 printf("Deepest field hash chain: %" PRIu64"\n",
3299                        f->header->field_hash_chain_depth);
3300 
3301         if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
3302                 printf("Deepest data hash chain: %" PRIu64"\n",
3303                        f->header->data_hash_chain_depth);
3304 
3305         if (fstat(f->fd, &st) >= 0)
3306                 printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
3307 }
3308 
/* Emits a notice if the journal file lives on btrfs and does not carry the FS_NOCOW_FL attribute.
 *
 * Returns 1 if the notice was logged, 0 if the file is not on btrfs or copy-on-write is already
 * disabled for it, and otherwise the value produced by log_warning_errno() for the failed check. */
static int journal_file_warn_btrfs(JournalFile *f) {
        unsigned file_attrs;
        int q;

        assert(f);

        /* Our write pattern is rather unfriendly to COW file systems such as btrfs. Hence, before
         * anything is written, check whether COW is turned off for the file: that should improve
         * performance considerably, at the price of the file system's data integrity features (which
         * isn't too bad, given that we do our own checksumming). */

        q = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
        if (q < 0)
                return log_warning_errno(q, "Failed to determine if journal is on btrfs: %m");
        if (q == 0)
                return 0; /* some other file system, nothing to complain about */

        q = read_attr_fd(f->fd, &file_attrs);
        if (q < 0)
                return log_warning_errno(q, "Failed to read file attributes: %m");

        if (!(file_attrs & FS_NOCOW_FL)) {
                log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
                           "This is likely to slow down journal access substantially, please consider turning "
                           "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
                return 1;
        }

        log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
        return 0;
}
3343 
/* Fills in and sanitizes the metrics in 'm', in place.
 *
 * Every field that is set to UINT64_MAX ("pick automatically") is replaced by a default, which for
 * max_use, min_use and keep_free is derived from the size of the file system 'fd' lives on, if that
 * size can be determined. Explicitly configured values are page aligned and forced into a consistent
 * relationship:
 *
 *   - max_use, unless 0, is at least two times JOURNAL_FILE_SIZE_MIN
 *   - min_use never exceeds max_use
 *   - max_size, unless 0, is at least JOURNAL_FILE_SIZE_MIN, and max_use (unless 0) is raised to at
 *     least two times max_size
 *   - min_size lies between JOURNAL_FILE_SIZE_MIN and max_size (no upper bound if max_size is 0)
 *
 * The resulting values are logged at debug level. */
static void journal_default_metrics(JournalMetrics *m, int fd) {
        struct statvfs ss;
        uint64_t fs_size = 0;

        assert(m);
        assert(fd >= 0);

        if (fstatvfs(fd, &ss) >= 0) {
                uint64_t frsize = ss.f_frsize, blocks = ss.f_blocks;

                /* Do the multiplication explicitly in 64 bit, independently of how wide the fields of
                 * struct statvfs happen to be, and don't use a product that wrapped around: if the
                 * size cannot be represented, act as if it was unknown. */
                if (blocks == 0 || frsize <= UINT64_MAX / blocks)
                        fs_size = frsize * blocks;
                else
                        log_debug("File system size exceeds the 64 bit range, ignoring.");
        } else
                log_debug_errno(errno, "Failed to determine disk size: %m");

        if (m->max_use == UINT64_MAX) {

                if (fs_size > 0)
                        m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
                                           MAX_USE_LOWER, MAX_USE_UPPER);
                else
                        m->max_use = MAX_USE_LOWER;
        } else {
                m->max_use = PAGE_ALIGN(m->max_use);

                /* A value of 0 is left untouched by all the fix-ups below */
                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        }

        if (m->min_use == UINT64_MAX) {
                if (fs_size > 0)
                        m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
                                           MIN_USE_LOW, MIN_USE_HIGH);
                else
                        m->min_use = MIN_USE_LOW;
        }

        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        if (m->max_size == UINT64_MAX)
                m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
                                  MAX_SIZE_UPPER);
        else
                m->max_size = PAGE_ALIGN(m->max_size);

        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* Make sure at least two files of maximum size fit into the overall budget */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        if (m->min_size == UINT64_MAX)
                m->min_size = JOURNAL_FILE_SIZE_MIN;
        else
                m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
                                    JOURNAL_FILE_SIZE_MIN,
                                    m->max_size ?: UINT64_MAX);

        if (m->keep_free == UINT64_MAX) {
                if (fs_size > 0)
                        m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
                                           KEEP_FREE_UPPER);
                else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == UINT64_MAX)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  FORMAT_BYTES(m->min_use),
                  FORMAT_BYTES(m->max_use),
                  FORMAT_BYTES(m->max_size),
                  FORMAT_BYTES(m->min_size),
                  FORMAT_BYTES(m->keep_free),
                  m->n_max_files);
}
3421 
/* Opens a journal file and returns a newly allocated JournalFile object for it in *ret.
 *
 * fd                        If >= 0 this already open file descriptor is used. Possession of it is
 *                           only taken once the function succeeds; on failure it is left for the
 *                           caller. If < 0 the file at 'fname' is opened.
 * fname                     Path of the file. May be NULL if 'fd' is valid, in which case a
 *                           "/proc/self/fd/<fd>" path is synthesized for the object. If O_CREAT is
 *                           requested the name must end in ".journal".
 * open_flags                open() style flags. The access mode must be O_RDONLY or O_RDWR, and
 *                           O_CREAT is refused in combination with O_RDONLY.
 * file_flags                Passed to journal_file_init_header() when the file is newly created.
 * mode                      Access mode passed on when opening 'fname'.
 * compress_threshold_bytes  UINT64_MAX selects DEFAULT_COMPRESS_THRESHOLD, any other value is raised
 *                           to at least MIN_COMPRESS_THRESHOLD.
 * metrics                   Optional. For writable files these are fixed up in place by
 *                           journal_default_metrics() and then copied into the new object.
 * mmap_cache                The mmap cache the file descriptor is registered with. Required.
 * template                  Optional. Passed to journal_file_init_header() for new files, source of
 *                           the metrics if 'metrics' is NULL, and source of the post-change timer
 *                           settings if the template has such a timer.
 *
 * Returns 0 on success and a negative errno-style value on failure, among them -EINVAL for invalid
 * arguments, -ENOMEM, -ENODATA if the file is smaller than HEADER_SIZE_MIN, -EAFNOSUPPORT if mapping
 * the header failed with -EINVAL, and -EIO if a SIGBUS was seen on the mapping. On failure the
 * partially set up object is released again, and a file newly created from 'fname' is unlinked. */
int journal_file_open(
                int fd,
                const char *fname,
                int open_flags,
                JournalFileFlags file_flags,
                mode_t mode,
                uint64_t compress_threshold_bytes,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;
        JournalFile *f;
        void *h;
        int r;

        assert(ret);
        assert(fd >= 0 || fname);
        assert(mmap_cache);

        if (!IN_SET((open_flags & O_ACCMODE), O_RDONLY, O_RDWR))
                return -EINVAL;

        if ((open_flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(open_flags, O_CREAT))
                return -EINVAL;

        if (fname && (open_flags & O_CREAT) && !endswith(fname, ".journal"))
                return -EINVAL;

        f = new(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        *f = (JournalFile) {
                .fd = fd,
                .mode = mode,
                .open_flags = open_flags,
                .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                            DEFAULT_COMPRESS_THRESHOLD :
                                            MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
        };

        if (fname) {
                f->path = strdup(fname);
                if (!f->path) {
                        r = -ENOMEM;
                        goto fail;
                }
        } else {
                assert(fd >= 0);

                /* If we don't know the path, fill in something explanatory and vaguely useful. Note
                 * that journal_file_archive() recognizes such synthesized paths by their
                 * "/proc/self/fd" prefix, hence the two must be kept in sync. */
                if (asprintf(&f->path, "/proc/self/fd/%i", fd) < 0) {
                        /* The pointer is in an undefined state if asprintf() fails, make sure the
                         * cleanup path below doesn't get to see it. */
                        f->path = NULL;
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {
                r = -ENOMEM;
                goto fail;
        }

        if (f->fd < 0) {
                /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
                 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
                 * it doesn't hurt in that case. */

                f->fd = openat_report_new(AT_FDCWD, f->path, f->open_flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created);
                if (f->fd < 0) {
                        r = f->fd;
                        goto fail;
                }

                /* fds we opened here by us should also be closed by us. */
                f->close_fd = true;

                r = fd_nonblock(f->fd, false);
                if (r < 0)
                        goto fail;

                if (!newly_created) {
                        r = journal_file_fstat(f);
                        if (r < 0)
                                goto fail;
                }
        } else {
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;

                /* If we just got the fd passed in, we don't really know if we created the file anew */
                newly_created = f->last_stat.st_size == 0 && journal_file_writable(f);
        }

        f->cache_fd = mmap_cache_add_fd(mmap_cache, f->fd, prot_from_flags(open_flags));
        if (!f->cache_fd) {
                r = -ENOMEM;
                goto fail;
        }

        if (newly_created) {
                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
                 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
                 * solely on mtime/atime/ctime of the file. */
                (void) fd_setcrtime(f->fd, 0);

                r = journal_file_init_header(f, file_flags, template);
                if (r < 0)
                        goto fail;

                /* Refresh the stat data, it is checked against the minimal header size below */
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;
        }

        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -ENODATA;
                goto fail;
        }

        r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        if (r == -EINVAL) {
                /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
                 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
                 * code. */
                r = -EAFNOSUPPORT;
                goto fail;
        }
        if (r < 0)
                goto fail;

        f->header = h;

        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        if (!newly_created && journal_file_writable(f) && JOURNAL_HEADER_SEALED(f->header)) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (journal_file_writable(f)) {
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#if HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd)) {
                r = -EIO;
                goto fail;
        }

        if (template && template->post_change_timer) {
                r = journal_file_enable_post_change_timer(
                                f,
                                sd_event_source_get_event(template->post_change_timer),
                                template->post_change_timer_period);

                if (r < 0)
                        goto fail;
        }

        /* The file is opened now successfully, thus we take possession of any passed in fd. */
        f->close_fd = true;

        if (DEBUG_LOGGING) {
                /* Remember what was logged last, so that the effective settings are only logged again
                 * when they differ from those of the previously opened file. */
                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                static uint64_t last_bytes = UINT64_MAX;

                if (last_seal != JOURNAL_HEADER_SEALED(f->header) ||
                    last_keyed_hash != JOURNAL_HEADER_KEYED_HASH(f->header) ||
                    last_compress != JOURNAL_FILE_COMPRESS(f) ||
                    last_bytes != f->compress_threshold_bytes) {

                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
                                  yes_no(JOURNAL_HEADER_SEALED(f->header)), yes_no(JOURNAL_HEADER_KEYED_HASH(f->header)),
                                  yes_no(JOURNAL_FILE_COMPRESS(f)), FORMAT_BYTES(f->compress_threshold_bytes));
                        last_seal = JOURNAL_HEADER_SEALED(f->header);
                        last_keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);
                        last_compress = JOURNAL_FILE_COMPRESS(f);
                        last_bytes = f->compress_threshold_bytes;
                }
        }

        *ret = f;
        return 0;

fail:
        /* A SIGBUS on the mapping takes precedence over whatever error was collected before */
        if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        (void) journal_file_close(f);

        /* Don't leave a half-initialized file around if we created it ourselves from a path */
        if (newly_created && fd < 0)
                (void) unlink(fname);

        return r;
}
3661 
/* Renames the writable journal file 'f' from "<name>.journal" to
 * "<name>@<seqnum id>-<head seqnum>-<head realtime>.journal" and marks it for archival.
 *
 * If 'ret_previous_path' is non-NULL the old path string is handed to the caller through it, otherwise
 * it is freed. Returns 0 on success, -EINVAL if the file is not writable or its path is not suitable,
 * -ENOMEM if the new name cannot be allocated, or the negative errno of a failed rename(). */
int journal_file_archive(JournalFile *f, char **ret_previous_path) {
        _cleanup_free_ char *archived = NULL;
        int stem_len, k;

        assert(f);

        /* Three reasons to refuse: the file is read-only; the file was passed to us as fd, in which
         * case the path is a synthesized one and we couldn't rename the file since we don't know
         * where it really is; or the file name doesn't carry the suffix we are going to replace. */
        if (!journal_file_writable(f) ||
            path_startswith(f->path, "/proc/self/fd") ||
            !endswith(f->path, ".journal"))
                return -EINVAL;

        /* Length of the path without the trailing ".journal" */
        stem_len = (int) strlen(f->path) - 8;

        k = asprintf(&archived, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
                     stem_len, f->path,
                     SD_ID128_FORMAT_VAL(f->header->seqnum_id),
                     le64toh(f->header->head_entry_seqnum),
                     le64toh(f->header->head_entry_realtime));
        if (k < 0)
                return -ENOMEM;

        /* Move the file to its archived name. ENOENT means the file was deleted already, which is not
         * treated as an error. */
        if (rename(f->path, archived) < 0 && errno != ENOENT)
                return -errno;

        /* Sync the rename to disk */
        (void) fsync_directory_of_file(f->fd);

        /* Either pass the old path on to the caller, or get rid of it */
        if (!ret_previous_path)
                free(f->path);
        else
                *ret_previous_path = f->path;

        f->path = TAKE_PTR(archived);

        /* Only queue the archival here by setting the archive bit, and leave the header state at
         * STATE_ONLINE: journal_file_set_offline() short-circuits when state != STATE_ONLINE, so
         * setting STATE_ARCHIVED directly would result in the rotated journal never getting fsync()
         * called before closing. Offlining then commits the file with state=STATE_ARCHIVED. */
        f->archive = true;

        return 0;
}
3709 
journal_file_dispose(int dir_fd,const char * fname)3710 int journal_file_dispose(int dir_fd, const char *fname) {
3711         _cleanup_free_ char *p = NULL;
3712 
3713         assert(fname);
3714 
3715         /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
3716          * this is done without looking into the file or changing any of its contents. The idea is that this is called
3717          * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3718          * for writing anymore. */
3719 
3720         if (!endswith(fname, ".journal"))
3721                 return -EINVAL;
3722 
3723         if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3724                      (int) strlen(fname) - 8, fname,
3725                      now(CLOCK_REALTIME),
3726                      random_u64()) < 0)
3727                 return -ENOMEM;
3728 
3729         if (renameat(dir_fd, fname, dir_fd, p) < 0)
3730                 return -errno;
3731 
3732         return 0;
3733 }
3734 
/* Copies the entry object 'o', located at offset 'p' of journal file 'from', into journal file 'to'.
 *
 * Each data object referenced by the entry is looked up in 'from', decompressed if necessary, and
 * appended to 'to' with journal_file_append_data(). Finally a new entry referencing the appended data
 * objects is added with journal_file_append_entry_internal(), carrying over the timestamps and the
 * boot ID of the original entry.
 *
 * Returns the result of journal_file_append_entry_internal() on success, and a negative errno-style
 * value otherwise: -EPERM if 'to' is not writable, -EBADMSG for data objects that are too small or
 * have an empty payload, -E2BIG if a payload size doesn't fit into size_t, -EPROTONOSUPPORT for
 * unsupported compression, -EIO if a SIGBUS was seen on the mapping of 'to', or whatever error the
 * called helpers return. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
        uint64_t q, n, xor_hash = 0;
        sd_id128_t boot_id;
        dual_timestamp ts;
        EntryItem *items;
        int r;

        assert(from);
        assert(to);
        assert(o);
        assert(p);

        if (!journal_file_writable(to))
                return -EPERM;

        /* Copy everything we need from the entry object by value right away: 'o' is re-pointed to
         * other objects and looked up anew in the loop below, hence a pointer into the object we were
         * called with might not refer to a valid mapping anymore by the time the entry is appended. */
        ts = (dual_timestamp) {
                .monotonic = le64toh(o->entry.monotonic),
                .realtime = le64toh(o->entry.realtime),
        };
        boot_id = o->entry.boot_id;

        n = journal_file_entry_n_items(o);
        /* NOTE(review): n is taken from the file; presumably newa() allocates on the stack, confirm
         * that the item count is bounded sufficiently for that. */
        items = newa(EntryItem, n);

        for (uint64_t i = 0; i < n; i++) {
                Compression c;
                uint64_t l, h;
                size_t t;
                void *data;
                Object *u;

                q = le64toh(o->entry.items[i].object_offset);

                /* From here on 'o' refers to the data object, until the entry is re-acquired at the
                 * end of the iteration */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                l = le64toh(READ_NOW(o->object.size));
                if (l < offsetof(Object, data.payload))
                        return -EBADMSG;

                /* Payload size is the object size minus the fixed part of the data object */
                l -= offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                c = COMPRESSION_FROM_OBJECT(o);
                if (c < 0)
                        return -EPROTONOSUPPORT;
                if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
                        size_t rsize = 0;

                        r = decompress_blob(
                                        c,
                                        o->data.payload, l,
                                        &from->compress_buffer, &rsize,
                                        0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                if (l == 0)
                        return -EBADMSG;

                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* If the destination uses keyed hashes the hash stored in its data object is not used
                 * for the entry's XOR hash; the jenkins hash of the payload is used instead. */
                if (JOURNAL_HEADER_KEYED_HASH(to->header))
                        xor_hash ^= jenkins_hash64(data, l);
                else
                        xor_hash ^= le64toh(u->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(h),
                        .hash = u->data.hash,
                };

                /* Get back to the entry object, so that the next item can be read from it */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, &boot_id, xor_hash, items, n, NULL, NULL, NULL);

        if (mmap_cache_fd_got_sigbus(to->cache_fd))
                return -EIO;

        return r;
}
3835 
/* Resets all metrics in 'm' to "pick automatic values". */
void journal_reset_metrics(JournalMetrics *m) {
        /* UINT64_MAX is the marker for "choose automatically"; members not listed here are zero
         * initialized. */
        static const JournalMetrics automatic = {
                .min_use = UINT64_MAX,
                .max_use = UINT64_MAX,
                .min_size = UINT64_MAX,
                .max_size = UINT64_MAX,
                .keep_free = UINT64_MAX,
                .n_max_files = UINT64_MAX,
        };

        assert(m);

        *m = automatic;
}
3850 
/* Reads the realtime timestamps of the first and last entry from the header of 'f'.
 *
 * 'from' receives the head entry timestamp and 'to' the tail entry timestamp; either may be NULL, but
 * not both. Returns 1 on success and -ENOENT if a requested timestamp is zero in the header. Note that
 * 'from' is processed first, hence *from may already have been written when -ENOENT is returned
 * because of the tail timestamp. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
        assert(f);
        assert(f->header);
        assert(from || to);

        if (from && f->header->head_entry_realtime == 0)
                return -ENOENT;
        if (from)
                *from = le64toh(f->header->head_entry_realtime);

        if (to && f->header->tail_entry_realtime == 0)
                return -ENOENT;
        if (to)
                *to = le64toh(f->header->tail_entry_realtime);

        return 1;
}
3872 
/* Determines the monotonic timestamps of the first and last entry linked from the data object that
 * find_data_object_by_boot_id() returns for 'boot_id'.
 *
 * 'from' receives the timestamp of the entry at the data object's entry_offset, 'to' the timestamp of
 * the entry at index n_entries - 1 of its entry list; either may be NULL, but not both. Returns 1 on
 * success, 0 if no such data object was found or it has no entries, and otherwise the (non-positive)
 * result of the helper that failed. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
        uint64_t data_offset;
        Object *obj;
        int k;

        assert(f);
        assert(from || to);

        k = find_data_object_by_boot_id(f, boot_id, &obj, &data_offset);
        if (k <= 0)
                return k;

        if (le64toh(obj->data.n_entries) == 0)
                return 0;

        if (from) {
                /* From here on 'obj' refers to the first entry instead of the data object */
                k = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(obj->data.entry_offset), &obj);
                if (k < 0)
                        return k;

                *from = le64toh(obj->entry.monotonic);
        }

        if (to) {
                uint64_t first_entry, entry_array, last_index;

                /* 'obj' might point to an entry by now, hence look up the data object again */
                k = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &obj);
                if (k < 0)
                        return k;

                first_entry = le64toh(obj->data.entry_offset);
                entry_array = le64toh(obj->data.entry_array_offset);
                last_index = le64toh(obj->data.n_entries) - 1;

                k = generic_array_get_plus_one(f,
                                               first_entry,
                                               entry_array,
                                               last_index,
                                               DIRECTION_UP,
                                               &obj, NULL);
                if (k <= 0)
                        return k;

                *to = le64toh(obj->entry.monotonic);
        }

        return 1;
}
3915 
/* Checks a number of heuristics to decide whether it would be a good idea to rotate the journal file.
 *
 * f             – the journal file to check; its header must be available
 * max_file_usec – maximum time span a file shall cover, counted from the realtime timestamp of the head
 *                 entry; 0 disables this check
 * log_level     – log level used for the message explaining why rotation is suggested
 *
 * Returns true (after logging the reason) as soon as one of the checks triggers, false otherwise. */
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {
        assert(f);
        assert(f->header);

        /* If we gained new header fields we gained new features,
         * hence suggest a rotation */
        if (le64toh(f->header->header_size) < sizeof(Header)) {
                log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);
                return true;
        }

        /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
         * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
         * need the n_data field, which only exists in newer versions. */

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) {
                uint64_t n_data = le64toh(f->header->n_data),
                        n_buckets = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);

                /* n_data / n_buckets > 3/4, expressed without a division. Note that n_data is
                 * necessarily non-zero if this is true, hence the division below is safe. */
                if (n_data * 4ULL > n_buckets * 3ULL) {
                        log_full(log_level,
                                 "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) n_data / ((double) n_buckets),
                                 n_data,
                                 n_buckets,
                                 (unsigned long long) f->last_stat.st_size,
                                 f->last_stat.st_size / n_data);
                        return true;
                }
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) {
                uint64_t n_fields = le64toh(f->header->n_fields),
                        n_buckets = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);

                if (n_fields * 4ULL > n_buckets * 3ULL) {
                        log_full(log_level,
                                 "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) n_fields / ((double) n_buckets),
                                 n_fields,
                                 n_buckets);
                        return true;
                }
        }

        /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
         * longest chain is longer than some threshold, let's suggest rotation. */
        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
            le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                log_full(log_level,
                         "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
                         f->path, le64toh(f->header->data_hash_chain_depth));
                return true;
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
            le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                log_full(log_level,
                         "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
                         f->path, le64toh(f->header->field_hash_chain_depth));
                return true;
        }

        /* Are the data objects properly indexed by field objects? */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
            JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
            le64toh(f->header->n_data) > 0 &&
            le64toh(f->header->n_fields) == 0) {
                log_full(log_level,
                         "Data objects of %s are not indexed by field objects, suggesting rotation.",
                         f->path);
                return true;
        }

        if (max_file_usec > 0) {
                usec_t t, h;

                h = le64toh(f->header->head_entry_realtime);
                t = now(CLOCK_REALTIME);

                /* Compare the age of the head entry against the limit by subtracting rather than by
                 * adding: "h + max_file_usec" wraps around if max_file_usec is very large (for example
                 * if it is set to the maximum usec_t value to express "no limit"), which would make us
                 * suggest rotation all the time. */
                if (h > 0 && t > h && t - h > max_file_usec) {
                        log_full(log_level,
                                 "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
                                 f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
                        return true;
                }
        }

        return false;
}
4000 
4001 static const char * const journal_object_type_table[] = {
4002         [OBJECT_UNUSED] = "unused",
4003         [OBJECT_DATA] = "data",
4004         [OBJECT_FIELD] = "field",
4005         [OBJECT_ENTRY] = "entry",
4006         [OBJECT_DATA_HASH_TABLE] = "data hash table",
4007         [OBJECT_FIELD_HASH_TABLE] = "field hash table",
4008         [OBJECT_ENTRY_ARRAY] = "entry array",
4009         [OBJECT_TAG] = "tag",
4010 };
4011 
4012 DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);
4013