1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <pthread.h>
4 #include <unistd.h>
5
6 #include "chattr-util.h"
7 #include "copy.h"
8 #include "errno-util.h"
9 #include "fd-util.h"
10 #include "format-util.h"
11 #include "journal-authenticate.h"
12 #include "managed-journal-file.h"
13 #include "path-util.h"
14 #include "random-util.h"
15 #include "set.h"
16 #include "stat-util.h"
17 #include "sync-util.h"
18
/* Size in bytes (16 KiB) of the on-stack buffer that managed_journal_file_punch_holes() uses to read
 * data hash table items in bulk. */
#define PAYLOAD_BUFFER_SIZE (16U * 1024U)
/* Threshold in bytes (512 KiB): unused entry array space smaller than this is left untouched by
 * managed_journal_file_entry_array_punch_hole(). */
#define MINIMUM_HOLE_SIZE (1U * 1024U * 1024U / 2U)
21
/* Cuts off everything that follows the tail object of a journal file.
 *
 * The end of the tail object is looked up with journal_file_tail_end_by_pread(), the header's
 * arena_size is adjusted to match, and the file is shortened to that offset.
 *
 * Returns the result of journal_file_fstat() on success, or a negative errno-style value if the tail
 * end cannot be determined or ftruncate() fails. */
static int managed_journal_file_truncate(JournalFile *f) {
        uint64_t tail_end;
        int r;

        /* Everything past the end of the last object is excess in an archive. */
        r = journal_file_tail_end_by_pread(f, &tail_end);
        if (r < 0)
                return log_debug_errno(r, "Failed to determine end of tail object: %m");

        /* The arena may never be larger than the file, hence shrink the recorded arena size first
         * and only afterwards shorten the file itself. */
        f->header->arena_size = htole64(tail_end - le64toh(f->header->header_size));

        r = ftruncate(f->fd, tail_end);
        if (r < 0)
                return log_debug_errno(errno, "Failed to truncate %s: %m", f->path);

        /* Refresh the cached stat data now that the size changed. */
        return journal_file_fstat(f);
}
39
/* Releases the unused trailing space of an entry array chain.
 *
 * f         - journal file to operate on
 * p         - offset of the first entry array object of the chain (0 means "no chain")
 * n_entries - number of items of the chain that are actually in use
 *
 * The chain is walked to its last array while summing up the capacity of all arrays. Whatever capacity
 * exceeds n_entries is treated as unused space at the end of the last array. If that space is smaller
 * than MINIMUM_HOLE_SIZE nothing is done. Otherwise, if the last array is the tail object of the file
 * and the file is not sealed, the object is shrunk and the file truncated; in all other cases a hole is
 * punched with fallocate().
 *
 * Returns 0 on success or if there was nothing to do, -EBADMSG if the chain is inconsistent with
 * n_entries, -EOPNOTSUPP if the file system cannot punch holes, or another negative errno-style value
 * on failure. */
static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t p, uint64_t n_entries) {
        Object o;
        uint64_t offset, sz, n_items = 0, n_unused, n_last;
        int r;

        if (n_entries == 0)
                return 0;

        /* Walk the chain. When the loop ends, p is the offset of the last array and o its header. */
        for (uint64_t q = p; q != 0; q = le64toh(o.entry_array.next_entry_array_offset)) {
                r = journal_file_read_object_header(f, OBJECT_ENTRY_ARRAY, q, &o);
                if (r < 0)
                        return r;

                n_items += journal_file_entry_array_n_items(&o);
                p = q;
        }

        /* No chain at all: o was never filled in, so bail out before touching it. */
        if (p == 0)
                return 0;

        if (n_entries > n_items)
                return -EBADMSG;

        /* Amount of unused items in the final entry array. */
        n_unused = n_items - n_entries;

        if (n_unused == 0)
                return 0;

        /* All unused items are expected to live in the final array. If more items are unused than the
         * final array can hold, the chain is inconsistent; refuse to go on, since the unsigned
         * subtraction below would wrap around and yield a bogus offset that we would then use to
         * rewrite the object size and truncate or hole-punch the file. */
        n_last = journal_file_entry_array_n_items(&o);
        if (n_unused > n_last)
                return -EBADMSG;

        /* First byte after the last used item, and the number of bytes from there to the object end. */
        offset = p + offsetof(Object, entry_array.items) +
                (n_last - n_unused) * sizeof(le64_t);
        sz = p + le64toh(o.object.size) - offset;

        if (sz < MINIMUM_HOLE_SIZE)
                return 0;

        if (p == le64toh(f->header->tail_object_offset) && !JOURNAL_HEADER_SEALED(f->header)) {
                ssize_t n;

                /* The array is the last object of an unsealed file: shrink the object and cut the
                 * file instead of punching a hole. */
                o.object.size = htole64(offset - p);

                n = pwrite(f->fd, &o, sizeof(EntryArrayObject), p);
                if (n < 0)
                        return log_debug_errno(errno, "Failed to modify entry array object size: %m");
                if ((size_t) n != sizeof(EntryArrayObject))
                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short pwrite() while modifying entry array object size.");

                /* Update the arena size before truncating, so that it never exceeds the file size. */
                f->header->arena_size = htole64(ALIGN64(offset) - le64toh(f->header->header_size));

                if (ftruncate(f->fd, ALIGN64(offset)) < 0)
                        return log_debug_errno(errno, "Failed to truncate %s: %m", f->path);

                return 0;
        }

        if (fallocate(f->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, sz) < 0) {
                if (ERRNO_IS_NOT_SUPPORTED(errno)) {
                        log_debug("Hole punching not supported by backing file system, skipping.");
                        return -EOPNOTSUPP; /* Make recognizable */
                }

                return log_debug_errno(errno, "Failed to punch hole in entry array of %s: %m", f->path);
        }

        return 0;
}
106
/* Releases unused entry array space throughout a journal file.
 *
 * First the global entry array chain referenced from the header is handled; a failure there is
 * returned to the caller. Then the data hash table is read in chunks of up to PAYLOAD_BUFFER_SIZE bytes
 * and, for every data object reachable from it, the object's own entry array chain is handled as well.
 * Errors for individual data objects are ignored, except -EOPNOTSUPP, which ends the whole operation.
 *
 * Returns 0 on success or a negative errno-style value on failure. */
static int managed_journal_file_punch_holes(JournalFile *f) {
        HashItem items[PAYLOAD_BUFFER_SIZE / sizeof(HashItem)];
        uint64_t table_start, table_end;
        ssize_t n = SSIZE_MAX;
        int r;

        r = managed_journal_file_entry_array_punch_hole(
                        f,
                        le64toh(f->header->entry_array_offset),
                        le64toh(f->header->n_entries));
        if (r < 0)
                return r;

        table_start = le64toh(f->header->data_hash_table_offset);
        table_end = table_start + le64toh(f->header->data_hash_table_size);

        /* n starts out positive so that the first iteration runs; afterwards a read that yields no
         * complete item (n == 0) terminates the loop. */
        for (uint64_t pos = table_start; pos < table_end && n > 0; pos += n) {
                size_t want = MIN(sizeof(items), table_end - pos);
                size_t n_read_items;

                n = pread(f->fd, items, want, pos);
                if (n < 0)
                        return log_debug_errno(errno, "Failed to read hash table items: %m");

                /* Drop a trailing partial item by rounding down to a multiple of the item size. */
                n -= n % sizeof(HashItem);
                n_read_items = (size_t) n / sizeof(HashItem);

                for (size_t j = 0; j < n_read_items; j++) {
                        Object o;

                        /* Follow the chain of data objects hanging off this hash bucket. */
                        for (uint64_t q = le64toh(items[j].head_hash_offset);
                             q != 0;
                             q = le64toh(o.data.next_hash_offset)) {

                                r = journal_file_read_object_header(f, OBJECT_DATA, q, &o);
                                if (r < 0) {
                                        /* Give up on the remainder of this bucket only. */
                                        log_debug_errno(r, "Invalid data object: %m, ignoring");
                                        break;
                                }

                                if (le64toh(o.data.n_entries) != 0) {
                                        r = managed_journal_file_entry_array_punch_hole(
                                                        f,
                                                        le64toh(o.data.entry_array_offset),
                                                        le64toh(o.data.n_entries) - 1);
                                        /* No point in trying further if the file system can't do it;
                                         * any other error is ignored. */
                                        if (r == -EOPNOTSUPP)
                                                return -EOPNOTSUPP;
                                }
                        }
                }
        }

        return 0;
}
157
158 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
159 * As a result we use atomic operations on f->offline_state for inter-thread communications with
160 * journal_file_set_offline() and journal_file_set_online(). */
managed_journal_file_set_offline_internal(ManagedJournalFile * f)161 static void managed_journal_file_set_offline_internal(ManagedJournalFile *f) {
162 int r;
163
164 assert(f);
165 assert(f->file->fd >= 0);
166 assert(f->file->header);
167
168 for (;;) {
169 switch (f->file->offline_state) {
170 case OFFLINE_CANCEL:
171 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
172 continue;
173 return;
174
175 case OFFLINE_AGAIN_FROM_SYNCING:
176 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
177 continue;
178 break;
179
180 case OFFLINE_AGAIN_FROM_OFFLINING:
181 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
182 continue;
183 break;
184
185 case OFFLINE_SYNCING:
186 if (f->file->archive) {
187 (void) managed_journal_file_truncate(f->file);
188 (void) managed_journal_file_punch_holes(f->file);
189 }
190
191 (void) fsync(f->file->fd);
192
193 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
194 continue;
195
196 f->file->header->state = f->file->archive ? STATE_ARCHIVED : STATE_OFFLINE;
197 (void) fsync(f->file->fd);
198
199 /* If we've archived the journal file, first try to re-enable COW on the file. If the
200 * FS_NOCOW_FL flag was never set or we successfully removed it, continue. If we fail
201 * to remove the flag on the archived file, rewrite the file without the NOCOW flag.
202 * We need this fallback because on some filesystems (BTRFS), the NOCOW flag cannot
203 * be removed after data has been written to a file. The only way to remove it is to
204 * copy all data to a new file without the NOCOW flag set. */
205
206 if (f->file->archive) {
207 r = chattr_fd(f->file->fd, 0, FS_NOCOW_FL, NULL);
208 if (r >= 0)
209 continue;
210
211 log_debug_errno(r, "Failed to re-enable copy-on-write for %s: %m, rewriting file", f->file->path);
212
213 r = copy_file_atomic(FORMAT_PROC_FD_PATH(f->file->fd), f->file->path, f->file->mode,
214 0,
215 FS_NOCOW_FL,
216 COPY_REPLACE | COPY_FSYNC | COPY_HOLES | COPY_ALL_XATTRS);
217 if (r < 0) {
218 log_debug_errno(r, "Failed to rewrite %s: %m", f->file->path);
219 continue;
220 }
221 }
222
223 break;
224
225 case OFFLINE_OFFLINING:
226 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
227 continue;
228 _fallthrough_;
229 case OFFLINE_DONE:
230 return;
231
232 case OFFLINE_JOINED:
233 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
234 return;
235 }
236 }
237 }
238
managed_journal_file_set_offline_thread(void * arg)239 static void * managed_journal_file_set_offline_thread(void *arg) {
240 ManagedJournalFile *f = arg;
241
242 (void) pthread_setname_np(pthread_self(), "journal-offline");
243
244 managed_journal_file_set_offline_internal(f);
245
246 return NULL;
247 }
248
249 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
managed_journal_file_set_offline_try_restart(ManagedJournalFile * f)250 static bool managed_journal_file_set_offline_try_restart(ManagedJournalFile *f) {
251 for (;;) {
252 switch (f->file->offline_state) {
253 case OFFLINE_AGAIN_FROM_SYNCING:
254 case OFFLINE_AGAIN_FROM_OFFLINING:
255 return true;
256
257 case OFFLINE_CANCEL:
258 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
259 continue;
260 return true;
261
262 case OFFLINE_SYNCING:
263 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
264 continue;
265 return true;
266
267 case OFFLINE_OFFLINING:
268 if (!__sync_bool_compare_and_swap(&f->file->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
269 continue;
270 return true;
271
272 default:
273 return false;
274 }
275 }
276 }
277
278 /* Sets a journal offline.
279 *
280 * If wait is false then an offline is dispatched in a separate thread for a
281 * subsequent journal_file_set_offline() or journal_file_set_online() of the
282 * same journal to synchronize with.
283 *
284 * If wait is true, then either an existing offline thread will be restarted
285 * and joined, or if none exists the offline is simply performed in this
286 * context without involving another thread.
287 */
managed_journal_file_set_offline(ManagedJournalFile * f,bool wait)288 int managed_journal_file_set_offline(ManagedJournalFile *f, bool wait) {
289 int target_state;
290 bool restarted;
291 int r;
292
293 assert(f);
294
295 if (!journal_file_writable(f->file))
296 return -EPERM;
297
298 if (f->file->fd < 0 || !f->file->header)
299 return -EINVAL;
300
301 target_state = f->file->archive ? STATE_ARCHIVED : STATE_OFFLINE;
302
303 /* An offlining journal is implicitly online and may modify f->header->state,
304 * we must also join any potentially lingering offline thread when already in
305 * the desired offline state.
306 */
307 if (!managed_journal_file_is_offlining(f) && f->file->header->state == target_state)
308 return journal_file_set_offline_thread_join(f->file);
309
310 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
311 restarted = managed_journal_file_set_offline_try_restart(f);
312 if ((restarted && wait) || !restarted) {
313 r = journal_file_set_offline_thread_join(f->file);
314 if (r < 0)
315 return r;
316 }
317
318 if (restarted)
319 return 0;
320
321 /* Initiate a new offline. */
322 f->file->offline_state = OFFLINE_SYNCING;
323
324 if (wait) /* Without using a thread if waiting. */
325 managed_journal_file_set_offline_internal(f);
326 else {
327 sigset_t ss, saved_ss;
328 int k;
329
330 assert_se(sigfillset(&ss) >= 0);
331 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
332 * Asynchronous SIGBUS signals can safely be handled by either thread. */
333 assert_se(sigdelset(&ss, SIGBUS) >= 0);
334
335 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
336 if (r > 0)
337 return -r;
338
339 r = pthread_create(&f->file->offline_thread, NULL, managed_journal_file_set_offline_thread, f);
340
341 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
342 if (r > 0) {
343 f->file->offline_state = OFFLINE_JOINED;
344 return -r;
345 }
346 if (k > 0)
347 return -k;
348 }
349
350 return 0;
351 }
352
/* Reports whether an offline operation is in progress for the file.
 *
 * Returns false if the offline state is OFFLINE_DONE or OFFLINE_JOINED and true for every other
 * state. A full memory barrier is issued before the state is read. */
bool managed_journal_file_is_offlining(ManagedJournalFile *f) {
        assert(f);

        __sync_synchronize();

        return !IN_SET(f->file->offline_state, OFFLINE_DONE, OFFLINE_JOINED);
}
363
/* Takes a managed journal file offline synchronously, closes it and frees the wrapper.
 *
 * Passing NULL is allowed and does nothing. When built with gcrypt, a final tag is appended to sealed,
 * writable files first. A post-change timer that is still enabled is dispatched one last time before
 * the timer is released.
 *
 * Returns NULL for a NULL argument, otherwise the result of mfree(f). */
ManagedJournalFile* managed_journal_file_close(ManagedJournalFile *f) {
        JournalFile *jf;

        if (!f)
                return NULL;

        jf = f->file;

#if HAVE_GCRYPT
        /* Write the final tag */
        if (JOURNAL_HEADER_SEALED(jf->header) && journal_file_writable(jf)) {
                int r;

                r = journal_file_append_tag(jf);
                if (r < 0)
                        log_error_errno(r, "Failed to append tag when closing journal: %m");
        }
#endif

        if (jf->post_change_timer) {
                /* Flush a pending change notification before the timer goes away. */
                if (sd_event_source_get_enabled(jf->post_change_timer, NULL) > 0)
                        journal_file_post_change(jf);

                sd_event_source_disable_unref(jf->post_change_timer);
        }

        /* Best-effort: the file is closed regardless of whether offlining worked. */
        (void) managed_journal_file_set_offline(f, true);

        journal_file_close(jf);

        return mfree(f);
}
392
/* Opens a journal file and wraps it in a newly allocated ManagedJournalFile.
 *
 * Before anything else, all files queued in deferred_closes are closed via
 * managed_journal_file_close(). The remaining arguments are handed to journal_file_open() as is; if
 * template is given, its underlying JournalFile is passed as the template.
 *
 * On success 0 is returned and *ret is set to the new object. On failure a negative errno-style value
 * is returned (-ENOMEM if the wrapper cannot be allocated) and *ret is left untouched. */
int managed_journal_file_open(
                int fd,
                const char *fname,
                int open_flags,
                JournalFileFlags file_flags,
                mode_t mode,
                uint64_t compress_threshold_bytes,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                Set *deferred_closes,
                ManagedJournalFile *template,
                ManagedJournalFile **ret) {

        _cleanup_free_ ManagedJournalFile *mf = NULL;
        JournalFile *template_file;
        int r;

        /* Finish all closes that were deferred earlier. */
        set_clear_with_destructor(deferred_closes, managed_journal_file_close);

        mf = new0(ManagedJournalFile, 1);
        if (!mf)
                return -ENOMEM;

        template_file = template ? template->file : NULL;

        r = journal_file_open(
                        fd,
                        fname,
                        open_flags,
                        file_flags,
                        mode,
                        compress_threshold_bytes,
                        metrics,
                        mmap_cache,
                        template_file,
                        &mf->file);
        if (r < 0)
                return r; /* mf is released by _cleanup_free_ */

        *ret = TAKE_PTR(mf);

        return 0;
}
423
424
/* Starts closing a managed journal file, deferring the actual close if possible.
 *
 * If deferred_closes is given and the file can be added to it, an asynchronous offline is dispatched
 * (best-effort) and the file stays in the set to be closed later. If no set is given, or adding to it
 * fails, the file is closed right away with managed_journal_file_close().
 *
 * Returns NULL when the close was deferred, otherwise the result of managed_journal_file_close(). */
ManagedJournalFile* managed_journal_file_initiate_close(ManagedJournalFile *f, Set *deferred_closes) {
        int r;

        assert(f);

        if (deferred_closes) {
                r = set_put(deferred_closes, f);
                if (r >= 0) {
                        (void) managed_journal_file_set_offline(f, false);
                        return NULL;
                }

                /* Include the reason (%m) in the message; without it the error code passed to
                 * log_debug_errno() never shows up in the log text. */
                log_debug_errno(r, "Failed to add file to deferred close set, closing immediately: %m");
        }

        return managed_journal_file_close(f);
}
442
/* Rotates a journal file: archives the current one and opens a fresh one in its place.
 *
 * *f is archived with journal_file_archive(); if that fails its error is returned and *f is left
 * alone. Otherwise a new file is opened at the path reported by journal_file_archive(), with the same
 * open flags and mode as the old file and the old file as template. Regardless of whether opening
 * worked, closing of the old file is initiated and *f is replaced by the new file, which is NULL if
 * opening failed.
 *
 * Returns the result of managed_journal_file_open(), or the negative error of journal_file_archive(). */
int managed_journal_file_rotate(
                ManagedJournalFile **f,
                MMapCache *mmap_cache,
                JournalFileFlags file_flags,
                uint64_t compress_threshold_bytes,
                Set *deferred_closes) {

        _cleanup_free_ char *fname = NULL;
        ManagedJournalFile *old_file, *new_file = NULL;
        int r;

        assert(f);
        assert(*f);

        old_file = *f;

        r = journal_file_archive(old_file->file, &fname);
        if (r < 0)
                return r;

        r = managed_journal_file_open(
                        /* fd= */ -1,
                        fname,
                        old_file->file->open_flags,
                        file_flags,
                        old_file->file->mode,
                        compress_threshold_bytes,
                        /* metrics= */ NULL,
                        mmap_cache,
                        deferred_closes,
                        /* template= */ old_file,
                        &new_file);

        /* The old file goes away in either case; on failure the caller is left with *f == NULL. */
        managed_journal_file_initiate_close(old_file, deferred_closes);
        *f = new_file;

        return r;
}
479
/* Opens a journal file, replacing it with a new one if it turns out to be unusable.
 *
 * The file is opened with managed_journal_file_open(). If that fails with one of a fixed set of errors
 * and the file was opened for writing with O_CREAT and its name ends in ".journal", the file is moved
 * out of the way with journal_file_dispose() and the open is attempted exactly one more time.
 *
 * Returns the result of the (last) managed_journal_file_open() call, or the negative error of
 * journal_file_dispose(). */
int managed_journal_file_open_reliably(
                const char *fname,
                int open_flags,
                JournalFileFlags file_flags,
                mode_t mode,
                uint64_t compress_threshold_bytes,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                Set *deferred_closes,
                ManagedJournalFile *template,
                ManagedJournalFile **ret) {

        int r;

        r = managed_journal_file_open(
                        -1,
                        fname,
                        open_flags,
                        file_flags,
                        mode,
                        compress_threshold_bytes,
                        metrics,
                        mmap_cache,
                        deferred_closes,
                        template,
                        ret);

        /* Success and all errors not listed here are passed through unchanged. */
        if (!IN_SET(r,
                    -EBADMSG,         /* Corrupted */
                    -ENODATA,         /* Truncated */
                    -EHOSTDOWN,       /* Other machine */
                    -EPROTONOSUPPORT, /* Incompatible feature */
                    -EBUSY,           /* Unclean shutdown */
                    -ESHUTDOWN,       /* Already archived */
                    -EIO,             /* IO error, including SIGBUS on mmap */
                    -EIDRM,           /* File has been deleted */
                    -ETXTBSY))        /* File is from the future */
                return r;

        /* Only replace files that we were asked to open writable and to create if missing, and that
         * carry the regular journal suffix. */
        if ((open_flags & O_ACCMODE) == O_RDONLY ||
            !(open_flags & O_CREAT) ||
            !endswith(fname, ".journal"))
                return r;

        /* The file is corrupted. Rotate it away and try it again (but only once) */
        log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);

        r = journal_file_dispose(AT_FDCWD, fname);
        if (r < 0)
                return r;

        return managed_journal_file_open(
                        -1,
                        fname,
                        open_flags,
                        file_flags,
                        mode,
                        compress_threshold_bytes,
                        metrics,
                        mmap_cache,
                        deferred_closes,
                        template,
                        ret);
}
527