1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2021, Microsoft Corporation.
4 *
5 * Authors:
6 * Beau Belgrave <beaub@linux.microsoft.com>
7 */
8
9 #include <linux/bitmap.h>
10 #include <linux/cdev.h>
11 #include <linux/hashtable.h>
12 #include <linux/list.h>
13 #include <linux/io.h>
14 #include <linux/uio.h>
15 #include <linux/ioctl.h>
16 #include <linux/jhash.h>
17 #include <linux/refcount.h>
18 #include <linux/trace_events.h>
19 #include <linux/tracefs.h>
20 #include <linux/types.h>
21 #include <linux/uaccess.h>
22 #include <linux/highmem.h>
23 #include <linux/init.h>
24 #include <linux/user_events.h>
25 #include "trace_dynevent.h"
26 #include "trace_output.h"
27 #include "trace.h"
28
29 #define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
30
31 #define FIELD_DEPTH_TYPE 0
32 #define FIELD_DEPTH_NAME 1
33 #define FIELD_DEPTH_SIZE 2
34
35 /* Limit how long an event name plus its args may be within the subsystem. */
36 #define MAX_EVENT_DESC 512
37 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
38 #define MAX_FIELD_ARRAY_SIZE 1024
39
40 /*
41 * Internal bits (kernel side only) to keep track of connected probes:
42 * These are used when status is requested in text form about an event. These
43 * bits are compared against an internal byte on the event to determine which
44 * probes to print out to the user.
45 *
46 * These do not reflect the mapped bytes between the user and kernel space.
47 */
48 #define EVENT_STATUS_FTRACE BIT(0)
49 #define EVENT_STATUS_PERF BIT(1)
50 #define EVENT_STATUS_OTHER BIT(7)
51
52 /*
53 * User register flags are not allowed yet; keep them here until we are
54 * ready to expose them to the user ABI.
55 */
56 enum user_reg_flag {
57 /* Event will not delete upon last reference closing */
58 USER_EVENT_REG_PERSIST = 1U << 0,
59
60 /* This value or above is currently non-ABI */
61 USER_EVENT_REG_MAX = 1U << 1,
62 };
63
64 /*
65 * Stores the system name, tables, and locks for a group of events. This
66 * allows isolation for events by various means.
67 */
68 struct user_event_group {
69 char *system_name;
70 struct hlist_node node;
71 struct mutex reg_mutex;
72 DECLARE_HASHTABLE(register_table, 8);
73 };
74
75 /* Group for init_user_ns mapping, top-most group */
76 static struct user_event_group *init_group;
77
78 /* Max allowed events for the whole system */
79 static unsigned int max_user_events = 32768;
80
81 /* Current number of events on the whole system */
82 static unsigned int current_user_events;
83
84 /*
85 * Stores per-event properties. As users register events
86 * within a file, a user_event might be created if it does not
87 * already exist. These are globally used and their lifetime
88 * is tied to the refcnt member. These cannot go away until the
89 * refcnt reaches one.
90 */
91 struct user_event {
92 struct user_event_group *group;
93 struct tracepoint tracepoint;
94 struct trace_event_call call;
95 struct trace_event_class class;
96 struct dyn_event devent;
97 struct hlist_node node;
98 struct list_head fields;
99 struct list_head validators;
100 struct work_struct put_work;
101 refcount_t refcnt;
102 int min_size;
103 int reg_flags;
104 char status;
105 };
106
107 /*
108 * Stores per-mm/event properties that enable an address to be
109 * updated properly for each task. As tasks are forked, we use
110 * these to track enablement sites that are tied to an event.
111 */
112 struct user_event_enabler {
113 struct list_head mm_enablers_link;
114 struct user_event *event;
115 unsigned long addr;
116
117 /* Track enable bit, flags, etc. Aligned for bitops. */
118 unsigned long values;
119 };
120
121 /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
122 #define ENABLE_VAL_BIT_MASK 0x3F
123
124 /* Bit 6 is for faulting status of enablement */
125 #define ENABLE_VAL_FAULTING_BIT 6
126
127 /* Bit 7 is for freeing status of enablement */
128 #define ENABLE_VAL_FREEING_BIT 7
129
130 /* Bit 8 is for marking 32-bit on 64-bit */
131 #define ENABLE_VAL_32_ON_64_BIT 8
132
133 #define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
134
135 /* Only duplicate the bit and compat values */
136 #define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
137
138 #define ENABLE_BITOPS(e) (&(e)->values)
139
140 #define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
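/*
 * Layout sketch of enabler->values (not a new ABI, just a summary of the
 * masks above): bits 0-5 hold the user-requested enable bit, bit 6 is the
 * in-kernel "fault pending" flag, bit 7 the "being freed" flag, and bit 8
 * marks a 32-bit enable word registered on a 64-bit kernel. For example, a
 * registration asking for enable_bit 9 on a 4-byte word (on 64-bit) ends
 * up with values == 0x109: ENABLE_BIT() == 9 and the compat bit set.
 */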
141
142 /* Used for asynchronous faulting in of pages */
143 struct user_event_enabler_fault {
144 struct work_struct work;
145 struct user_event_mm *mm;
146 struct user_event_enabler *enabler;
147 int attempt;
148 };
149
150 static struct kmem_cache *fault_cache;
151
152 /* Global list of memory descriptors using user_events */
153 static LIST_HEAD(user_event_mms);
154 static DEFINE_SPINLOCK(user_event_mms_lock);
155
156 /*
157 * Stores per-file event references. As users register events
158 * within a file, this structure is modified and freed via RCU.
159 * The lifetime of this struct is tied to the lifetime of the file.
160 * These are not shared and only accessible by the file that created it.
161 */
162 struct user_event_refs {
163 struct rcu_head rcu;
164 int count;
165 struct user_event *events[];
166 };
167
168 struct user_event_file_info {
169 struct user_event_group *group;
170 struct user_event_refs *refs;
171 };
172
173 #define VALIDATOR_ENSURE_NULL (1 << 0)
174 #define VALIDATOR_REL (1 << 1)
175
176 struct user_event_validator {
177 struct list_head user_event_link;
178 int offset;
179 int flags;
180 };
181
182 static inline void align_addr_bit(unsigned long *addr, int *bit,
183 unsigned long *flags)
184 {
185 if (IS_ALIGNED(*addr, sizeof(long))) {
186 #ifdef __BIG_ENDIAN
187 /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
188 if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
189 *bit += 32;
190 #endif
191 return;
192 }
193
194 *addr = ALIGN_DOWN(*addr, sizeof(long));
195
196 /*
197 * We only support 32 and 64 bit values. The only time we need
198 * to align is a 32 bit value on a 64 bit kernel, which on LE
199 * is always 32 bits, and on BE requires no change when unaligned.
200 */
201 #ifdef __LITTLE_ENDIAN
202 *bit += 32;
203 #endif
204 }
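/*
 * Worked example (little-endian, 64-bit kernel, hypothetical addresses):
 * a 32-bit enable word at uaddr 0x1004 with bit 3 requested is not
 * long-aligned, so it is rewritten as the long at 0x1000 with bit 3 + 32.
 * Both forms address the same physical bit; set_bit()/clear_bit() in
 * user_event_enabler_write() operate on whole longs.
 */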
205
206 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
207 void *tpdata, bool *faulted);
208
209 static int user_event_parse(struct user_event_group *group, char *name,
210 char *args, char *flags,
211 struct user_event **newuser, int reg_flags);
212
213 static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
214 static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
215 static void user_event_mm_put(struct user_event_mm *mm);
216 static int destroy_user_event(struct user_event *user);
217
218 static u32 user_event_key(char *name)
219 {
220 return jhash(name, strlen(name), 0);
221 }
222
223 static struct user_event *user_event_get(struct user_event *user)
224 {
225 refcount_inc(&user->refcnt);
226
227 return user;
228 }
229
230 static void delayed_destroy_user_event(struct work_struct *work)
231 {
232 struct user_event *user = container_of(
233 work, struct user_event, put_work);
234
235 mutex_lock(&event_mutex);
236
237 if (!refcount_dec_and_test(&user->refcnt))
238 goto out;
239
240 if (destroy_user_event(user)) {
241 /*
242 * The only reason this would fail here is if we cannot
243 * update the visibility of the event. In this case the
244 * event stays in the hashtable, waiting for someone to
245 * attempt to delete it later.
246 */
247 pr_warn("user_events: Unable to delete event\n");
248 refcount_set(&user->refcnt, 1);
249 }
250 out:
251 mutex_unlock(&event_mutex);
252 }
253
254 static void user_event_put(struct user_event *user, bool locked)
255 {
256 bool delete;
257
258 if (unlikely(!user))
259 return;
260
261 /*
262 * When the event is not enabled for auto-delete there will always
263 * be at least 1 reference to the event. During the event creation
264 * we initially set the refcnt to 2 to achieve this. In those cases
265 * the caller must acquire event_mutex and after decrement check if
266 * the refcnt is 1, meaning this is the last reference. When auto
267 * delete is enabled, there will only be 1 ref, i.e. the refcnt will
268 * only be set to 1 during creation to allow the below checks to go
269 * through upon the last put. The last put must always be done with
270 * the event mutex held.
271 */
272 if (!locked) {
273 lockdep_assert_not_held(&event_mutex);
274 delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
275 } else {
276 lockdep_assert_held(&event_mutex);
277 delete = refcount_dec_and_test(&user->refcnt);
278 }
279
280 if (!delete)
281 return;
282
283 /*
284 * We now have the event_mutex in all cases, which ensures that
285 * no new references will be taken until event_mutex is released.
286 * New references come through find_user_event(), which requires
287 * the event_mutex to be held.
288 */
289
290 if (user->reg_flags & USER_EVENT_REG_PERSIST) {
291 /* We should not get here when persist flag is set */
292 pr_alert("BUG: Auto-delete engaged on persistent event\n");
293 goto out;
294 }
295
296 /*
297 * Unfortunately we have to attempt the actual destroy in a work
298 * queue. This is because not all cases handle a trace_event_call
299 * being removed within the class->reg() operation for unregister.
300 */
301 INIT_WORK(&user->put_work, delayed_destroy_user_event);
302
303 /*
304 * Since the event is still in the hashtable, we have to re-inc
305 * the ref count to 1. This count will be decremented and checked
306 * in the work queue to ensure it's still the last ref. This is
307 * needed because a user-process could register the same event in
308 * between the time of event_mutex release and the work queue
309 * running the delayed destroy. If we removed the item now from
310 * the hashtable, this would result in a timing window where a
311 * user process would fail a register because the trace_event_call
312 * register would fail in the tracing layers.
313 */
314 refcount_set(&user->refcnt, 1);
315
316 if (WARN_ON_ONCE(!schedule_work(&user->put_work))) {
317 /*
318 * If we fail we must wait for an admin to attempt delete or
319 * another register/close of the event, whichever is first.
320 */
321 pr_warn("user_events: Unable to queue delayed destroy\n");
322 }
323 out:
324 /* If we didn't hold event_mutex before, ensure we unlock it here */
325 if (!locked)
326 mutex_unlock(&event_mutex);
327 }
328
329 static void user_event_group_destroy(struct user_event_group *group)
330 {
331 kfree(group->system_name);
332 kfree(group);
333 }
334
335 static char *user_event_group_system_name(void)
336 {
337 char *system_name;
338 int len = sizeof(USER_EVENTS_SYSTEM) + 1;
339
340 system_name = kmalloc(len, GFP_KERNEL);
341
342 if (!system_name)
343 return NULL;
344
345 snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
346
347 return system_name;
348 }
349
350 static struct user_event_group *current_user_event_group(void)
351 {
352 return init_group;
353 }
354
355 static struct user_event_group *user_event_group_create(void)
356 {
357 struct user_event_group *group;
358
359 group = kzalloc(sizeof(*group), GFP_KERNEL);
360
361 if (!group)
362 return NULL;
363
364 group->system_name = user_event_group_system_name();
365
366 if (!group->system_name)
367 goto error;
368
369 mutex_init(&group->reg_mutex);
370 hash_init(group->register_table);
371
372 return group;
373 error:
374 if (group)
375 user_event_group_destroy(group);
376
377 return NULL;
378 };
379
380 static void user_event_enabler_destroy(struct user_event_enabler *enabler,
381 bool locked)
382 {
383 list_del_rcu(&enabler->mm_enablers_link);
384
385 /* No longer tracking the event via the enabler */
386 user_event_put(enabler->event, locked);
387
388 kfree(enabler);
389 }
390
391 static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr,
392 int attempt)
393 {
394 bool unlocked;
395 int ret;
396
397 /*
398 * Normally this is low, ensure that it cannot be taken advantage of by
399 * bad user processes to cause excessive looping.
400 */
401 if (attempt > 10)
402 return -EFAULT;
403
404 mmap_read_lock(mm->mm);
405
406 /* Ensure MM has tasks, cannot use after exit_mm() */
407 if (refcount_read(&mm->tasks) == 0) {
408 ret = -ENOENT;
409 goto out;
410 }
411
412 ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
413 &unlocked);
414 out:
415 mmap_read_unlock(mm->mm);
416
417 return ret;
418 }
419
420 static int user_event_enabler_write(struct user_event_mm *mm,
421 struct user_event_enabler *enabler,
422 bool fixup_fault, int *attempt);
423
424 static void user_event_enabler_fault_fixup(struct work_struct *work)
425 {
426 struct user_event_enabler_fault *fault = container_of(
427 work, struct user_event_enabler_fault, work);
428 struct user_event_enabler *enabler = fault->enabler;
429 struct user_event_mm *mm = fault->mm;
430 unsigned long uaddr = enabler->addr;
431 int attempt = fault->attempt;
432 int ret;
433
434 ret = user_event_mm_fault_in(mm, uaddr, attempt);
435
436 if (ret && ret != -ENOENT) {
437 struct user_event *user = enabler->event;
438
439 pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
440 mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
441 }
442
443 /* Prevent state changes from racing */
444 mutex_lock(&event_mutex);
445
446 /* User asked for enabler to be removed during fault */
447 if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
448 user_event_enabler_destroy(enabler, true);
449 goto out;
450 }
451
452 /*
453 * If we managed to get the page, re-issue the write. We do not
454 * want to get into a possible infinite loop, which is why we only
455 * attempt again directly if the page came in. If we couldn't get
456 * the page here, then we will try again the next time the event is
457 * enabled/disabled.
458 */
459 clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
460
461 if (!ret) {
462 mmap_read_lock(mm->mm);
463 user_event_enabler_write(mm, enabler, true, &attempt);
464 mmap_read_unlock(mm->mm);
465 }
466 out:
467 mutex_unlock(&event_mutex);
468
469 /* In all cases we no longer need the mm or fault */
470 user_event_mm_put(mm);
471 kmem_cache_free(fault_cache, fault);
472 }
473
474 static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
475 struct user_event_enabler *enabler,
476 int attempt)
477 {
478 struct user_event_enabler_fault *fault;
479
480 fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
481
482 if (!fault)
483 return false;
484
485 INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
486 fault->mm = user_event_mm_get(mm);
487 fault->enabler = enabler;
488 fault->attempt = attempt;
489
490 /* Don't try to queue in again while we have a pending fault */
491 set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
492
493 if (!schedule_work(&fault->work)) {
494 /* Allow another attempt later */
495 clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
496
497 user_event_mm_put(mm);
498 kmem_cache_free(fault_cache, fault);
499
500 return false;
501 }
502
503 return true;
504 }
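/*
 * Note on the FAULTING bit handshake: it is set here before the work is
 * scheduled and cleared either in user_event_enabler_fault_fixup() or just
 * above when scheduling fails. While it is set, user_event_enabler_write()
 * returns -EBUSY so the same enabler is not queued twice.
 */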
505
506 static int user_event_enabler_write(struct user_event_mm *mm,
507 struct user_event_enabler *enabler,
508 bool fixup_fault, int *attempt)
509 {
510 unsigned long uaddr = enabler->addr;
511 unsigned long *ptr;
512 struct page *page;
513 void *kaddr;
514 int bit = ENABLE_BIT(enabler);
515 int ret;
516
517 lockdep_assert_held(&event_mutex);
518 mmap_assert_locked(mm->mm);
519
520 *attempt += 1;
521
522 /* Ensure MM has tasks, cannot use after exit_mm() */
523 if (refcount_read(&mm->tasks) == 0)
524 return -ENOENT;
525
526 if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) ||
527 test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
528 return -EBUSY;
529
530 align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
531
532 ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
533 &page, NULL);
534
535 if (unlikely(ret <= 0)) {
536 if (!fixup_fault)
537 return -EFAULT;
538
539 if (!user_event_enabler_queue_fault(mm, enabler, *attempt))
540 pr_warn("user_events: Unable to queue fault handler\n");
541
542 return -EFAULT;
543 }
544
545 kaddr = kmap_local_page(page);
546 ptr = kaddr + (uaddr & ~PAGE_MASK);
547
548 /* Update bit atomically, user tracers must be atomic as well */
549 if (enabler->event && enabler->event->status)
550 set_bit(bit, ptr);
551 else
552 clear_bit(bit, ptr);
553
554 kunmap_local(kaddr);
555 unpin_user_pages_dirty_lock(&page, 1, true);
556
557 return 0;
558 }
559
560 static bool user_event_enabler_exists(struct user_event_mm *mm,
561 unsigned long uaddr, unsigned char bit)
562 {
563 struct user_event_enabler *enabler;
564
565 list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
566 if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit)
567 return true;
568 }
569
570 return false;
571 }
572
573 static void user_event_enabler_update(struct user_event *user)
574 {
575 struct user_event_enabler *enabler;
576 struct user_event_mm *next;
577 struct user_event_mm *mm;
578 int attempt;
579
580 lockdep_assert_held(&event_mutex);
581
582 /*
583 * We need to build a one-shot list of all the mms that have an
584 * enabler for the user_event passed in. This list is only valid
585 * while holding the event_mutex. The only reason for this is due
586 * to the global mm list being RCU protected and we use methods
587 * which can wait (mmap_read_lock and pin_user_pages_remote).
588 *
589 * NOTE: user_event_mm_get_all() increments the ref count of each
590 * mm that is added to the list to prevent removal timing windows.
591 * We must always put each mm after they are used, which may wait.
592 */
593 mm = user_event_mm_get_all(user);
594
595 while (mm) {
596 next = mm->next;
597 mmap_read_lock(mm->mm);
598
599 list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
600 if (enabler->event == user) {
601 attempt = 0;
602 user_event_enabler_write(mm, enabler, true, &attempt);
603 }
604 }
605
606 mmap_read_unlock(mm->mm);
607 user_event_mm_put(mm);
608 mm = next;
609 }
610 }
611
612 static bool user_event_enabler_dup(struct user_event_enabler *orig,
613 struct user_event_mm *mm)
614 {
615 struct user_event_enabler *enabler;
616
617 /* Skip pending frees */
618 if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
619 return true;
620
621 enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT);
622
623 if (!enabler)
624 return false;
625
626 enabler->event = user_event_get(orig->event);
627 enabler->addr = orig->addr;
628
629 /* Only dup part of value (ignore future flags, etc) */
630 enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
631
632 /* Enablers not exposed yet, RCU not required */
633 list_add(&enabler->mm_enablers_link, &mm->enablers);
634
635 return true;
636 }
637
638 static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm)
639 {
640 refcount_inc(&mm->refcnt);
641
642 return mm;
643 }
644
645 static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
646 {
647 struct user_event_mm *found = NULL;
648 struct user_event_enabler *enabler;
649 struct user_event_mm *mm;
650
651 /*
652 * We use the mm->next field to build a one-shot list from the global
653 * RCU protected list. To build this list the event_mutex must be held.
654 * This lets us build a list without requiring allocs that could fail
655 * when user based events are most wanted for diagnostics.
656 */
657 lockdep_assert_held(&event_mutex);
658
659 /*
660 * We do not want to block fork/exec while enablements are being
661 * updated, so we use RCU to walk the current tasks that have used
662 * user_events ABI for 1 or more events. Each enabler found in each
663 * task that matches the event being updated has a write to reflect
664 * the kernel state back into the process. Waits/faults must not occur
665 * during this. So we scan the list under RCU for all the mm that have
666 * the event within it. This is needed because mmap_read_lock() can wait.
667 * Each user mm returned has a ref inc to handle remove RCU races.
668 */
669 rcu_read_lock();
670
671 list_for_each_entry_rcu(mm, &user_event_mms, mms_link) {
672 list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) {
673 if (enabler->event == user) {
674 mm->next = found;
675 found = user_event_mm_get(mm);
676 break;
677 }
678 }
679 }
680
681 rcu_read_unlock();
682
683 return found;
684 }
685
686 static struct user_event_mm *user_event_mm_alloc(struct task_struct *t)
687 {
688 struct user_event_mm *user_mm;
689
690 user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
691
692 if (!user_mm)
693 return NULL;
694
695 user_mm->mm = t->mm;
696 INIT_LIST_HEAD(&user_mm->enablers);
697 refcount_set(&user_mm->refcnt, 1);
698 refcount_set(&user_mm->tasks, 1);
699
700 /*
701 * The lifetime of the memory descriptor can slightly outlast
702 * the task lifetime if a ref to the user_event_mm is taken
703 * between list_del_rcu() and call_rcu(). Therefore we need
704 * to take a reference to it to ensure it can live this long
705 * under this corner case. This can also occur in clones that
706 * outlast the parent.
707 */
708 mmgrab(user_mm->mm);
709
710 return user_mm;
711 }
712
713 static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t)
714 {
715 unsigned long flags;
716
717 spin_lock_irqsave(&user_event_mms_lock, flags);
718 list_add_rcu(&user_mm->mms_link, &user_event_mms);
719 spin_unlock_irqrestore(&user_event_mms_lock, flags);
720
721 t->user_event_mm = user_mm;
722 }
723
724 static struct user_event_mm *current_user_event_mm(void)
725 {
726 struct user_event_mm *user_mm = current->user_event_mm;
727
728 if (user_mm)
729 goto inc;
730
731 user_mm = user_event_mm_alloc(current);
732
733 if (!user_mm)
734 goto error;
735
736 user_event_mm_attach(user_mm, current);
737 inc:
738 refcount_inc(&user_mm->refcnt);
739 error:
740 return user_mm;
741 }
742
743 static void user_event_mm_destroy(struct user_event_mm *mm)
744 {
745 struct user_event_enabler *enabler, *next;
746
747 list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link)
748 user_event_enabler_destroy(enabler, false);
749
750 mmdrop(mm->mm);
751 kfree(mm);
752 }
753
754 static void user_event_mm_put(struct user_event_mm *mm)
755 {
756 if (mm && refcount_dec_and_test(&mm->refcnt))
757 user_event_mm_destroy(mm);
758 }
759
760 static void delayed_user_event_mm_put(struct work_struct *work)
761 {
762 struct user_event_mm *mm;
763
764 mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork);
765 user_event_mm_put(mm);
766 }
767
768 void user_event_mm_remove(struct task_struct *t)
769 {
770 struct user_event_mm *mm;
771 unsigned long flags;
772
773 might_sleep();
774
775 mm = t->user_event_mm;
776 t->user_event_mm = NULL;
777
778 /* Clone will increment the tasks, only remove if last clone */
779 if (!refcount_dec_and_test(&mm->tasks))
780 return;
781
782 /* Remove the mm from the list, so it can no longer be enabled */
783 spin_lock_irqsave(&user_event_mms_lock, flags);
784 list_del_rcu(&mm->mms_link);
785 spin_unlock_irqrestore(&user_event_mms_lock, flags);
786
787 /*
788 * We need to wait for currently occurring writes to stop within
789 * the mm. This is required since exit_mm() snaps the current rss
790 * stats and clears them. On the final mmdrop(), check_mm() will
791 * report a bug if these increment.
792 *
793 * All writes/pins are done under mmap_read lock, take the write
794 * lock to ensure in-progress faults have completed. Faults that
795 * are pending but yet to run will check the task count and skip
796 * the fault since the mm is going away.
797 */
798 mmap_write_lock(mm->mm);
799 mmap_write_unlock(mm->mm);
800
801 /*
802 * Put for mm must be done after RCU delay to handle new refs in
803 * between the list_del_rcu() and now. This ensures any get refs
804 * during rcu_read_lock() are accounted for during list removal.
805 *
806 * CPU A | CPU B
807 * ---------------------------------------------------------------
808 * user_event_mm_remove() | rcu_read_lock();
809 * list_del_rcu() | list_for_each_entry_rcu();
810 * call_rcu() | refcount_inc();
811 * . | rcu_read_unlock();
812 * schedule_work() | .
813 * user_event_mm_put() | .
814 *
815 * mmdrop() cannot be called in the softirq context of call_rcu()
816 * so we use a work queue after call_rcu() to run within.
817 */
818 INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
819 queue_rcu_work(system_wq, &mm->put_rwork);
820 }
821
822 void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
823 {
824 struct user_event_mm *mm = user_event_mm_alloc(t);
825 struct user_event_enabler *enabler;
826
827 if (!mm)
828 return;
829
830 rcu_read_lock();
831
832 list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) {
833 if (!user_event_enabler_dup(enabler, mm))
834 goto error;
835 }
836
837 rcu_read_unlock();
838
839 user_event_mm_attach(mm, t);
840 return;
841 error:
842 rcu_read_unlock();
843 user_event_mm_destroy(mm);
844 }
845
846 static bool current_user_event_enabler_exists(unsigned long uaddr,
847 unsigned char bit)
848 {
849 struct user_event_mm *user_mm = current_user_event_mm();
850 bool exists;
851
852 if (!user_mm)
853 return false;
854
855 exists = user_event_enabler_exists(user_mm, uaddr, bit);
856
857 user_event_mm_put(user_mm);
858
859 return exists;
860 }
861
862 static struct user_event_enabler
863 *user_event_enabler_create(struct user_reg *reg, struct user_event *user,
864 int *write_result)
865 {
866 struct user_event_enabler *enabler;
867 struct user_event_mm *user_mm;
868 unsigned long uaddr = (unsigned long)reg->enable_addr;
869 int attempt = 0;
870
871 user_mm = current_user_event_mm();
872
873 if (!user_mm)
874 return NULL;
875
876 enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT);
877
878 if (!enabler)
879 goto out;
880
881 enabler->event = user;
882 enabler->addr = uaddr;
883 enabler->values = reg->enable_bit;
884
885 #if BITS_PER_LONG >= 64
886 if (reg->enable_size == 4)
887 set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
888 #endif
889
890 retry:
891 /* Prevents state changes from racing with new enablers */
892 mutex_lock(&event_mutex);
893
894 /* Attempt to reflect the current state within the process */
895 mmap_read_lock(user_mm->mm);
896 *write_result = user_event_enabler_write(user_mm, enabler, false,
897 &attempt);
898 mmap_read_unlock(user_mm->mm);
899
900 /*
901 * If the write works, then we will track the enabler. A ref to the
902 * underlying user_event is held by the enabler to prevent it going
903 * away while the enabler is still in use by a process. The ref is
904 * removed when the enabler is destroyed. This means an event cannot
905 * be forcefully deleted from the system until all tasks using it
906 * exit or run exec(), which includes forks and clones.
907 */
908 if (!*write_result) {
909 user_event_get(user);
910 list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers);
911 }
912
913 mutex_unlock(&event_mutex);
914
915 if (*write_result) {
916 /* Attempt to fault-in and retry if it worked */
917 if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
918 goto retry;
919
920 kfree(enabler);
921 enabler = NULL;
922 }
923 out:
924 user_event_mm_put(user_mm);
925
926 return enabler;
927 }
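/*
 * For illustration only (user-space side, not part of this file): a process
 * typically keeps a local word such as "int enabled = 0;", passes its
 * address, size and bit via struct user_reg and then checks that bit before
 * writing. A minimal sketch, assuming the documented user_events ABI:
 *
 *     struct user_reg reg = { .size = sizeof(reg) };
 *     reg.enable_bit = 0;
 *     reg.enable_size = sizeof(enabled);
 *     reg.enable_addr = (__u64)(unsigned long)&enabled;
 *     // name_args etc. omitted; register via ioctl(data_fd, DIAG_IOCSREG, &reg)
 *     // then emit writes only while (enabled & 1) is set
 *
 * The write in user_event_enabler_write() is what flips that bit from the
 * kernel side.
 */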
928
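/*
 * A persistent event keeps a self reference for its whole lifetime (see
 * user_event_parse() starting the refcount at 2 instead of 1), so "last
 * reference" means refcnt == 1 there and refcnt == 0 otherwise.
 */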
929 static __always_inline __must_check
930 bool user_event_last_ref(struct user_event *user)
931 {
932 int last = 0;
933
934 if (user->reg_flags & USER_EVENT_REG_PERSIST)
935 last = 1;
936
937 return refcount_read(&user->refcnt) == last;
938 }
939
940 static __always_inline __must_check
941 size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
942 {
943 size_t ret;
944
945 pagefault_disable();
946
947 ret = copy_from_iter_nocache(addr, bytes, i);
948
949 pagefault_enable();
950
951 return ret;
952 }
953
954 static struct list_head *user_event_get_fields(struct trace_event_call *call)
955 {
956 struct user_event *user = (struct user_event *)call->data;
957
958 return &user->fields;
959 }
960
961 /*
962 * Parses a register command for user_events
963 * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
964 *
965 * Example event named 'test' with a 20 char 'msg' field followed by an
966 * unsigned int 'id' field:
967 * test char[20] msg;unsigned int id
968 *
969 * NOTE: Offsets are from the user data perspective, they are not from the
970 * trace_entry/buffer perspective. We automatically add the common properties
971 * sizes to the offset for the user.
972 *
973 * Upon success user_event has its ref count increased by 1.
974 */
975 static int user_event_parse_cmd(struct user_event_group *group,
976 char *raw_command, struct user_event **newuser,
977 int reg_flags)
978 {
979 char *name = raw_command;
980 char *args = strpbrk(name, " ");
981 char *flags;
982
983 if (args)
984 *args++ = '\0';
985
986 flags = strpbrk(name, ":");
987
988 if (flags)
989 *flags++ = '\0';
990
991 return user_event_parse(group, name, args, flags, newuser, reg_flags);
992 }
993
994 static int user_field_array_size(const char *type)
995 {
996 const char *start = strchr(type, '[');
997 char val[8];
998 char *bracket;
999 int size = 0;
1000
1001 if (start == NULL)
1002 return -EINVAL;
1003
1004 if (strscpy(val, start + 1, sizeof(val)) <= 0)
1005 return -EINVAL;
1006
1007 bracket = strchr(val, ']');
1008
1009 if (!bracket)
1010 return -EINVAL;
1011
1012 *bracket = '\0';
1013
1014 if (kstrtouint(val, 0, &size))
1015 return -EINVAL;
1016
1017 if (size > MAX_FIELD_ARRAY_SIZE)
1018 return -EINVAL;
1019
1020 return size;
1021 }
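/*
 * Example: for the type "char[20]" the text after '[' is copied into val
 * ("20]"), the ']' is replaced with a terminator and kstrtouint() yields
 * 20, which is within MAX_FIELD_ARRAY_SIZE and therefore returned.
 */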
1022
1023 static int user_field_size(const char *type)
1024 {
1025 /* long is not allowed from a user, since it's ambiguous in size */
1026 if (strcmp(type, "s64") == 0)
1027 return sizeof(s64);
1028 if (strcmp(type, "u64") == 0)
1029 return sizeof(u64);
1030 if (strcmp(type, "s32") == 0)
1031 return sizeof(s32);
1032 if (strcmp(type, "u32") == 0)
1033 return sizeof(u32);
1034 if (strcmp(type, "int") == 0)
1035 return sizeof(int);
1036 if (strcmp(type, "unsigned int") == 0)
1037 return sizeof(unsigned int);
1038 if (strcmp(type, "s16") == 0)
1039 return sizeof(s16);
1040 if (strcmp(type, "u16") == 0)
1041 return sizeof(u16);
1042 if (strcmp(type, "short") == 0)
1043 return sizeof(short);
1044 if (strcmp(type, "unsigned short") == 0)
1045 return sizeof(unsigned short);
1046 if (strcmp(type, "s8") == 0)
1047 return sizeof(s8);
1048 if (strcmp(type, "u8") == 0)
1049 return sizeof(u8);
1050 if (strcmp(type, "char") == 0)
1051 return sizeof(char);
1052 if (strcmp(type, "unsigned char") == 0)
1053 return sizeof(unsigned char);
1054 if (str_has_prefix(type, "char["))
1055 return user_field_array_size(type);
1056 if (str_has_prefix(type, "unsigned char["))
1057 return user_field_array_size(type);
1058 if (str_has_prefix(type, "__data_loc "))
1059 return sizeof(u32);
1060 if (str_has_prefix(type, "__rel_loc "))
1061 return sizeof(u32);
1062
1063 /* Unknown basic type, error */
1064 return -EINVAL;
1065 }
1066
1067 static void user_event_destroy_validators(struct user_event *user)
1068 {
1069 struct user_event_validator *validator, *next;
1070 struct list_head *head = &user->validators;
1071
1072 list_for_each_entry_safe(validator, next, head, user_event_link) {
1073 list_del(&validator->user_event_link);
1074 kfree(validator);
1075 }
1076 }
1077
1078 static void user_event_destroy_fields(struct user_event *user)
1079 {
1080 struct ftrace_event_field *field, *next;
1081 struct list_head *head = &user->fields;
1082
1083 list_for_each_entry_safe(field, next, head, link) {
1084 list_del(&field->link);
1085 kfree(field);
1086 }
1087 }
1088
1089 static int user_event_add_field(struct user_event *user, const char *type,
1090 const char *name, int offset, int size,
1091 int is_signed, int filter_type)
1092 {
1093 struct user_event_validator *validator;
1094 struct ftrace_event_field *field;
1095 int validator_flags = 0;
1096
1097 field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT);
1098
1099 if (!field)
1100 return -ENOMEM;
1101
1102 if (str_has_prefix(type, "__data_loc "))
1103 goto add_validator;
1104
1105 if (str_has_prefix(type, "__rel_loc ")) {
1106 validator_flags |= VALIDATOR_REL;
1107 goto add_validator;
1108 }
1109
1110 goto add_field;
1111
1112 add_validator:
1113 if (strstr(type, "char") != NULL)
1114 validator_flags |= VALIDATOR_ENSURE_NULL;
1115
1116 validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT);
1117
1118 if (!validator) {
1119 kfree(field);
1120 return -ENOMEM;
1121 }
1122
1123 validator->flags = validator_flags;
1124 validator->offset = offset;
1125
1126 /* Want sequential access when validating */
1127 list_add_tail(&validator->user_event_link, &user->validators);
1128
1129 add_field:
1130 field->type = type;
1131 field->name = name;
1132 field->offset = offset;
1133 field->size = size;
1134 field->is_signed = is_signed;
1135 field->filter_type = filter_type;
1136
1137 if (filter_type == FILTER_OTHER)
1138 field->filter_type = filter_assign_type(type);
1139
1140 list_add(&field->link, &user->fields);
1141
1142 /*
1143 * Min size required from user writes; this does not include
1144 * the size of trace_entry (common fields).
1145 */
1146 user->min_size = (offset + size) - sizeof(struct trace_entry);
1147
1148 return 0;
1149 }
1150
1151 /*
1152 * Parses the values of a field within the description
1153 * Format: type name [size]
1154 */
1155 static int user_event_parse_field(char *field, struct user_event *user,
1156 u32 *offset)
1157 {
1158 char *part, *type, *name;
1159 u32 depth = 0, saved_offset = *offset;
1160 int len, size = -EINVAL;
1161 bool is_struct = false;
1162
1163 field = skip_spaces(field);
1164
1165 if (*field == '\0')
1166 return 0;
1167
1168 /* Handle types that have a space within */
1169 len = str_has_prefix(field, "unsigned ");
1170 if (len)
1171 goto skip_next;
1172
1173 len = str_has_prefix(field, "struct ");
1174 if (len) {
1175 is_struct = true;
1176 goto skip_next;
1177 }
1178
1179 len = str_has_prefix(field, "__data_loc unsigned ");
1180 if (len)
1181 goto skip_next;
1182
1183 len = str_has_prefix(field, "__data_loc ");
1184 if (len)
1185 goto skip_next;
1186
1187 len = str_has_prefix(field, "__rel_loc unsigned ");
1188 if (len)
1189 goto skip_next;
1190
1191 len = str_has_prefix(field, "__rel_loc ");
1192 if (len)
1193 goto skip_next;
1194
1195 goto parse;
1196 skip_next:
1197 type = field;
1198 field = strpbrk(field + len, " ");
1199
1200 if (field == NULL)
1201 return -EINVAL;
1202
1203 *field++ = '\0';
1204 depth++;
1205 parse:
1206 name = NULL;
1207
1208 while ((part = strsep(&field, " ")) != NULL) {
1209 switch (depth++) {
1210 case FIELD_DEPTH_TYPE:
1211 type = part;
1212 break;
1213 case FIELD_DEPTH_NAME:
1214 name = part;
1215 break;
1216 case FIELD_DEPTH_SIZE:
1217 if (!is_struct)
1218 return -EINVAL;
1219
1220 if (kstrtou32(part, 10, &size))
1221 return -EINVAL;
1222 break;
1223 default:
1224 return -EINVAL;
1225 }
1226 }
1227
1228 if (depth < FIELD_DEPTH_SIZE || !name)
1229 return -EINVAL;
1230
1231 if (depth == FIELD_DEPTH_SIZE)
1232 size = user_field_size(type);
1233
1234 if (size == 0)
1235 return -EINVAL;
1236
1237 if (size < 0)
1238 return size;
1239
1240 *offset = saved_offset + size;
1241
1242 return user_event_add_field(user, type, name, saved_offset, size,
1243 type[0] != 'u', FILTER_OTHER);
1244 }
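/*
 * Example walk-through: the field text "unsigned int id" matches the
 * "unsigned " prefix, so type becomes "unsigned int" and parsing resumes at
 * depth FIELD_DEPTH_NAME with name "id"; the size is then derived from
 * user_field_size(). A struct field such as "struct mybuf buf 32"
 * (hypothetical name) instead supplies its size explicitly as the third
 * token.
 */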
1245
1246 static int user_event_parse_fields(struct user_event *user, char *args)
1247 {
1248 char *field;
1249 u32 offset = sizeof(struct trace_entry);
1250 int ret = -EINVAL;
1251
1252 if (args == NULL)
1253 return 0;
1254
1255 while ((field = strsep(&args, ";")) != NULL) {
1256 ret = user_event_parse_field(field, user, &offset);
1257
1258 if (ret)
1259 break;
1260 }
1261
1262 return ret;
1263 }
1264
1265 static struct trace_event_fields user_event_fields_array[1];
1266
1267 static const char *user_field_format(const char *type)
1268 {
1269 if (strcmp(type, "s64") == 0)
1270 return "%lld";
1271 if (strcmp(type, "u64") == 0)
1272 return "%llu";
1273 if (strcmp(type, "s32") == 0)
1274 return "%d";
1275 if (strcmp(type, "u32") == 0)
1276 return "%u";
1277 if (strcmp(type, "int") == 0)
1278 return "%d";
1279 if (strcmp(type, "unsigned int") == 0)
1280 return "%u";
1281 if (strcmp(type, "s16") == 0)
1282 return "%d";
1283 if (strcmp(type, "u16") == 0)
1284 return "%u";
1285 if (strcmp(type, "short") == 0)
1286 return "%d";
1287 if (strcmp(type, "unsigned short") == 0)
1288 return "%u";
1289 if (strcmp(type, "s8") == 0)
1290 return "%d";
1291 if (strcmp(type, "u8") == 0)
1292 return "%u";
1293 if (strcmp(type, "char") == 0)
1294 return "%d";
1295 if (strcmp(type, "unsigned char") == 0)
1296 return "%u";
1297 if (strstr(type, "char[") != NULL)
1298 return "%s";
1299
1300 /* Unknown, likely a struct; allowed, treat as 64-bit */
1301 return "%llu";
1302 }
1303
1304 static bool user_field_is_dyn_string(const char *type, const char **str_func)
1305 {
1306 if (str_has_prefix(type, "__data_loc ")) {
1307 *str_func = "__get_str";
1308 goto check;
1309 }
1310
1311 if (str_has_prefix(type, "__rel_loc ")) {
1312 *str_func = "__get_rel_str";
1313 goto check;
1314 }
1315
1316 return false;
1317 check:
1318 return strstr(type, "char") != NULL;
1319 }
1320
1321 #define LEN_OR_ZERO (len ? len - pos : 0)
1322 static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
1323 char *buf, int len, bool *colon)
1324 {
1325 int pos = 0, i = *iout;
1326
1327 *colon = false;
1328
1329 for (; i < argc; ++i) {
1330 if (i != *iout)
1331 pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
1332
1333 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
1334
1335 if (strchr(argv[i], ';')) {
1336 ++i;
1337 *colon = true;
1338 break;
1339 }
1340 }
1341
1342 /* Actual set, advance i */
1343 if (len != 0)
1344 *iout = i;
1345
1346 return pos + 1;
1347 }
1348
1349 static int user_field_set_string(struct ftrace_event_field *field,
1350 char *buf, int len, bool colon)
1351 {
1352 int pos = 0;
1353
1354 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
1355 pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
1356 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
1357
1358 if (str_has_prefix(field->type, "struct "))
1359 pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size);
1360
1361 if (colon)
1362 pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
1363
1364 return pos + 1;
1365 }
1366
1367 static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
1368 {
1369 struct ftrace_event_field *field;
1370 struct list_head *head = &user->fields;
1371 int pos = 0, depth = 0;
1372 const char *str_func;
1373
1374 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
1375
1376 list_for_each_entry_reverse(field, head, link) {
1377 if (depth != 0)
1378 pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
1379
1380 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
1381 field->name, user_field_format(field->type));
1382
1383 depth++;
1384 }
1385
1386 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
1387
1388 list_for_each_entry_reverse(field, head, link) {
1389 if (user_field_is_dyn_string(field->type, &str_func))
1390 pos += snprintf(buf + pos, LEN_OR_ZERO,
1391 ", %s(%s)", str_func, field->name);
1392 else
1393 pos += snprintf(buf + pos, LEN_OR_ZERO,
1394 ", REC->%s", field->name);
1395 }
1396
1397 return pos + 1;
1398 }
1399 #undef LEN_OR_ZERO
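/*
 * Example of the generated format: the registration
 * "test char[20] msg;unsigned int id" produces a print_fmt of
 * "\"msg=%s id=%u\", REC->msg, REC->id", similar to what static trace
 * events expose in their format files.
 */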
1400
1401 static int user_event_create_print_fmt(struct user_event *user)
1402 {
1403 char *print_fmt;
1404 int len;
1405
1406 len = user_event_set_print_fmt(user, NULL, 0);
1407
1408 print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT);
1409
1410 if (!print_fmt)
1411 return -ENOMEM;
1412
1413 user_event_set_print_fmt(user, print_fmt, len);
1414
1415 user->call.print_fmt = print_fmt;
1416
1417 return 0;
1418 }
1419
1420 static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
1421 int flags,
1422 struct trace_event *event)
1423 {
1424 return print_event_fields(iter, event);
1425 }
1426
1427 static struct trace_event_functions user_event_funcs = {
1428 .trace = user_event_print_trace,
1429 };
1430
1431 static int user_event_set_call_visible(struct user_event *user, bool visible)
1432 {
1433 int ret;
1434 const struct cred *old_cred;
1435 struct cred *cred;
1436
1437 cred = prepare_creds();
1438
1439 if (!cred)
1440 return -ENOMEM;
1441
1442 /*
1443 * While by default tracefs is locked down, systems can be configured
1444 * to allow user_event files to be less locked down. The extreme case
1445 * being "other" has read/write access to user_events_data/status.
1446 *
1447 * When not locked down, processes may not have permissions to
1448 * add/remove calls themselves to tracefs. We need to temporarily
1449 * switch to root file permission to allow for this scenario.
1450 */
1451 cred->fsuid = GLOBAL_ROOT_UID;
1452
1453 old_cred = override_creds(cred);
1454
1455 if (visible)
1456 ret = trace_add_event_call(&user->call);
1457 else
1458 ret = trace_remove_event_call(&user->call);
1459
1460 revert_creds(old_cred);
1461 put_cred(cred);
1462
1463 return ret;
1464 }
1465
1466 static int destroy_user_event(struct user_event *user)
1467 {
1468 int ret = 0;
1469
1470 lockdep_assert_held(&event_mutex);
1471
1472 /* Must destroy fields before call removal */
1473 user_event_destroy_fields(user);
1474
1475 ret = user_event_set_call_visible(user, false);
1476
1477 if (ret)
1478 return ret;
1479
1480 dyn_event_remove(&user->devent);
1481 hash_del(&user->node);
1482
1483 user_event_destroy_validators(user);
1484 kfree(user->call.print_fmt);
1485 kfree(EVENT_NAME(user));
1486 kfree(user);
1487
1488 if (current_user_events > 0)
1489 current_user_events--;
1490 else
1491 pr_alert("BUG: Bad current_user_events\n");
1492
1493 return ret;
1494 }
1495
1496 static struct user_event *find_user_event(struct user_event_group *group,
1497 char *name, u32 *outkey)
1498 {
1499 struct user_event *user;
1500 u32 key = user_event_key(name);
1501
1502 *outkey = key;
1503
1504 hash_for_each_possible(group->register_table, user, node, key)
1505 if (!strcmp(EVENT_NAME(user), name))
1506 return user_event_get(user);
1507
1508 return NULL;
1509 }
1510
1511 static int user_event_validate(struct user_event *user, void *data, int len)
1512 {
1513 struct list_head *head = &user->validators;
1514 struct user_event_validator *validator;
1515 void *pos, *end = data + len;
1516 u32 loc, offset, size;
1517
1518 list_for_each_entry(validator, head, user_event_link) {
1519 pos = data + validator->offset;
1520
1521 /* Already done min_size check, no bounds check here */
1522 loc = *(u32 *)pos;
1523 offset = loc & 0xffff;
1524 size = loc >> 16;
1525
1526 if (likely(validator->flags & VALIDATOR_REL))
1527 pos += offset + sizeof(loc);
1528 else
1529 pos = data + offset;
1530
1531 pos += size;
1532
1533 if (unlikely(pos > end))
1534 return -EFAULT;
1535
1536 if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
1537 if (unlikely(*(char *)(pos - 1) != '\0'))
1538 return -EFAULT;
1539 }
1540
1541 return 0;
1542 }
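/*
 * Worked example of the loc word checked above (hypothetical values): for a
 * "__data_loc char[] msg" field, loc == (size << 16) | offset. With size 5
 * and offset 12 the loop verifies that data + 12 + 5 stays within the write
 * and, because the type contains "char", that the final byte is a
 * terminating '\0'. __rel_loc fields are handled the same way except the
 * offset is taken relative to the location just past the loc word itself.
 */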
1543
1544 /*
1545 * Writes the user supplied payload out to a trace file.
1546 */
1547 static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
1548 void *tpdata, bool *faulted)
1549 {
1550 struct trace_event_file *file;
1551 struct trace_entry *entry;
1552 struct trace_event_buffer event_buffer;
1553 size_t size = sizeof(*entry) + i->count;
1554
1555 file = (struct trace_event_file *)tpdata;
1556
1557 if (!file ||
1558 !(file->flags & EVENT_FILE_FL_ENABLED) ||
1559 trace_trigger_soft_disabled(file))
1560 return;
1561
1562 /* Allocates and fills trace_entry, + 1 of this is data payload */
1563 entry = trace_event_buffer_reserve(&event_buffer, file, size);
1564
1565 if (unlikely(!entry))
1566 return;
1567
1568 if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i)))
1569 goto discard;
1570
1571 if (!list_empty(&user->validators) &&
1572 unlikely(user_event_validate(user, entry, size)))
1573 goto discard;
1574
1575 trace_event_buffer_commit(&event_buffer);
1576
1577 return;
1578 discard:
1579 *faulted = true;
1580 __trace_event_discard_commit(event_buffer.buffer,
1581 event_buffer.event);
1582 }
1583
1584 #ifdef CONFIG_PERF_EVENTS
1585 /*
1586 * Writes the user supplied payload out to perf ring buffer.
1587 */
1588 static void user_event_perf(struct user_event *user, struct iov_iter *i,
1589 void *tpdata, bool *faulted)
1590 {
1591 struct hlist_head *perf_head;
1592
1593 perf_head = this_cpu_ptr(user->call.perf_events);
1594
1595 if (perf_head && !hlist_empty(perf_head)) {
1596 struct trace_entry *perf_entry;
1597 struct pt_regs *regs;
1598 size_t size = sizeof(*perf_entry) + i->count;
1599 int context;
1600
1601 perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
1602 &regs, &context);
1603
1604 if (unlikely(!perf_entry))
1605 return;
1606
1607 perf_fetch_caller_regs(regs);
1608
1609 if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i)))
1610 goto discard;
1611
1612 if (!list_empty(&user->validators) &&
1613 unlikely(user_event_validate(user, perf_entry, size)))
1614 goto discard;
1615
1616 perf_trace_buf_submit(perf_entry, size, context,
1617 user->call.event.type, 1, regs,
1618 perf_head, NULL);
1619
1620 return;
1621 discard:
1622 *faulted = true;
1623 perf_swevent_put_recursion_context(context);
1624 }
1625 }
1626 #endif
1627
1628 /*
1629 * Update the enabled bit among all user processes.
1630 */
1631 static void update_enable_bit_for(struct user_event *user)
1632 {
1633 struct tracepoint *tp = &user->tracepoint;
1634 char status = 0;
1635
1636 if (atomic_read(&tp->key.enabled) > 0) {
1637 struct tracepoint_func *probe_func_ptr;
1638 user_event_func_t probe_func;
1639
1640 rcu_read_lock_sched();
1641
1642 probe_func_ptr = rcu_dereference_sched(tp->funcs);
1643
1644 if (probe_func_ptr) {
1645 do {
1646 probe_func = probe_func_ptr->func;
1647
1648 if (probe_func == user_event_ftrace)
1649 status |= EVENT_STATUS_FTRACE;
1650 #ifdef CONFIG_PERF_EVENTS
1651 else if (probe_func == user_event_perf)
1652 status |= EVENT_STATUS_PERF;
1653 #endif
1654 else
1655 status |= EVENT_STATUS_OTHER;
1656 } while ((++probe_func_ptr)->func);
1657 }
1658
1659 rcu_read_unlock_sched();
1660 }
1661
1662 user->status = status;
1663
1664 user_event_enabler_update(user);
1665 }
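/*
 * Example of the resulting status byte: with both a tracefs consumer and a
 * perf attachment connected it becomes EVENT_STATUS_FTRACE |
 * EVENT_STATUS_PERF; when the last probe is unregistered it drops back to 0
 * and user_event_enabler_update() clears the corresponding user-space bits.
 */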
1666
1667 /*
1668 * Register callback for our events from tracing sub-systems.
1669 */
1670 static int user_event_reg(struct trace_event_call *call,
1671 enum trace_reg type,
1672 void *data)
1673 {
1674 struct user_event *user = (struct user_event *)call->data;
1675 int ret = 0;
1676
1677 if (!user)
1678 return -ENOENT;
1679
1680 switch (type) {
1681 case TRACE_REG_REGISTER:
1682 ret = tracepoint_probe_register(call->tp,
1683 call->class->probe,
1684 data);
1685 if (!ret)
1686 goto inc;
1687 break;
1688
1689 case TRACE_REG_UNREGISTER:
1690 tracepoint_probe_unregister(call->tp,
1691 call->class->probe,
1692 data);
1693 goto dec;
1694
1695 #ifdef CONFIG_PERF_EVENTS
1696 case TRACE_REG_PERF_REGISTER:
1697 ret = tracepoint_probe_register(call->tp,
1698 call->class->perf_probe,
1699 data);
1700 if (!ret)
1701 goto inc;
1702 break;
1703
1704 case TRACE_REG_PERF_UNREGISTER:
1705 tracepoint_probe_unregister(call->tp,
1706 call->class->perf_probe,
1707 data);
1708 goto dec;
1709
1710 case TRACE_REG_PERF_OPEN:
1711 case TRACE_REG_PERF_CLOSE:
1712 case TRACE_REG_PERF_ADD:
1713 case TRACE_REG_PERF_DEL:
1714 break;
1715 #endif
1716 }
1717
1718 return ret;
1719 inc:
1720 user_event_get(user);
1721 update_enable_bit_for(user);
1722 return 0;
1723 dec:
1724 update_enable_bit_for(user);
1725 user_event_put(user, true);
1726 return 0;
1727 }
1728
1729 static int user_event_create(const char *raw_command)
1730 {
1731 struct user_event_group *group;
1732 struct user_event *user;
1733 char *name;
1734 int ret;
1735
1736 if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
1737 return -ECANCELED;
1738
1739 raw_command += USER_EVENTS_PREFIX_LEN;
1740 raw_command = skip_spaces(raw_command);
1741
1742 name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT);
1743
1744 if (!name)
1745 return -ENOMEM;
1746
1747 group = current_user_event_group();
1748
1749 if (!group) {
1750 kfree(name);
1751 return -ENOENT;
1752 }
1753
1754 mutex_lock(&group->reg_mutex);
1755
1756 /* Dyn events persist, otherwise they would cleanup immediately */
1757 ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
1758
1759 if (!ret)
1760 user_event_put(user, false);
1761
1762 mutex_unlock(&group->reg_mutex);
1763
1764 if (ret)
1765 kfree(name);
1766
1767 return ret;
1768 }
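/*
 * For illustration (standard dynamic_events usage; the tracefs mount point
 * may differ):
 *
 *     echo 'u:test char[20] msg;unsigned int id' >> \
 *             /sys/kernel/tracing/dynamic_events
 *
 * arrives here, is parsed with USER_EVENT_REG_PERSIST and therefore stays
 * registered until it is explicitly deleted.
 */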
1769
1770 static int user_event_show(struct seq_file *m, struct dyn_event *ev)
1771 {
1772 struct user_event *user = container_of(ev, struct user_event, devent);
1773 struct ftrace_event_field *field;
1774 struct list_head *head;
1775 int depth = 0;
1776
1777 seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
1778
1779 head = trace_get_fields(&user->call);
1780
1781 list_for_each_entry_reverse(field, head, link) {
1782 if (depth == 0)
1783 seq_puts(m, " ");
1784 else
1785 seq_puts(m, "; ");
1786
1787 seq_printf(m, "%s %s", field->type, field->name);
1788
1789 if (str_has_prefix(field->type, "struct "))
1790 seq_printf(m, " %d", field->size);
1791
1792 depth++;
1793 }
1794
1795 seq_puts(m, "\n");
1796
1797 return 0;
1798 }
1799
1800 static bool user_event_is_busy(struct dyn_event *ev)
1801 {
1802 struct user_event *user = container_of(ev, struct user_event, devent);
1803
1804 return !user_event_last_ref(user);
1805 }
1806
1807 static int user_event_free(struct dyn_event *ev)
1808 {
1809 struct user_event *user = container_of(ev, struct user_event, devent);
1810
1811 if (!user_event_last_ref(user))
1812 return -EBUSY;
1813
1814 return destroy_user_event(user);
1815 }
1816
1817 static bool user_field_match(struct ftrace_event_field *field, int argc,
1818 const char **argv, int *iout)
1819 {
1820 char *field_name = NULL, *dyn_field_name = NULL;
1821 bool colon = false, match = false;
1822 int dyn_len, len;
1823
1824 if (*iout >= argc)
1825 return false;
1826
1827 dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
1828 0, &colon);
1829
1830 len = user_field_set_string(field, field_name, 0, colon);
1831
1832 if (dyn_len != len)
1833 return false;
1834
1835 dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
1836 field_name = kmalloc(len, GFP_KERNEL);
1837
1838 if (!dyn_field_name || !field_name)
1839 goto out;
1840
1841 user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
1842 dyn_len, &colon);
1843
1844 user_field_set_string(field, field_name, len, colon);
1845
1846 match = strcmp(dyn_field_name, field_name) == 0;
1847 out:
1848 kfree(dyn_field_name);
1849 kfree(field_name);
1850
1851 return match;
1852 }
1853
1854 static bool user_fields_match(struct user_event *user, int argc,
1855 const char **argv)
1856 {
1857 struct ftrace_event_field *field;
1858 struct list_head *head = &user->fields;
1859 int i = 0;
1860
1861 list_for_each_entry_reverse(field, head, link) {
1862 if (!user_field_match(field, argc, argv, &i))
1863 return false;
1864 }
1865
1866 if (i != argc)
1867 return false;
1868
1869 return true;
1870 }
1871
1872 static bool user_event_match(const char *system, const char *event,
1873 int argc, const char **argv, struct dyn_event *ev)
1874 {
1875 struct user_event *user = container_of(ev, struct user_event, devent);
1876 bool match;
1877
1878 match = strcmp(EVENT_NAME(user), event) == 0 &&
1879 (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
1880
1881 if (match && argc > 0)
1882 match = user_fields_match(user, argc, argv);
1883 else if (match && argc == 0)
1884 match = list_empty(&user->fields);
1885
1886 return match;
1887 }
1888
1889 static struct dyn_event_operations user_event_dops = {
1890 .create = user_event_create,
1891 .show = user_event_show,
1892 .is_busy = user_event_is_busy,
1893 .free = user_event_free,
1894 .match = user_event_match,
1895 };
1896
1897 static int user_event_trace_register(struct user_event *user)
1898 {
1899 int ret;
1900
1901 ret = register_trace_event(&user->call.event);
1902
1903 if (!ret)
1904 return -ENODEV;
1905
1906 ret = user_event_set_call_visible(user, true);
1907
1908 if (ret)
1909 unregister_trace_event(&user->call.event);
1910
1911 return ret;
1912 }
1913
1914 /*
1915 * Parses the event name, arguments and flags then registers if successful.
1916 * The name buffer lifetime is owned by this method for success cases only.
1917 * Upon success the returned user_event has its ref count increased by 1.
1918 */
1919 static int user_event_parse(struct user_event_group *group, char *name,
1920 char *args, char *flags,
1921 struct user_event **newuser, int reg_flags)
1922 {
1923 int ret;
1924 u32 key;
1925 struct user_event *user;
1926 int argc = 0;
1927 char **argv;
1928
1929 /* User register flags are not ready yet */
1930 if (reg_flags != 0 || flags != NULL)
1931 return -EINVAL;
1932
1933 /* Prevent dyn_event from racing */
1934 mutex_lock(&event_mutex);
1935 user = find_user_event(group, name, &key);
1936 mutex_unlock(&event_mutex);
1937
1938 if (user) {
1939 if (args) {
1940 argv = argv_split(GFP_KERNEL, args, &argc);
1941 if (!argv) {
1942 ret = -ENOMEM;
1943 goto error;
1944 }
1945
1946 ret = user_fields_match(user, argc, (const char **)argv);
1947 argv_free(argv);
1948
1949 } else
1950 ret = list_empty(&user->fields);
1951
1952 if (ret) {
1953 *newuser = user;
1954 /*
1955 * Name is allocated by caller, free it since it already exists.
1956 * Caller only worries about failure cases for freeing.
1957 */
1958 kfree(name);
1959 } else {
1960 ret = -EADDRINUSE;
1961 goto error;
1962 }
1963
1964 return 0;
1965 error:
1966 user_event_put(user, false);
1967 return ret;
1968 }
1969
1970 user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
1971
1972 if (!user)
1973 return -ENOMEM;
1974
1975 INIT_LIST_HEAD(&user->class.fields);
1976 INIT_LIST_HEAD(&user->fields);
1977 INIT_LIST_HEAD(&user->validators);
1978
1979 user->group = group;
1980 user->tracepoint.name = name;
1981
1982 ret = user_event_parse_fields(user, args);
1983
1984 if (ret)
1985 goto put_user;
1986
1987 ret = user_event_create_print_fmt(user);
1988
1989 if (ret)
1990 goto put_user;
1991
1992 user->call.data = user;
1993 user->call.class = &user->class;
1994 user->call.name = name;
1995 user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
1996 user->call.tp = &user->tracepoint;
1997 user->call.event.funcs = &user_event_funcs;
1998 user->class.system = group->system_name;
1999
2000 user->class.fields_array = user_event_fields_array;
2001 user->class.get_fields = user_event_get_fields;
2002 user->class.reg = user_event_reg;
2003 user->class.probe = user_event_ftrace;
2004 #ifdef CONFIG_PERF_EVENTS
2005 user->class.perf_probe = user_event_perf;
2006 #endif
2007
2008 mutex_lock(&event_mutex);
2009
2010 if (current_user_events >= max_user_events) {
2011 ret = -EMFILE;
2012 goto put_user_lock;
2013 }
2014
2015 ret = user_event_trace_register(user);
2016
2017 if (ret)
2018 goto put_user_lock;
2019
2020 user->reg_flags = reg_flags;
2021
2022 if (user->reg_flags & USER_EVENT_REG_PERSIST) {
2023 /* Ensure we track self ref and caller ref (2) */
2024 refcount_set(&user->refcnt, 2);
2025 } else {
2026 /* Ensure we track only caller ref (1) */
2027 refcount_set(&user->refcnt, 1);
2028 }
2029
2030 dyn_event_init(&user->devent, &user_event_dops);
2031 dyn_event_add(&user->devent, &user->call);
2032 hash_add(group->register_table, &user->node, key);
2033 current_user_events++;
2034
2035 mutex_unlock(&event_mutex);
2036
2037 *newuser = user;
2038 return 0;
2039 put_user_lock:
2040 mutex_unlock(&event_mutex);
2041 put_user:
2042 user_event_destroy_fields(user);
2043 user_event_destroy_validators(user);
2044 kfree(user->call.print_fmt);
2045 kfree(user);
2046 return ret;
2047 }
2048
2049 /*
2050 * Deletes a previously created event if it is no longer being used.
2051 */
2052 static int delete_user_event(struct user_event_group *group, char *name)
2053 {
2054 u32 key;
2055 struct user_event *user = find_user_event(group, name, &key);
2056
2057 if (!user)
2058 return -ENOENT;
2059
2060 user_event_put(user, true);
2061
2062 if (!user_event_last_ref(user))
2063 return -EBUSY;
2064
2065 return destroy_user_event(user);
2066 }
2067
2068 /*
2069 * Validates the user payload and writes via iterator.
2070 */
2071 static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
2072 {
2073 struct user_event_file_info *info = file->private_data;
2074 struct user_event_refs *refs;
2075 struct user_event *user = NULL;
2076 struct tracepoint *tp;
2077 ssize_t ret = i->count;
2078 int idx;
2079
2080 if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
2081 return -EFAULT;
2082
2083 if (idx < 0)
2084 return -EINVAL;
2085
2086 rcu_read_lock_sched();
2087
2088 refs = rcu_dereference_sched(info->refs);
2089
2090 /*
2091 * The refs->events array is protected by RCU, and new items may be
2092 * added. But the user retrieved from indexing into the events array
2093 * shall be immutable while the file is opened.
2094 */
2095 if (likely(refs && idx < refs->count))
2096 user = refs->events[idx];
2097
2098 rcu_read_unlock_sched();
2099
2100 if (unlikely(user == NULL))
2101 return -ENOENT;
2102
2103 if (unlikely(i->count < user->min_size))
2104 return -EINVAL;
2105
2106 tp = &user->tracepoint;
2107
2108 /*
2109 * It's possible key.enabled becomes disabled after this check; however,
2110 * we don't mind if a few events are included in this condition.
2111 */
2112 if (likely(atomic_read(&tp->key.enabled) > 0)) {
2113 struct tracepoint_func *probe_func_ptr;
2114 user_event_func_t probe_func;
2115 struct iov_iter copy;
2116 void *tpdata;
2117 bool faulted;
2118
2119 if (unlikely(fault_in_iov_iter_readable(i, i->count)))
2120 return -EFAULT;
2121
2122 faulted = false;
2123
2124 rcu_read_lock_sched();
2125
2126 probe_func_ptr = rcu_dereference_sched(tp->funcs);
2127
2128 if (probe_func_ptr) {
2129 do {
2130 copy = *i;
2131 probe_func = probe_func_ptr->func;
2132 tpdata = probe_func_ptr->data;
2133 probe_func(user, &copy, tpdata, &faulted);
2134 } while ((++probe_func_ptr)->func);
2135 }
2136
2137 rcu_read_unlock_sched();
2138
2139 if (unlikely(faulted))
2140 return -EFAULT;
2141 } else
2142 return -EBADF;
2143
2144 return ret;
2145 }
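/*
 * Payload layout implied by the copy_from_iter() of sizeof(int) above: the
 * first 4 bytes of every write are the write_index returned by DIAG_IOCSREG,
 * followed by the raw field data of the event. A hedged userspace sketch
 * (data_fd and the field layout are assumptions, not part of this file):
 *
 *	struct {
 *		int write_index;	// from a prior DIAG_IOCSREG
 *		__u32 count;		// matches a registered "u32 count" field
 *	} payload = { .write_index = reg.write_index, .count = 1 };
 *
 *	if (write(data_fd, &payload, sizeof(payload)) < 0)
 *		perror("write");
 *
 * The write returns -EINVAL when the remaining data is smaller than the
 * event's min_size, and -EBADF when no tracer has enabled the event.
 */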
2146
2147 static int user_events_open(struct inode *node, struct file *file)
2148 {
2149 struct user_event_group *group;
2150 struct user_event_file_info *info;
2151
2152 group = current_user_event_group();
2153
2154 if (!group)
2155 return -ENOENT;
2156
2157 info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT);
2158
2159 if (!info)
2160 return -ENOMEM;
2161
2162 info->group = group;
2163
2164 file->private_data = info;
2165
2166 return 0;
2167 }
2168
2169 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
2170 size_t count, loff_t *ppos)
2171 {
2172 struct iovec iov;
2173 struct iov_iter i;
2174
2175 if (unlikely(*ppos != 0))
2176 return -EFAULT;
2177
2178 if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
2179 count, &iov, &i)))
2180 return -EFAULT;
2181
2182 return user_events_write_core(file, &i);
2183 }
2184
2185 static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
2186 {
2187 return user_events_write_core(kp->ki_filp, i);
2188 }
2189
2190 static int user_events_ref_add(struct user_event_file_info *info,
2191 struct user_event *user)
2192 {
2193 struct user_event_group *group = info->group;
2194 struct user_event_refs *refs, *new_refs;
2195 int i, size, count = 0;
2196
2197 refs = rcu_dereference_protected(info->refs,
2198 lockdep_is_held(&group->reg_mutex));
2199
2200 if (refs) {
2201 count = refs->count;
2202
2203 for (i = 0; i < count; ++i)
2204 if (refs->events[i] == user)
2205 return i;
2206 }
2207
2208 size = struct_size(refs, events, count + 1);
2209
2210 new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT);
2211
2212 if (!new_refs)
2213 return -ENOMEM;
2214
2215 new_refs->count = count + 1;
2216
2217 for (i = 0; i < count; ++i)
2218 new_refs->events[i] = refs->events[i];
2219
2220 new_refs->events[i] = user_event_get(user);
2221
2222 rcu_assign_pointer(info->refs, new_refs);
2223
2224 if (refs)
2225 kfree_rcu(refs, rcu);
2226
2227 return i;
2228 }
2229
2230 static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
2231 {
2232 u32 size;
2233 long ret;
2234
2235 ret = get_user(size, &ureg->size);
2236
2237 if (ret)
2238 return ret;
2239
2240 if (size > PAGE_SIZE)
2241 return -E2BIG;
2242
2243 if (size < offsetofend(struct user_reg, write_index))
2244 return -EINVAL;
2245
2246 ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
2247
2248 if (ret)
2249 return ret;
2250
2251 /* Ensure only valid flags */
2252 if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
2253 return -EINVAL;
2254
2255 /* Ensure supported size */
2256 switch (kreg->enable_size) {
2257 case 4:
2258 /* 32-bit */
2259 break;
2260 #if BITS_PER_LONG >= 64
2261 case 8:
2262 /* 64-bit */
2263 break;
2264 #endif
2265 default:
2266 return -EINVAL;
2267 }
2268
2269 /* Ensure natural alignment */
2270 if (kreg->enable_addr % kreg->enable_size)
2271 return -EINVAL;
2272
2273 /* Ensure bit range for size */
2274 if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1)
2275 return -EINVAL;
2276
2277 /* Ensure accessible */
2278 if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr,
2279 kreg->enable_size))
2280 return -EFAULT;
2281
2282 kreg->size = size;
2283
2284 return 0;
2285 }
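/*
 * Summarizing the validation above with illustrative values: enable_size
 * must be 4 (or 8 on 64-bit kernels), enable_addr must be naturally aligned
 * for that size, and enable_bit must fit within the word. A sketch of
 * values that would pass (variable names are assumptions):
 *
 *	__u32 enabled = 0;				// 4-byte, 4-byte aligned
 *	reg.enable_addr = (__u64)(uintptr_t)&enabled;
 *	reg.enable_size = sizeof(enabled);		// 4
 *	reg.enable_bit  = 31;				// any of 0..31 is valid
 *
 * An enable_size of 2, an unaligned enable_addr, or enable_bit of 32 with a
 * 4-byte word would all be rejected with -EINVAL.
 */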
2286
2287 /*
2288 * Registers a user_event on behalf of a user process.
2289 */
2290 static long user_events_ioctl_reg(struct user_event_file_info *info,
2291 unsigned long uarg)
2292 {
2293 struct user_reg __user *ureg = (struct user_reg __user *)uarg;
2294 struct user_reg reg;
2295 struct user_event *user;
2296 struct user_event_enabler *enabler;
2297 char *name;
2298 long ret;
2299 int write_result;
2300
2301 ret = user_reg_get(ureg, &reg);
2302
2303 if (ret)
2304 return ret;
2305
2306 /*
2307 * Prevent users from using the same address and bit multiple times
2308 * within the same mm address space. This can cause unexpected behavior
2309 * for user processes that is far easier to debug if this is explicitly
2310 * an error upon registering.
2311 */
2312 if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
2313 reg.enable_bit))
2314 return -EADDRINUSE;
2315
2316 name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
2317 MAX_EVENT_DESC);
2318
2319 if (IS_ERR(name)) {
2320 ret = PTR_ERR(name);
2321 return ret;
2322 }
2323
2324 ret = user_event_parse_cmd(info->group, name, &user, reg.flags);
2325
2326 if (ret) {
2327 kfree(name);
2328 return ret;
2329 }
2330
2331 ret = user_events_ref_add(info, user);
2332
2333 /* No longer need parse ref, ref_add either worked or not */
2334 user_event_put(user, false);
2335
2336 /* Positive number is index and valid */
2337 if (ret < 0)
2338 return ret;
2339
2340 /*
2341 * user_events_ref_add succeeded:
2342 * At this point we have a user_event, its lifetime is bound by the
2343 * reference count, not this file. If anything fails, the user_event
2344 * still has a reference until the file is released. During release
2345 * any remaining references (from user_events_ref_add) are decremented.
2346 *
2347 * Attempt to create an enabler, which also has its lifetime tied to
2348 * the event in the same way. Once the task that caused the enabler to be
2349 * created exits or issues exec() then the enablers it has created
2350 * will be destroyed and the ref to the event will be decremented.
2351 */
2352 enabler = user_event_enabler_create(&reg, user, &write_result);
2353
2354 if (!enabler)
2355 return -ENOMEM;
2356
2357 /* Write failed/faulted, give error back to caller */
2358 if (write_result)
2359 return write_result;
2360
2361 put_user((u32)ret, &ureg->write_index);
2362
2363 return 0;
2364 }
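/*
 * A hedged userspace sketch of the registration path above; the tracefs
 * mount point and variable names are assumptions, while struct user_reg and
 * DIAG_IOCSREG come from <linux/user_events.h>:
 *
 *	int data_fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
 *	__u32 enabled = 0;
 *	struct user_reg reg = {
 *		.size = sizeof(reg),
 *		.enable_addr = (__u64)(uintptr_t)&enabled,
 *		.enable_size = sizeof(enabled),
 *		.enable_bit = 31,
 *		.name_args = (__u64)(uintptr_t)"mytest u32 count",
 *	};
 *
 *	if (ioctl(data_fd, DIAG_IOCSREG, &reg) < 0)
 *		perror("DIAG_IOCSREG");
 *	// reg.write_index now identifies this event for writes on data_fd.
 *
 * Registering the same enable_addr/enable_bit pair twice within one mm
 * fails with -EADDRINUSE, as enforced above.
 */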
2365
2366 /*
2367 * Deletes a user_event on behalf of a user process.
2368 */
2369 static long user_events_ioctl_del(struct user_event_file_info *info,
2370 unsigned long uarg)
2371 {
2372 void __user *ubuf = (void __user *)uarg;
2373 char *name;
2374 long ret;
2375
2376 name = strndup_user(ubuf, MAX_EVENT_DESC);
2377
2378 if (IS_ERR(name))
2379 return PTR_ERR(name);
2380
2381 /* event_mutex prevents dyn_event from racing */
2382 mutex_lock(&event_mutex);
2383 ret = delete_user_event(info->group, name);
2384 mutex_unlock(&event_mutex);
2385
2386 kfree(name);
2387
2388 return ret;
2389 }
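/*
 * The argument to DIAG_IOCSDEL is just the event name as a NUL-terminated
 * string, not a struct. A hedged sketch, reusing the data_fd assumed in the
 * registration sketch above:
 *
 *	if (ioctl(data_fd, DIAG_IOCSDEL, "mytest") < 0)
 *		perror("DIAG_IOCSDEL");
 *
 * This fails with -EBUSY while other references (files, enablers, tracers)
 * still hold the event, and with -ENOENT if it does not exist.
 */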
2390
2391 static long user_unreg_get(struct user_unreg __user *ureg,
2392 struct user_unreg *kreg)
2393 {
2394 u32 size;
2395 long ret;
2396
2397 ret = get_user(size, &ureg->size);
2398
2399 if (ret)
2400 return ret;
2401
2402 if (size > PAGE_SIZE)
2403 return -E2BIG;
2404
2405 if (size < offsetofend(struct user_unreg, disable_addr))
2406 return -EINVAL;
2407
2408 ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
2409
2410 /* Ensure no reserved values, since we don't support any yet */
2411 if (kreg->__reserved || kreg->__reserved2)
2412 return -EINVAL;
2413
2414 return ret;
2415 }
2416
2417 static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
2418 unsigned long uaddr, unsigned char bit,
2419 unsigned long flags)
2420 {
2421 struct user_event_enabler enabler;
2422 int result;
2423 int attempt = 0;
2424
2425 memset(&enabler, 0, sizeof(enabler));
2426 enabler.addr = uaddr;
2427 enabler.values = bit | flags;
2428 retry:
2429 /* Prevents state changes from racing with new enablers */
2430 mutex_lock(&event_mutex);
2431
2432 /* Force the bit to be cleared, since no event is attached */
2433 mmap_read_lock(user_mm->mm);
2434 result = user_event_enabler_write(user_mm, &enabler, false, &attempt);
2435 mmap_read_unlock(user_mm->mm);
2436
2437 mutex_unlock(&event_mutex);
2438
2439 if (result) {
2440 /* Attempt to fault-in and retry if it worked */
2441 if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
2442 goto retry;
2443 }
2444
2445 return result;
2446 }
2447
2448 /*
2449 * Unregisters an enablement address/bit within a task/user mm.
2450 */
2451 static long user_events_ioctl_unreg(unsigned long uarg)
2452 {
2453 struct user_unreg __user *ureg = (struct user_unreg __user *)uarg;
2454 struct user_event_mm *mm = current->user_event_mm;
2455 struct user_event_enabler *enabler, *next;
2456 struct user_unreg reg;
2457 unsigned long flags;
2458 long ret;
2459
2460 ret = user_unreg_get(ureg, &reg);
2461
2462 if (ret)
2463 return ret;
2464
2465 if (!mm)
2466 return -ENOENT;
2467
2468 flags = 0;
2469 ret = -ENOENT;
2470
2471 /*
2472 * The freeing and faulting flags are used to indicate if the enabler is in
2473 * use at all. When faulting is set, a page-fault is occurring asynchronously.
2474 * During async fault if freeing is set, the enabler will be destroyed.
2475 * If no async fault is happening, we can destroy it now since we hold
2476 * the event_mutex during these checks.
2477 */
2478 mutex_lock(&event_mutex);
2479
2480 list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) {
2481 if (enabler->addr == reg.disable_addr &&
2482 ENABLE_BIT(enabler) == reg.disable_bit) {
2483 set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
2484
2485 /* We must keep compat flags for the clear */
2486 flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
2487
2488 if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
2489 user_event_enabler_destroy(enabler, true);
2490
2491 /* Removed at least one */
2492 ret = 0;
2493 }
2494 }
2495
2496 mutex_unlock(&event_mutex);
2497
2498 /* Ensure bit is now cleared for user, regardless of event status */
2499 if (!ret)
2500 ret = user_event_mm_clear_bit(mm, reg.disable_addr,
2501 reg.disable_bit, flags);
2502
2503 return ret;
2504 }
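/*
 * A hedged sketch of the matching unregister call for the enabler set up
 * during DIAG_IOCSREG (data_fd and the "enabled" word carry over from the
 * registration sketch above and are assumptions):
 *
 *	struct user_unreg unreg = {
 *		.size = sizeof(unreg),
 *		.disable_bit = 31,
 *		.disable_addr = (__u64)(uintptr_t)&enabled,
 *	};
 *
 *	if (ioctl(data_fd, DIAG_IOCSUNREG, &unreg) < 0)
 *		perror("DIAG_IOCSUNREG");
 *
 * Even if the event itself lives on, the bit at disable_addr is cleared via
 * user_event_mm_clear_bit() once the enabler is removed.
 */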
2505
2506 /*
2507 * Handles the ioctl from user mode to register or alter operations.
2508 */
2509 static long user_events_ioctl(struct file *file, unsigned int cmd,
2510 unsigned long uarg)
2511 {
2512 struct user_event_file_info *info = file->private_data;
2513 struct user_event_group *group = info->group;
2514 long ret = -ENOTTY;
2515
2516 switch (cmd) {
2517 case DIAG_IOCSREG:
2518 mutex_lock(&group->reg_mutex);
2519 ret = user_events_ioctl_reg(info, uarg);
2520 mutex_unlock(&group->reg_mutex);
2521 break;
2522
2523 case DIAG_IOCSDEL:
2524 mutex_lock(&group->reg_mutex);
2525 ret = user_events_ioctl_del(info, uarg);
2526 mutex_unlock(&group->reg_mutex);
2527 break;
2528
2529 case DIAG_IOCSUNREG:
2530 mutex_lock(&group->reg_mutex);
2531 ret = user_events_ioctl_unreg(uarg);
2532 mutex_unlock(&group->reg_mutex);
2533 break;
2534 }
2535
2536 return ret;
2537 }
2538
2539 /*
2540 * Handles the final close of the file from user mode.
2541 */
2542 static int user_events_release(struct inode *node, struct file *file)
2543 {
2544 struct user_event_file_info *info = file->private_data;
2545 struct user_event_group *group;
2546 struct user_event_refs *refs;
2547 int i;
2548
2549 if (!info)
2550 return -EINVAL;
2551
2552 group = info->group;
2553
2554 /*
2555 * Ensure refs cannot change under any situation by taking the
2556 * register mutex during the final freeing of the references.
2557 */
2558 mutex_lock(&group->reg_mutex);
2559
2560 refs = info->refs;
2561
2562 if (!refs)
2563 goto out;
2564
2565 /*
2566 * The lifetime of refs has reached an end; it's tied to this file.
2567 * The underlying user_events are ref counted, and cannot be freed.
2568 * After this decrement, the user_events may be freed elsewhere.
2569 */
2570 for (i = 0; i < refs->count; ++i)
2571 user_event_put(refs->events[i], false);
2572
2573 out:
2574 file->private_data = NULL;
2575
2576 mutex_unlock(&group->reg_mutex);
2577
2578 kfree(refs);
2579 kfree(info);
2580
2581 return 0;
2582 }
2583
2584 static const struct file_operations user_data_fops = {
2585 .open = user_events_open,
2586 .write = user_events_write,
2587 .write_iter = user_events_write_iter,
2588 .unlocked_ioctl = user_events_ioctl,
2589 .release = user_events_release,
2590 };
2591
2592 static void *user_seq_start(struct seq_file *m, loff_t *pos)
2593 {
2594 if (*pos)
2595 return NULL;
2596
2597 return (void *)1;
2598 }
2599
2600 static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
2601 {
2602 ++*pos;
2603 return NULL;
2604 }
2605
2606 static void user_seq_stop(struct seq_file *m, void *p)
2607 {
2608 }
2609
2610 static int user_seq_show(struct seq_file *m, void *p)
2611 {
2612 struct user_event_group *group = m->private;
2613 struct user_event *user;
2614 char status;
2615 int i, active = 0, busy = 0;
2616
2617 if (!group)
2618 return -EINVAL;
2619
2620 mutex_lock(&group->reg_mutex);
2621
2622 hash_for_each(group->register_table, i, user, node) {
2623 status = user->status;
2624
2625 seq_printf(m, "%s", EVENT_NAME(user));
2626
2627 if (status != 0)
2628 seq_puts(m, " #");
2629
2630 if (status != 0) {
2631 seq_puts(m, " Used by");
2632 if (status & EVENT_STATUS_FTRACE)
2633 seq_puts(m, " ftrace");
2634 if (status & EVENT_STATUS_PERF)
2635 seq_puts(m, " perf");
2636 if (status & EVENT_STATUS_OTHER)
2637 seq_puts(m, " other");
2638 busy++;
2639 }
2640
2641 seq_puts(m, "\n");
2642 active++;
2643 }
2644
2645 mutex_unlock(&group->reg_mutex);
2646
2647 seq_puts(m, "\n");
2648 seq_printf(m, "Active: %d\n", active);
2649 seq_printf(m, "Busy: %d\n", busy);
2650
2651 return 0;
2652 }
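/*
 * Based on the seq_printf() calls above, the user_events_status file reads
 * roughly as follows (event names are illustrative):
 *
 *	mytest # Used by ftrace
 *	other_event
 *
 *	Active: 2
 *	Busy: 1
 *
 * "Active" counts registered events, "Busy" counts those with at least one
 * attached probe (ftrace, perf or other).
 */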
2653
2654 static const struct seq_operations user_seq_ops = {
2655 .start = user_seq_start,
2656 .next = user_seq_next,
2657 .stop = user_seq_stop,
2658 .show = user_seq_show,
2659 };
2660
2661 static int user_status_open(struct inode *node, struct file *file)
2662 {
2663 struct user_event_group *group;
2664 int ret;
2665
2666 group = current_user_event_group();
2667
2668 if (!group)
2669 return -ENOENT;
2670
2671 ret = seq_open(file, &user_seq_ops);
2672
2673 if (!ret) {
2674 /* Chain group to seq_file */
2675 struct seq_file *m = file->private_data;
2676
2677 m->private = group;
2678 }
2679
2680 return ret;
2681 }
2682
2683 static const struct file_operations user_status_fops = {
2684 .open = user_status_open,
2685 .read = seq_read,
2686 .llseek = seq_lseek,
2687 .release = seq_release,
2688 };
2689
2690 /*
2691 * Creates a set of tracefs files to allow user mode interactions.
2692 */
2693 static int create_user_tracefs(void)
2694 {
2695 struct dentry *edata, *emmap;
2696
2697 edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
2698 NULL, NULL, &user_data_fops);
2699
2700 if (!edata) {
2701 pr_warn("Could not create tracefs 'user_events_data' entry\n");
2702 goto err;
2703 }
2704
2705 emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ,
2706 NULL, NULL, &user_status_fops);
2707
2708 if (!emmap) {
2709 tracefs_remove(edata);
2710 pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
2711 goto err;
2712 }
2713
2714 return 0;
2715 err:
2716 return -ENODEV;
2717 }
2718
2719 static int set_max_user_events_sysctl(struct ctl_table *table, int write,
2720 void *buffer, size_t *lenp, loff_t *ppos)
2721 {
2722 int ret;
2723
2724 mutex_lock(&event_mutex);
2725
2726 ret = proc_douintvec(table, write, buffer, lenp, ppos);
2727
2728 mutex_unlock(&event_mutex);
2729
2730 return ret;
2731 }
2732
2733 static struct ctl_table user_event_sysctls[] = {
2734 {
2735 .procname = "user_events_max",
2736 .data = &max_user_events,
2737 .maxlen = sizeof(unsigned int),
2738 .mode = 0644,
2739 .proc_handler = set_max_user_events_sysctl,
2740 },
2741 {}
2742 };
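/*
 * With register_sysctl_init("kernel", ...) below, this limit is exposed as
 * /proc/sys/kernel/user_events_max. Raising it only affects future
 * registrations; existing events are untouched, and the handler serializes
 * updates against event_mutex so the limit cannot change mid-registration.
 */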
2743
2744 static int __init trace_events_user_init(void)
2745 {
2746 int ret;
2747
2748 fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
2749
2750 if (!fault_cache)
2751 return -ENOMEM;
2752
2753 init_group = user_event_group_create();
2754
2755 if (!init_group) {
2756 kmem_cache_destroy(fault_cache);
2757 return -ENOMEM;
2758 }
2759
2760 ret = create_user_tracefs();
2761
2762 if (ret) {
2763 pr_warn("user_events could not register with tracefs\n");
2764 user_event_group_destroy(init_group);
2765 kmem_cache_destroy(fault_cache);
2766 init_group = NULL;
2767 return ret;
2768 }
2769
2770 if (dyn_event_register(&user_event_dops))
2771 pr_warn("user_events could not register with dyn_events\n");
2772
2773 register_sysctl_init("kernel", user_event_sysctls);
2774
2775 return 0;
2776 }
2777
2778 fs_initcall(trace_events_user_init);
2779