1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11 
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/sysfs.h>
22 #include <linux/dcache.h>
23 #include <linux/percpu.h>
24 #include <linux/ptrace.h>
25 #include <linux/reboot.h>
26 #include <linux/vmstat.h>
27 #include <linux/device.h>
28 #include <linux/export.h>
29 #include <linux/vmalloc.h>
30 #include <linux/hardirq.h>
31 #include <linux/rculist.h>
32 #include <linux/uaccess.h>
33 #include <linux/syscalls.h>
34 #include <linux/anon_inodes.h>
35 #include <linux/kernel_stat.h>
36 #include <linux/perf_event.h>
37 #include <linux/ftrace_event.h>
38 #include <linux/hw_breakpoint.h>
39 
40 #include "internal.h"
41 
42 #include <asm/irq_regs.h>
43 
44 struct remote_function_call {
45 	struct task_struct	*p;
46 	int			(*func)(void *info);
47 	void			*info;
48 	int			ret;
49 };
50 
51 static void remote_function(void *data)
52 {
53 	struct remote_function_call *tfc = data;
54 	struct task_struct *p = tfc->p;
55 
56 	if (p) {
57 		tfc->ret = -EAGAIN;
58 		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
59 			return;
60 	}
61 
62 	tfc->ret = tfc->func(tfc->info);
63 }
64 
65 /**
66  * task_function_call - call a function on the cpu on which a task runs
67  * @p:		the task to evaluate
68  * @func:	the function to be called
69  * @info:	the function call argument
70  *
71  * Calls the function @func when the task is currently running. This might
72  * be on the current CPU, in which case the function is called directly.
73  *
74  * returns: @func return value, or
75  *	    -ESRCH  - when the process isn't running
76  *	    -EAGAIN - when the process moved away
77  */
78 static int
79 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
80 {
81 	struct remote_function_call data = {
82 		.p	= p,
83 		.func	= func,
84 		.info	= info,
85 		.ret	= -ESRCH, /* No such (running) process */
86 	};
87 
88 	if (task_curr(p))
89 		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
90 
91 	return data.ret;
92 }
93 
94 /**
95  * cpu_function_call - call a function on the cpu
96  * @func:	the function to be called
97  * @info:	the function call argument
98  *
99  * Calls the function @func on the remote cpu.
100  *
101  * returns: @func return value or -ENXIO when the cpu is offline
102  */
103 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
104 {
105 	struct remote_function_call data = {
106 		.p	= NULL,
107 		.func	= func,
108 		.info	= info,
109 		.ret	= -ENXIO, /* No such CPU */
110 	};
111 
112 	smp_call_function_single(cpu, remote_function, &data, 1);
113 
114 	return data.ret;
115 }
116 
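/*
 * All flag bits currently accepted by perf_event_open(); flags outside
 * this mask are rejected by the syscall.
 */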
117 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
118 		       PERF_FLAG_FD_OUTPUT  |\
119 		       PERF_FLAG_PID_CGROUP)
120 
121 /*
122  * branch priv levels that need permission checks
123  */
124 #define PERF_SAMPLE_BRANCH_PERM_PLM \
125 	(PERF_SAMPLE_BRANCH_KERNEL |\
126 	 PERF_SAMPLE_BRANCH_HV)
127 
128 enum event_type_t {
129 	EVENT_FLEXIBLE = 0x1,
130 	EVENT_PINNED = 0x2,
131 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
132 };
133 
134 /*
135  * perf_sched_events : >0 events exist
136  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
137  */
138 struct static_key_deferred perf_sched_events __read_mostly;
139 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
141 
142 static atomic_t nr_mmap_events __read_mostly;
143 static atomic_t nr_comm_events __read_mostly;
144 static atomic_t nr_task_events __read_mostly;
145 
146 static LIST_HEAD(pmus);
147 static DEFINE_MUTEX(pmus_lock);
148 static struct srcu_struct pmus_srcu;
149 
150 /*
151  * perf event paranoia level:
152  *  -1 - not paranoid at all
153  *   0 - disallow raw tracepoint access for unpriv
154  *   1 - disallow cpu events for unpriv
155  *   2 - disallow kernel profiling for unpriv
156  */
157 int sysctl_perf_event_paranoid __read_mostly = 1;
158 
159 /* Minimum for 512 kiB + 1 user control page */
160 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
161 
162 /*
163  * max perf event sample rate
164  */
165 #define DEFAULT_MAX_SAMPLE_RATE 100000
166 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
167 static int max_samples_per_tick __read_mostly =
168 	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
169 
170 int perf_proc_update_handler(struct ctl_table *table, int write,
171 		void __user *buffer, size_t *lenp,
172 		loff_t *ppos)
173 {
174 	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
175 
176 	if (ret || !write)
177 		return ret;
178 
179 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
180 
181 	return 0;
182 }
183 
184 static atomic64_t perf_event_id;
185 
186 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
187 			      enum event_type_t event_type);
188 
189 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
190 			     enum event_type_t event_type,
191 			     struct task_struct *task);
192 
193 static void update_context_time(struct perf_event_context *ctx);
194 static u64 perf_event_time(struct perf_event *event);
195 
196 void __weak perf_event_print_debug(void)	{ }
197 
198 extern __weak const char *perf_pmu_name(void)
199 {
200 	return "pmu";
201 }
202 
203 static inline u64 perf_clock(void)
204 {
205 	return local_clock();
206 }
207 
208 static inline struct perf_cpu_context *
209 __get_cpu_context(struct perf_event_context *ctx)
210 {
211 	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
212 }
213 
214 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
215 			  struct perf_event_context *ctx)
216 {
217 	raw_spin_lock(&cpuctx->ctx.lock);
218 	if (ctx)
219 		raw_spin_lock(&ctx->lock);
220 }
221 
222 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
223 			    struct perf_event_context *ctx)
224 {
225 	if (ctx)
226 		raw_spin_unlock(&ctx->lock);
227 	raw_spin_unlock(&cpuctx->ctx.lock);
228 }
229 
230 #ifdef CONFIG_CGROUP_PERF
231 
232 /*
233  * Must ensure cgroup is pinned (css_get) before calling
234  * this function. In other words, we cannot call this function
235  * if there is no cgroup event for the current CPU context.
236  */
237 static inline struct perf_cgroup *
238 perf_cgroup_from_task(struct task_struct *task)
239 {
240 	return container_of(task_subsys_state(task, perf_subsys_id),
241 			struct perf_cgroup, css);
242 }
243 
244 static inline bool
245 perf_cgroup_match(struct perf_event *event)
246 {
247 	struct perf_event_context *ctx = event->ctx;
248 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
249 
250 	return !event->cgrp || event->cgrp == cpuctx->cgrp;
251 }
252 
253 static inline bool perf_tryget_cgroup(struct perf_event *event)
254 {
255 	return css_tryget(&event->cgrp->css);
256 }
257 
258 static inline void perf_put_cgroup(struct perf_event *event)
259 {
260 	css_put(&event->cgrp->css);
261 }
262 
263 static inline void perf_detach_cgroup(struct perf_event *event)
264 {
265 	perf_put_cgroup(event);
266 	event->cgrp = NULL;
267 }
268 
269 static inline int is_cgroup_event(struct perf_event *event)
270 {
271 	return event->cgrp != NULL;
272 }
273 
274 static inline u64 perf_cgroup_event_time(struct perf_event *event)
275 {
276 	struct perf_cgroup_info *t;
277 
278 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
279 	return t->time;
280 }
281 
282 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
283 {
284 	struct perf_cgroup_info *info;
285 	u64 now;
286 
287 	now = perf_clock();
288 
289 	info = this_cpu_ptr(cgrp->info);
290 
291 	info->time += now - info->timestamp;
292 	info->timestamp = now;
293 }
294 
295 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
296 {
297 	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
298 	if (cgrp_out)
299 		__update_cgrp_time(cgrp_out);
300 }
301 
302 static inline void update_cgrp_time_from_event(struct perf_event *event)
303 {
304 	struct perf_cgroup *cgrp;
305 
306 	/*
307 	 * ensure we access cgroup data only when needed and
308 	 * when we know the cgroup is pinned (css_get)
309 	 */
310 	if (!is_cgroup_event(event))
311 		return;
312 
313 	cgrp = perf_cgroup_from_task(current);
314 	/*
315 	 * Do not update time when cgroup is not active
316 	 */
317 	if (cgrp == event->cgrp)
318 		__update_cgrp_time(event->cgrp);
319 }
320 
321 static inline void
322 perf_cgroup_set_timestamp(struct task_struct *task,
323 			  struct perf_event_context *ctx)
324 {
325 	struct perf_cgroup *cgrp;
326 	struct perf_cgroup_info *info;
327 
328 	/*
329 	 * ctx->lock held by caller
330 	 * ensure we do not access cgroup data
331 	 * unless we have the cgroup pinned (css_get)
332 	 */
333 	if (!task || !ctx->nr_cgroups)
334 		return;
335 
336 	cgrp = perf_cgroup_from_task(task);
337 	info = this_cpu_ptr(cgrp->info);
338 	info->timestamp = ctx->timestamp;
339 }
340 
341 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
342 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
343 
344 /*
345  * reschedule events based on the cgroup constraint of task.
346  *
347  * mode SWOUT : schedule out everything
348  * mode SWIN : schedule in based on cgroup for next
349  */
350 void perf_cgroup_switch(struct task_struct *task, int mode)
351 {
352 	struct perf_cpu_context *cpuctx;
353 	struct pmu *pmu;
354 	unsigned long flags;
355 
356 	/*
357 	 * disable interrupts to avoid getting nr_cgroups
358 	 * changes via __perf_event_disable(). Also
359 	 * avoids preemption.
360 	 */
361 	local_irq_save(flags);
362 
363 	/*
364 	 * we reschedule only in the presence of cgroup
365 	 * constrained events.
366 	 */
367 	rcu_read_lock();
368 
369 	list_for_each_entry_rcu(pmu, &pmus, entry) {
370 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
371 		if (cpuctx->unique_pmu != pmu)
372 			continue; /* ensure we process each cpuctx once */
373 
374 		/*
375 		 * perf_cgroup_events says at least one
376 		 * context on this CPU has cgroup events.
377 		 *
378 		 * ctx->nr_cgroups reports the number of cgroup
379 		 * events for a context.
380 		 */
381 		if (cpuctx->ctx.nr_cgroups > 0) {
382 			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
383 			perf_pmu_disable(cpuctx->ctx.pmu);
384 
385 			if (mode & PERF_CGROUP_SWOUT) {
386 				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
387 				/*
388 				 * must not be done before ctxswout due
389 				 * to event_filter_match() in event_sched_out()
390 				 */
391 				cpuctx->cgrp = NULL;
392 			}
393 
394 			if (mode & PERF_CGROUP_SWIN) {
395 				WARN_ON_ONCE(cpuctx->cgrp);
396 				/*
397 				 * set cgrp before ctxsw in to allow
398 				 * event_filter_match() to not have to pass
399 				 * task around
400 				 */
401 				cpuctx->cgrp = perf_cgroup_from_task(task);
402 				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
403 			}
404 			perf_pmu_enable(cpuctx->ctx.pmu);
405 			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
406 		}
407 	}
408 
409 	rcu_read_unlock();
410 
411 	local_irq_restore(flags);
412 }
413 
414 static inline void perf_cgroup_sched_out(struct task_struct *task,
415 					 struct task_struct *next)
416 {
417 	struct perf_cgroup *cgrp1;
418 	struct perf_cgroup *cgrp2 = NULL;
419 
420 	/*
421 	 * we come here when we know perf_cgroup_events > 0
422 	 */
423 	cgrp1 = perf_cgroup_from_task(task);
424 
425 	/*
426 	 * next is NULL when called from perf_event_enable_on_exec(),
427 	 * which unconditionally causes a cgroup_switch()
428 	 */
429 	if (next)
430 		cgrp2 = perf_cgroup_from_task(next);
431 
432 	/*
433 	 * only schedule out current cgroup events if we know
434 	 * that we are switching to a different cgroup. Otherwise,
435 	 * do not touch the cgroup events.
436 	 */
437 	if (cgrp1 != cgrp2)
438 		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
439 }
440 
441 static inline void perf_cgroup_sched_in(struct task_struct *prev,
442 					struct task_struct *task)
443 {
444 	struct perf_cgroup *cgrp1;
445 	struct perf_cgroup *cgrp2 = NULL;
446 
447 	/*
448 	 * we come here when we know perf_cgroup_events > 0
449 	 */
450 	cgrp1 = perf_cgroup_from_task(task);
451 
452 	/* prev can never be NULL */
453 	cgrp2 = perf_cgroup_from_task(prev);
454 
455 	/*
456 	 * only need to schedule in cgroup events if we are changing
457 	 * cgroup during the context switch. Cgroup events were not
458 	 * scheduled out during the context switch if that was not the case.
459 	 */
460 	if (cgrp1 != cgrp2)
461 		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
462 }
463 
464 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
465 				      struct perf_event_attr *attr,
466 				      struct perf_event *group_leader)
467 {
468 	struct perf_cgroup *cgrp;
469 	struct cgroup_subsys_state *css;
470 	struct file *file;
471 	int ret = 0, fput_needed;
472 
473 	file = fget_light(fd, &fput_needed);
474 	if (!file)
475 		return -EBADF;
476 
477 	css = cgroup_css_from_dir(file, perf_subsys_id);
478 	if (IS_ERR(css)) {
479 		ret = PTR_ERR(css);
480 		goto out;
481 	}
482 
483 	cgrp = container_of(css, struct perf_cgroup, css);
484 	event->cgrp = cgrp;
485 
486 	/* must be done before we fput() the file */
487 	if (!perf_tryget_cgroup(event)) {
488 		event->cgrp = NULL;
489 		ret = -ENOENT;
490 		goto out;
491 	}
492 
493 	/*
494 	 * all events in a group must monitor
495 	 * the same cgroup because a task belongs
496 	 * to only one perf cgroup at a time
497 	 */
498 	if (group_leader && group_leader->cgrp != cgrp) {
499 		perf_detach_cgroup(event);
500 		ret = -EINVAL;
501 	}
502 out:
503 	fput_light(file, fput_needed);
504 	return ret;
505 }
506 
507 static inline void
508 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
509 {
510 	struct perf_cgroup_info *t;
511 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
512 	event->shadow_ctx_time = now - t->timestamp;
513 }
514 
515 static inline void
516 perf_cgroup_defer_enabled(struct perf_event *event)
517 {
518 	/*
519 	 * when the current task's perf cgroup does not match
520 	 * the event's, we need to remember to call the
521 	 * perf_cgroup_mark_enabled() function the first time a task with
522 	 * a matching perf cgroup is scheduled in.
523 	 */
524 	if (is_cgroup_event(event) && !perf_cgroup_match(event))
525 		event->cgrp_defer_enabled = 1;
526 }
527 
528 static inline void
529 perf_cgroup_mark_enabled(struct perf_event *event,
530 			 struct perf_event_context *ctx)
531 {
532 	struct perf_event *sub;
533 	u64 tstamp = perf_event_time(event);
534 
535 	if (!event->cgrp_defer_enabled)
536 		return;
537 
538 	event->cgrp_defer_enabled = 0;
539 
540 	event->tstamp_enabled = tstamp - event->total_time_enabled;
541 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
542 		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
543 			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
544 			sub->cgrp_defer_enabled = 0;
545 		}
546 	}
547 }
548 #else /* !CONFIG_CGROUP_PERF */
549 
550 static inline bool
551 perf_cgroup_match(struct perf_event *event)
552 {
553 	return true;
554 }
555 
556 static inline void perf_detach_cgroup(struct perf_event *event)
557 {}
558 
559 static inline int is_cgroup_event(struct perf_event *event)
560 {
561 	return 0;
562 }
563 
564 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
565 {
566 	return 0;
567 }
568 
569 static inline void update_cgrp_time_from_event(struct perf_event *event)
570 {
571 }
572 
573 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
574 {
575 }
576 
577 static inline void perf_cgroup_sched_out(struct task_struct *task,
578 					 struct task_struct *next)
579 {
580 }
581 
582 static inline void perf_cgroup_sched_in(struct task_struct *prev,
583 					struct task_struct *task)
584 {
585 }
586 
587 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
588 				      struct perf_event_attr *attr,
589 				      struct perf_event *group_leader)
590 {
591 	return -EINVAL;
592 }
593 
594 static inline void
595 perf_cgroup_set_timestamp(struct task_struct *task,
596 			  struct perf_event_context *ctx)
597 {
598 }
599 
600 void
601 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
602 {
603 }
604 
605 static inline void
606 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
607 {
608 }
609 
610 static inline u64 perf_cgroup_event_time(struct perf_event *event)
611 {
612 	return 0;
613 }
614 
615 static inline void
616 perf_cgroup_defer_enabled(struct perf_event *event)
617 {
618 }
619 
620 static inline void
621 perf_cgroup_mark_enabled(struct perf_event *event,
622 			 struct perf_event_context *ctx)
623 {
624 }
625 #endif
626 
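/*
 * pmu_disable_count is a per-cpu nesting counter: only the outermost
 * perf_pmu_disable() call actually disables the PMU, and only the
 * matching outermost perf_pmu_enable() re-enables it.
 */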
627 void perf_pmu_disable(struct pmu *pmu)
628 {
629 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
630 	if (!(*count)++)
631 		pmu->pmu_disable(pmu);
632 }
633 
634 void perf_pmu_enable(struct pmu *pmu)
635 {
636 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
637 	if (!--(*count))
638 		pmu->pmu_enable(pmu);
639 }
640 
641 static DEFINE_PER_CPU(struct list_head, rotation_list);
642 
643 /*
644  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
645  * because they're strictly cpu affine and rotate_start is called with IRQs
646  * disabled, while rotate_context is called from IRQ context.
647  */
648 static void perf_pmu_rotate_start(struct pmu *pmu)
649 {
650 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
651 	struct list_head *head = &__get_cpu_var(rotation_list);
652 
653 	WARN_ON(!irqs_disabled());
654 
655 	if (list_empty(&cpuctx->rotation_list))
656 		list_add(&cpuctx->rotation_list, head);
657 }
658 
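/*
 * Context reference counting: get_ctx() may only be called on a context
 * that is already known to have a non-zero refcount; put_ctx() drops the
 * parent context and task references and frees the context via RCU once
 * the last reference goes away.
 */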
659 static void get_ctx(struct perf_event_context *ctx)
660 {
661 	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
662 }
663 
664 static void put_ctx(struct perf_event_context *ctx)
665 {
666 	if (atomic_dec_and_test(&ctx->refcount)) {
667 		if (ctx->parent_ctx)
668 			put_ctx(ctx->parent_ctx);
669 		if (ctx->task)
670 			put_task_struct(ctx->task);
671 		kfree_rcu(ctx, rcu_head);
672 	}
673 }
674 
675 static void unclone_ctx(struct perf_event_context *ctx)
676 {
677 	if (ctx->parent_ctx) {
678 		put_ctx(ctx->parent_ctx);
679 		ctx->parent_ctx = NULL;
680 	}
681 }
682 
683 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
684 {
685 	/*
686 	 * only top level events have the pid namespace they were created in
687 	 */
688 	if (event->parent)
689 		event = event->parent;
690 
691 	return task_tgid_nr_ns(p, event->ns);
692 }
693 
694 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
695 {
696 	/*
697 	 * only top level events have the pid namespace they were created in
698 	 */
699 	if (event->parent)
700 		event = event->parent;
701 
702 	return task_pid_nr_ns(p, event->ns);
703 }
704 
705 /*
706  * If we inherit events we want to return the parent event id
707  * to userspace.
708  */
709 static u64 primary_event_id(struct perf_event *event)
710 {
711 	u64 id = event->id;
712 
713 	if (event->parent)
714 		id = event->parent->id;
715 
716 	return id;
717 }
718 
719 /*
720  * Get the perf_event_context for a task and lock it.
721  * This has to cope with the fact that until it is locked,
722  * the context could get moved to another task.
723  */
724 static struct perf_event_context *
725 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
726 {
727 	struct perf_event_context *ctx;
728 
729 retry:
730 	/*
731 	 * One of the few rules of preemptible RCU is that one cannot do
732 	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
733 	 * part of the read side critical section was preemptible -- see
734 	 * rcu_read_unlock_special().
735 	 *
736 	 * Since ctx->lock nests under rq->lock we must ensure the entire read
737 	 * side critical section is non-preemptible.
738 	 */
739 	preempt_disable();
740 	rcu_read_lock();
741 	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
742 	if (ctx) {
743 		/*
744 		 * If this context is a clone of another, it might
745 		 * get swapped for another underneath us by
746 		 * perf_event_task_sched_out, though the
747 		 * rcu_read_lock() protects us from any context
748 		 * getting freed.  Lock the context and check if it
749 		 * got swapped before we could get the lock, and retry
750 		 * if so.  If we locked the right context, then it
751 		 * can't get swapped on us any more.
752 		 */
753 		raw_spin_lock_irqsave(&ctx->lock, *flags);
754 		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
755 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
756 			rcu_read_unlock();
757 			preempt_enable();
758 			goto retry;
759 		}
760 
761 		if (!atomic_inc_not_zero(&ctx->refcount)) {
762 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
763 			ctx = NULL;
764 		}
765 	}
766 	rcu_read_unlock();
767 	preempt_enable();
768 	return ctx;
769 }
770 
771 /*
772  * Get the context for a task and increment its pin_count so it
773  * can't get swapped to another task.  This also increments its
774  * reference count so that the context can't get freed.
775  */
776 static struct perf_event_context *
777 perf_pin_task_context(struct task_struct *task, int ctxn)
778 {
779 	struct perf_event_context *ctx;
780 	unsigned long flags;
781 
782 	ctx = perf_lock_task_context(task, ctxn, &flags);
783 	if (ctx) {
784 		++ctx->pin_count;
785 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
786 	}
787 	return ctx;
788 }
789 
790 static void perf_unpin_context(struct perf_event_context *ctx)
791 {
792 	unsigned long flags;
793 
794 	raw_spin_lock_irqsave(&ctx->lock, flags);
795 	--ctx->pin_count;
796 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
797 }
798 
799 /*
800  * Update the record of the current time in a context.
801  */
802 static void update_context_time(struct perf_event_context *ctx)
803 {
804 	u64 now = perf_clock();
805 
806 	ctx->time += now - ctx->timestamp;
807 	ctx->timestamp = now;
808 }
809 
810 static u64 perf_event_time(struct perf_event *event)
811 {
812 	struct perf_event_context *ctx = event->ctx;
813 
814 	if (is_cgroup_event(event))
815 		return perf_cgroup_event_time(event);
816 
817 	return ctx ? ctx->time : 0;
818 }
819 
820 /*
821  * Update the total_time_enabled and total_time_running fields for an event.
822  * The caller of this function needs to hold the ctx->lock.
823  */
824 static void update_event_times(struct perf_event *event)
825 {
826 	struct perf_event_context *ctx = event->ctx;
827 	u64 run_end;
828 
829 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
830 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
831 		return;
832 	/*
833 	 * in cgroup mode, time_enabled represents
834 	 * the time the event was enabled AND active
835 	 * tasks were in the monitored cgroup. This is
836 	 * independent of the activity of the context as
837 	 * there may be a mix of cgroup and non-cgroup events.
838 	 *
839 	 * That is why we treat cgroup events differently
840 	 * here.
841 	 */
842 	if (is_cgroup_event(event))
843 		run_end = perf_cgroup_event_time(event);
844 	else if (ctx->is_active)
845 		run_end = ctx->time;
846 	else
847 		run_end = event->tstamp_stopped;
848 
849 	event->total_time_enabled = run_end - event->tstamp_enabled;
850 
851 	if (event->state == PERF_EVENT_STATE_INACTIVE)
852 		run_end = event->tstamp_stopped;
853 	else
854 		run_end = perf_event_time(event);
855 
856 	event->total_time_running = run_end - event->tstamp_running;
857 
858 }
859 
860 /*
861  * Update total_time_enabled and total_time_running for all events in a group.
862  */
863 static void update_group_times(struct perf_event *leader)
864 {
865 	struct perf_event *event;
866 
867 	update_event_times(leader);
868 	list_for_each_entry(event, &leader->sibling_list, group_entry)
869 		update_event_times(event);
870 }
871 
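/*
 * Pinned group leaders live on ctx->pinned_groups, everything else on
 * ctx->flexible_groups.
 */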
872 static struct list_head *
873 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
874 {
875 	if (event->attr.pinned)
876 		return &ctx->pinned_groups;
877 	else
878 		return &ctx->flexible_groups;
879 }
880 
881 /*
882  * Add an event to the lists for its context.
883  * Must be called with ctx->mutex and ctx->lock held.
884  */
885 static void
886 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
887 {
888 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
889 	event->attach_state |= PERF_ATTACH_CONTEXT;
890 
891 	/*
892 	 * If we're a standalone event or group leader, we go to the context
893 	 * list; group events are kept attached to the group so that
894 	 * perf_group_detach() can, at all times, locate all siblings.
895 	 */
896 	if (event->group_leader == event) {
897 		struct list_head *list;
898 
899 		if (is_software_event(event))
900 			event->group_flags |= PERF_GROUP_SOFTWARE;
901 
902 		list = ctx_group_list(event, ctx);
903 		list_add_tail(&event->group_entry, list);
904 	}
905 
906 	if (is_cgroup_event(event))
907 		ctx->nr_cgroups++;
908 
909 	if (has_branch_stack(event))
910 		ctx->nr_branch_stack++;
911 
912 	list_add_rcu(&event->event_entry, &ctx->event_list);
913 	if (!ctx->nr_events)
914 		perf_pmu_rotate_start(ctx->pmu);
915 	ctx->nr_events++;
916 	if (event->attr.inherit_stat)
917 		ctx->nr_stat++;
918 }
919 
920 /*
921  * Initialize event state based on the perf_event_attr::disabled.
922  */
923 static inline void perf_event__state_init(struct perf_event *event)
924 {
925 	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
926 					      PERF_EVENT_STATE_INACTIVE;
927 }
928 
929 /*
930  * Called at perf_event creation and when events are attached/detached from a
931  * group.
932  */
933 static void perf_event__read_size(struct perf_event *event)
934 {
935 	int entry = sizeof(u64); /* value */
936 	int size = 0;
937 	int nr = 1;
938 
939 	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
940 		size += sizeof(u64);
941 
942 	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
943 		size += sizeof(u64);
944 
945 	if (event->attr.read_format & PERF_FORMAT_ID)
946 		entry += sizeof(u64);
947 
948 	if (event->attr.read_format & PERF_FORMAT_GROUP) {
949 		nr += event->group_leader->nr_siblings;
950 		size += sizeof(u64);
951 	}
952 
953 	size += entry * nr;
954 	event->read_size = size;
955 }
956 
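/*
 * Precompute the fixed part of the sample record size implied by
 * attr.sample_type, including the read size computed above.
 */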
957 static void perf_event__header_size(struct perf_event *event)
958 {
959 	struct perf_sample_data *data;
960 	u64 sample_type = event->attr.sample_type;
961 	u16 size = 0;
962 
963 	perf_event__read_size(event);
964 
965 	if (sample_type & PERF_SAMPLE_IP)
966 		size += sizeof(data->ip);
967 
968 	if (sample_type & PERF_SAMPLE_ADDR)
969 		size += sizeof(data->addr);
970 
971 	if (sample_type & PERF_SAMPLE_PERIOD)
972 		size += sizeof(data->period);
973 
974 	if (sample_type & PERF_SAMPLE_READ)
975 		size += event->read_size;
976 
977 	event->header_size = size;
978 }
979 
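/*
 * Size of the identification fields (TID, TIME, ID, STREAM_ID, CPU)
 * selected in attr.sample_type.
 */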
980 static void perf_event__id_header_size(struct perf_event *event)
981 {
982 	struct perf_sample_data *data;
983 	u64 sample_type = event->attr.sample_type;
984 	u16 size = 0;
985 
986 	if (sample_type & PERF_SAMPLE_TID)
987 		size += sizeof(data->tid_entry);
988 
989 	if (sample_type & PERF_SAMPLE_TIME)
990 		size += sizeof(data->time);
991 
992 	if (sample_type & PERF_SAMPLE_ID)
993 		size += sizeof(data->id);
994 
995 	if (sample_type & PERF_SAMPLE_STREAM_ID)
996 		size += sizeof(data->stream_id);
997 
998 	if (sample_type & PERF_SAMPLE_CPU)
999 		size += sizeof(data->cpu_entry);
1000 
1001 	event->id_header_size = size;
1002 }
1003 
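/*
 * Attach an event to its group leader's sibling list (unless it is a
 * leader itself) and refresh the cached header sizes of the whole group.
 */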
1004 static void perf_group_attach(struct perf_event *event)
1005 {
1006 	struct perf_event *group_leader = event->group_leader, *pos;
1007 
1008 	/*
1009 	 * We can have double attach due to group movement in perf_event_open.
1010 	 */
1011 	if (event->attach_state & PERF_ATTACH_GROUP)
1012 		return;
1013 
1014 	event->attach_state |= PERF_ATTACH_GROUP;
1015 
1016 	if (group_leader == event)
1017 		return;
1018 
1019 	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1020 			!is_software_event(event))
1021 		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1022 
1023 	list_add_tail(&event->group_entry, &group_leader->sibling_list);
1024 	group_leader->nr_siblings++;
1025 
1026 	perf_event__header_size(group_leader);
1027 
1028 	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1029 		perf_event__header_size(pos);
1030 }
1031 
1032 /*
1033  * Remove an event from the lists for its context.
1034  * Must be called with ctx->mutex and ctx->lock held.
1035  */
1036 static void
1037 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1038 {
1039 	struct perf_cpu_context *cpuctx;
1040 	/*
1041 	 * We can have double detach due to exit/hot-unplug + close.
1042 	 */
1043 	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1044 		return;
1045 
1046 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
1047 
1048 	if (is_cgroup_event(event)) {
1049 		ctx->nr_cgroups--;
1050 		cpuctx = __get_cpu_context(ctx);
1051 		/*
1052 		 * if there are no more cgroup events
1053 		 * then clear cgrp to avoid a stale pointer
1054 		 * in update_cgrp_time_from_cpuctx()
1055 		 */
1056 		if (!ctx->nr_cgroups)
1057 			cpuctx->cgrp = NULL;
1058 	}
1059 
1060 	if (has_branch_stack(event))
1061 		ctx->nr_branch_stack--;
1062 
1063 	ctx->nr_events--;
1064 	if (event->attr.inherit_stat)
1065 		ctx->nr_stat--;
1066 
1067 	list_del_rcu(&event->event_entry);
1068 
1069 	if (event->group_leader == event)
1070 		list_del_init(&event->group_entry);
1071 
1072 	update_group_times(event);
1073 
1074 	/*
1075 	 * If event was in error state, then keep it
1076 	 * that way, otherwise bogus counts will be
1077 	 * returned on read(). The only way to get out
1078 	 * of error state is by explicit re-enabling
1079 	 * of the event
1080 	 */
1081 	if (event->state > PERF_EVENT_STATE_OFF)
1082 		event->state = PERF_EVENT_STATE_OFF;
1083 }
1084 
1085 static void perf_group_detach(struct perf_event *event)
1086 {
1087 	struct perf_event *sibling, *tmp;
1088 	struct list_head *list = NULL;
1089 
1090 	/*
1091 	 * We can have double detach due to exit/hot-unplug + close.
1092 	 */
1093 	if (!(event->attach_state & PERF_ATTACH_GROUP))
1094 		return;
1095 
1096 	event->attach_state &= ~PERF_ATTACH_GROUP;
1097 
1098 	/*
1099 	 * If this is a sibling, remove it from its group.
1100 	 */
1101 	if (event->group_leader != event) {
1102 		list_del_init(&event->group_entry);
1103 		event->group_leader->nr_siblings--;
1104 		goto out;
1105 	}
1106 
1107 	if (!list_empty(&event->group_entry))
1108 		list = &event->group_entry;
1109 
1110 	/*
1111 	 * If this was a group event with sibling events then
1112 	 * upgrade the siblings to singleton events by adding them
1113 	 * to whatever list we are on.
1114 	 */
1115 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1116 		if (list)
1117 			list_move_tail(&sibling->group_entry, list);
1118 		sibling->group_leader = sibling;
1119 
1120 		/* Inherit group flags from the previous leader */
1121 		sibling->group_flags = event->group_flags;
1122 	}
1123 
1124 out:
1125 	perf_event__header_size(event->group_leader);
1126 
1127 	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1128 		perf_event__header_size(tmp);
1129 }
1130 
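/*
 * An event is eligible to run here if it is bound to this CPU (or to no
 * particular CPU) and its cgroup, if any, matches the active one.
 */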
1131 static inline int
1132 event_filter_match(struct perf_event *event)
1133 {
1134 	return (event->cpu == -1 || event->cpu == smp_processor_id())
1135 	    && perf_cgroup_match(event);
1136 }
1137 
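/*
 * Stop a single active event on the PMU and fold the elapsed time into
 * its tstamp_* bookkeeping; inactive events that merely failed the
 * filter only get their timings maintained.
 */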
1138 static void
1139 event_sched_out(struct perf_event *event,
1140 		  struct perf_cpu_context *cpuctx,
1141 		  struct perf_event_context *ctx)
1142 {
1143 	u64 tstamp = perf_event_time(event);
1144 	u64 delta;
1145 	/*
1146 	 * An event which could not be activated because of
1147 	 * filter mismatch still needs to have its timings
1148 	 * maintained, otherwise bogus information is returned
1149 	 * via read() for time_enabled, time_running:
1150 	 */
1151 	if (event->state == PERF_EVENT_STATE_INACTIVE
1152 	    && !event_filter_match(event)) {
1153 		delta = tstamp - event->tstamp_stopped;
1154 		event->tstamp_running += delta;
1155 		event->tstamp_stopped = tstamp;
1156 	}
1157 
1158 	if (event->state != PERF_EVENT_STATE_ACTIVE)
1159 		return;
1160 
1161 	event->state = PERF_EVENT_STATE_INACTIVE;
1162 	if (event->pending_disable) {
1163 		event->pending_disable = 0;
1164 		event->state = PERF_EVENT_STATE_OFF;
1165 	}
1166 	event->tstamp_stopped = tstamp;
1167 	event->pmu->del(event, 0);
1168 	event->oncpu = -1;
1169 
1170 	if (!is_software_event(event))
1171 		cpuctx->active_oncpu--;
1172 	ctx->nr_active--;
1173 	if (event->attr.freq && event->attr.sample_freq)
1174 		ctx->nr_freq--;
1175 	if (event->attr.exclusive || !cpuctx->active_oncpu)
1176 		cpuctx->exclusive = 0;
1177 }
1178 
1179 static void
1180 group_sched_out(struct perf_event *group_event,
1181 		struct perf_cpu_context *cpuctx,
1182 		struct perf_event_context *ctx)
1183 {
1184 	struct perf_event *event;
1185 	int state = group_event->state;
1186 
1187 	event_sched_out(group_event, cpuctx, ctx);
1188 
1189 	/*
1190 	 * Schedule out siblings (if any):
1191 	 */
1192 	list_for_each_entry(event, &group_event->sibling_list, group_entry)
1193 		event_sched_out(event, cpuctx, ctx);
1194 
1195 	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1196 		cpuctx->exclusive = 0;
1197 }
1198 
1199 struct remove_event {
1200 	struct perf_event *event;
1201 	bool detach_group;
1202 };
1203 
1204 /*
1205  * Cross CPU call to remove a performance event
1206  *
1207  * We disable the event on the hardware level first. After that we
1208  * remove it from the context list.
1209  */
1210 static int __perf_remove_from_context(void *info)
1211 {
1212 	struct remove_event *re = info;
1213 	struct perf_event *event = re->event;
1214 	struct perf_event_context *ctx = event->ctx;
1215 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1216 
1217 	raw_spin_lock(&ctx->lock);
1218 	event_sched_out(event, cpuctx, ctx);
1219 	if (re->detach_group)
1220 		perf_group_detach(event);
1221 	list_del_event(event, ctx);
1222 	if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1223 		ctx->is_active = 0;
1224 		cpuctx->task_ctx = NULL;
1225 	}
1226 	raw_spin_unlock(&ctx->lock);
1227 
1228 	return 0;
1229 }
1230 
1231 
1232 /*
1233  * Remove the event from a task's (or a CPU's) list of events.
1234  *
1235  * CPU events are removed with a smp call. For task events we only
1236  * call when the task is on a CPU.
1237  *
1238  * If event->ctx is a cloned context, callers must make sure that
1239  * every task struct that event->ctx->task could possibly point to
1240  * remains valid.  This is OK when called from perf_release since
1241  * that only calls us on the top-level context, which can't be a clone.
1242  * When called from perf_event_exit_task, it's OK because the
1243  * context has been detached from its task.
1244  */
1245 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1246 {
1247 	struct perf_event_context *ctx = event->ctx;
1248 	struct task_struct *task = ctx->task;
1249 	struct remove_event re = {
1250 		.event = event,
1251 		.detach_group = detach_group,
1252 	};
1253 
1254 	lockdep_assert_held(&ctx->mutex);
1255 
1256 	if (!task) {
1257 		/*
1258 		 * Per cpu events are removed via an smp call and
1259 		 * the removal is always successful.
1260 		 */
1261 		cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1262 		return;
1263 	}
1264 
1265 retry:
1266 	if (!task_function_call(task, __perf_remove_from_context, &re))
1267 		return;
1268 
1269 	raw_spin_lock_irq(&ctx->lock);
1270 	/*
1271 	 * If we failed to find a running task, but find the context active now
1272 	 * that we've acquired the ctx->lock, retry.
1273 	 */
1274 	if (ctx->is_active) {
1275 		raw_spin_unlock_irq(&ctx->lock);
1276 		goto retry;
1277 	}
1278 
1279 	/*
1280 	 * Since the task isn't running, it's safe to remove the event; our
1281 	 * holding the ctx->lock ensures the task won't get scheduled in.
1282 	 */
1283 	if (detach_group)
1284 		perf_group_detach(event);
1285 	list_del_event(event, ctx);
1286 	raw_spin_unlock_irq(&ctx->lock);
1287 }
1288 
1289 /*
1290  * Cross CPU call to disable a performance event
1291  */
1292 static int __perf_event_disable(void *info)
1293 {
1294 	struct perf_event *event = info;
1295 	struct perf_event_context *ctx = event->ctx;
1296 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1297 
1298 	/*
1299 	 * If this is a per-task event, we need to check whether this
1300 	 * event's task is the current task on this cpu.
1301 	 *
1302 	 * Can trigger due to concurrent perf_event_context_sched_out()
1303 	 * flipping contexts around.
1304 	 */
1305 	if (ctx->task && cpuctx->task_ctx != ctx)
1306 		return -EINVAL;
1307 
1308 	raw_spin_lock(&ctx->lock);
1309 
1310 	/*
1311 	 * If the event is on, turn it off.
1312 	 * If it is in error state, leave it in error state.
1313 	 */
1314 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1315 		update_context_time(ctx);
1316 		update_cgrp_time_from_event(event);
1317 		update_group_times(event);
1318 		if (event == event->group_leader)
1319 			group_sched_out(event, cpuctx, ctx);
1320 		else
1321 			event_sched_out(event, cpuctx, ctx);
1322 		event->state = PERF_EVENT_STATE_OFF;
1323 	}
1324 
1325 	raw_spin_unlock(&ctx->lock);
1326 
1327 	return 0;
1328 }
1329 
1330 /*
1331  * Disable an event.
1332  *
1333  * If event->ctx is a cloned context, callers must make sure that
1334  * every task struct that event->ctx->task could possibly point to
1335  * remains valid.  This condition is satisfied when called through
1336  * perf_event_for_each_child or perf_event_for_each because they
1337  * hold the top-level event's child_mutex, so any descendant that
1338  * goes to exit will block in sync_child_event.
1339  * When called from perf_pending_event it's OK because event->ctx
1340  * is the current context on this CPU and preemption is disabled,
1341  * hence we can't get into perf_event_task_sched_out for this context.
1342  */
1343 void perf_event_disable(struct perf_event *event)
1344 {
1345 	struct perf_event_context *ctx = event->ctx;
1346 	struct task_struct *task = ctx->task;
1347 
1348 	if (!task) {
1349 		/*
1350 		 * Disable the event on the cpu that it's on
1351 		 */
1352 		cpu_function_call(event->cpu, __perf_event_disable, event);
1353 		return;
1354 	}
1355 
1356 retry:
1357 	if (!task_function_call(task, __perf_event_disable, event))
1358 		return;
1359 
1360 	raw_spin_lock_irq(&ctx->lock);
1361 	/*
1362 	 * If the event is still active, we need to retry the cross-call.
1363 	 */
1364 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
1365 		raw_spin_unlock_irq(&ctx->lock);
1366 		/*
1367 		 * Reload the task pointer, it might have been changed by
1368 		 * a concurrent perf_event_context_sched_out().
1369 		 */
1370 		task = ctx->task;
1371 		goto retry;
1372 	}
1373 
1374 	/*
1375 	 * Since we have the lock this context can't be scheduled
1376 	 * in, so we can change the state safely.
1377 	 */
1378 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
1379 		update_group_times(event);
1380 		event->state = PERF_EVENT_STATE_OFF;
1381 	}
1382 	raw_spin_unlock_irq(&ctx->lock);
1383 }
1384 EXPORT_SYMBOL_GPL(perf_event_disable);
1385 
1386 static void perf_set_shadow_time(struct perf_event *event,
1387 				 struct perf_event_context *ctx,
1388 				 u64 tstamp)
1389 {
1390 	/*
1391 	 * use the correct time source for the time snapshot
1392 	 *
1393 	 * We could get by without this by leveraging the
1394 	 * fact that to get to this function, the caller
1395 	 * has most likely already called update_context_time()
1396 	 * and update_cgrp_time_xx() and thus both timestamps
1397 	 * are identical (or very close). Given that tstamp is
1398 	 * already adjusted for cgroup, we could say that:
1399 	 *    tstamp - ctx->timestamp
1400 	 * is equivalent to
1401 	 *    tstamp - cgrp->timestamp.
1402 	 *
1403 	 * Then, in perf_output_read(), the calculation would
1404 	 * work with no changes because:
1405 	 * - event is guaranteed scheduled in
1406 	 * - no scheduled out in between
1407 	 * - thus the timestamp would be the same
1408 	 *
1409 	 * But this is a bit hairy.
1410 	 *
1411 	 * So instead, we have an explicit cgroup call to remain
1412 	 * within the same time source all along. We believe it
1413 	 * is cleaner and simpler to understand.
1414 	 */
1415 	if (is_cgroup_event(event))
1416 		perf_cgroup_set_shadow_time(event, tstamp);
1417 	else
1418 		event->shadow_ctx_time = tstamp - ctx->timestamp;
1419 }
1420 
1421 #define MAX_INTERRUPTS (~0ULL)
1422 
1423 static void perf_log_throttle(struct perf_event *event, int enable);
1424 
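/*
 * Try to start a single event on the PMU; returns -EAGAIN if the
 * pmu->add() call fails (e.g. no counter is available).
 */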
1425 static int
1426 event_sched_in(struct perf_event *event,
1427 		 struct perf_cpu_context *cpuctx,
1428 		 struct perf_event_context *ctx)
1429 {
1430 	u64 tstamp = perf_event_time(event);
1431 
1432 	if (event->state <= PERF_EVENT_STATE_OFF)
1433 		return 0;
1434 
1435 	event->state = PERF_EVENT_STATE_ACTIVE;
1436 	event->oncpu = smp_processor_id();
1437 
1438 	/*
1439 	 * Unthrottle events: since we just got scheduled in we might have
1440 	 * missed several ticks already, and for a heavily scheduling task
1441 	 * there is little guarantee it'll get a tick in a timely manner.
1442 	 */
1443 	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1444 		perf_log_throttle(event, 1);
1445 		event->hw.interrupts = 0;
1446 	}
1447 
1448 	/*
1449 	 * The new state must be visible before we turn it on in the hardware:
1450 	 */
1451 	smp_wmb();
1452 
1453 	if (event->pmu->add(event, PERF_EF_START)) {
1454 		event->state = PERF_EVENT_STATE_INACTIVE;
1455 		event->oncpu = -1;
1456 		return -EAGAIN;
1457 	}
1458 
1459 	event->tstamp_running += tstamp - event->tstamp_stopped;
1460 
1461 	perf_set_shadow_time(event, ctx, tstamp);
1462 
1463 	if (!is_software_event(event))
1464 		cpuctx->active_oncpu++;
1465 	ctx->nr_active++;
1466 	if (event->attr.freq && event->attr.sample_freq)
1467 		ctx->nr_freq++;
1468 
1469 	if (event->attr.exclusive)
1470 		cpuctx->exclusive = 1;
1471 
1472 	return 0;
1473 }
1474 
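/*
 * Schedule a whole group in as one PMU transaction; if any member fails
 * to schedule, the partial group is undone and -EAGAIN is returned.
 */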
1475 static int
1476 group_sched_in(struct perf_event *group_event,
1477 	       struct perf_cpu_context *cpuctx,
1478 	       struct perf_event_context *ctx)
1479 {
1480 	struct perf_event *event, *partial_group = NULL;
1481 	struct pmu *pmu = group_event->pmu;
1482 	u64 now = ctx->time;
1483 	bool simulate = false;
1484 
1485 	if (group_event->state == PERF_EVENT_STATE_OFF)
1486 		return 0;
1487 
1488 	pmu->start_txn(pmu);
1489 
1490 	if (event_sched_in(group_event, cpuctx, ctx)) {
1491 		pmu->cancel_txn(pmu);
1492 		return -EAGAIN;
1493 	}
1494 
1495 	/*
1496 	 * Schedule in siblings as one group (if any):
1497 	 */
1498 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1499 		if (event_sched_in(event, cpuctx, ctx)) {
1500 			partial_group = event;
1501 			goto group_error;
1502 		}
1503 	}
1504 
1505 	if (!pmu->commit_txn(pmu))
1506 		return 0;
1507 
1508 group_error:
1509 	/*
1510 	 * Groups can be scheduled in as one unit only, so undo any
1511 	 * partial group before returning:
1512 	 * The events up to the failed event are scheduled out normally,
1513 	 * tstamp_stopped will be updated.
1514 	 *
1515 	 * The failed events and the remaining siblings need to have
1516 	 * their timings updated as if they had gone thru event_sched_in()
1517 	 * and event_sched_out(). This is required to get consistent timings
1518 	 * across the group. This also takes care of the case where the group
1519 	 * could never be scheduled by ensuring tstamp_stopped is set to mark
1520 	 * the time the event was actually stopped, such that time delta
1521 	 * calculation in update_event_times() is correct.
1522 	 */
1523 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1524 		if (event == partial_group)
1525 			simulate = true;
1526 
1527 		if (simulate) {
1528 			event->tstamp_running += now - event->tstamp_stopped;
1529 			event->tstamp_stopped = now;
1530 		} else {
1531 			event_sched_out(event, cpuctx, ctx);
1532 		}
1533 	}
1534 	event_sched_out(group_event, cpuctx, ctx);
1535 
1536 	pmu->cancel_txn(pmu);
1537 
1538 	return -EAGAIN;
1539 }
1540 
1541 /*
1542  * Work out whether we can put this event group on the CPU now.
1543  */
1544 static int group_can_go_on(struct perf_event *event,
1545 			   struct perf_cpu_context *cpuctx,
1546 			   int can_add_hw)
1547 {
1548 	/*
1549 	 * Groups consisting entirely of software events can always go on.
1550 	 */
1551 	if (event->group_flags & PERF_GROUP_SOFTWARE)
1552 		return 1;
1553 	/*
1554 	 * If an exclusive group is already on, no other hardware
1555 	 * events can go on.
1556 	 */
1557 	if (cpuctx->exclusive)
1558 		return 0;
1559 	/*
1560 	 * If this group is exclusive and there are already
1561 	 * events on the CPU, it can't go on.
1562 	 */
1563 	if (event->attr.exclusive && cpuctx->active_oncpu)
1564 		return 0;
1565 	/*
1566 	 * Otherwise, try to add it if all previous groups were able
1567 	 * to go on.
1568 	 */
1569 	return can_add_hw;
1570 }
1571 
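/*
 * Add the event to its context and attach it to its group, initializing
 * its timestamps to the current event time.
 */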
1572 static void add_event_to_ctx(struct perf_event *event,
1573 			       struct perf_event_context *ctx)
1574 {
1575 	u64 tstamp = perf_event_time(event);
1576 
1577 	list_add_event(event, ctx);
1578 	perf_group_attach(event);
1579 	event->tstamp_enabled = tstamp;
1580 	event->tstamp_running = tstamp;
1581 	event->tstamp_stopped = tstamp;
1582 }
1583 
1584 static void task_ctx_sched_out(struct perf_event_context *ctx);
1585 static void
1586 ctx_sched_in(struct perf_event_context *ctx,
1587 	     struct perf_cpu_context *cpuctx,
1588 	     enum event_type_t event_type,
1589 	     struct task_struct *task);
1590 
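/*
 * (Re)schedule the CPU context and, if given, the task context: pinned
 * groups go first so they get priority over flexible ones.
 */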
1591 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1592 				struct perf_event_context *ctx,
1593 				struct task_struct *task)
1594 {
1595 	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1596 	if (ctx)
1597 		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1598 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1599 	if (ctx)
1600 		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1601 }
1602 
1603 /*
1604  * Cross CPU call to install and enable a performance event
1605  *
1606  * Must be called with ctx->mutex held
1607  */
1608 static int __perf_install_in_context(void *info)
1609 {
1610 	struct perf_event *event = info;
1611 	struct perf_event_context *ctx = event->ctx;
1612 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1613 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
1614 	struct task_struct *task = current;
1615 
1616 	perf_ctx_lock(cpuctx, task_ctx);
1617 	perf_pmu_disable(cpuctx->ctx.pmu);
1618 
1619 	/*
1620 	 * If there was an active task_ctx schedule it out.
1621 	 */
1622 	if (task_ctx)
1623 		task_ctx_sched_out(task_ctx);
1624 
1625 	/*
1626 	 * If the context we're installing events in is not the
1627 	 * active task_ctx, flip them.
1628 	 */
1629 	if (ctx->task && task_ctx != ctx) {
1630 		if (task_ctx)
1631 			raw_spin_unlock(&task_ctx->lock);
1632 		raw_spin_lock(&ctx->lock);
1633 		task_ctx = ctx;
1634 	}
1635 
1636 	if (task_ctx) {
1637 		cpuctx->task_ctx = task_ctx;
1638 		task = task_ctx->task;
1639 	}
1640 
1641 	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1642 
1643 	update_context_time(ctx);
1644 	/*
1645 	 * update cgrp time only if current cgrp
1646 	 * matches event->cgrp. Must be done before
1647 	 * calling add_event_to_ctx()
1648 	 */
1649 	update_cgrp_time_from_event(event);
1650 
1651 	add_event_to_ctx(event, ctx);
1652 
1653 	/*
1654 	 * Schedule everything back in
1655 	 */
1656 	perf_event_sched_in(cpuctx, task_ctx, task);
1657 
1658 	perf_pmu_enable(cpuctx->ctx.pmu);
1659 	perf_ctx_unlock(cpuctx, task_ctx);
1660 
1661 	return 0;
1662 }
1663 
1664 /*
1665  * Attach a performance event to a context
1666  *
1667  * First we add the event to the list with the hardware enable bit
1668  * in event->hw_config cleared.
1669  *
1670  * If the event is attached to a task which is on a CPU we use a smp
1671  * call to enable it in the task context. The task might have been
1672  * scheduled away, but we check this in the smp call again.
1673  */
1674 static void
1675 perf_install_in_context(struct perf_event_context *ctx,
1676 			struct perf_event *event,
1677 			int cpu)
1678 {
1679 	struct task_struct *task = ctx->task;
1680 
1681 	lockdep_assert_held(&ctx->mutex);
1682 
1683 	event->ctx = ctx;
1684 
1685 	if (!task) {
1686 		/*
1687 		 * Per cpu events are installed via an smp call and
1688 		 * the install is always successful.
1689 		 */
1690 		cpu_function_call(cpu, __perf_install_in_context, event);
1691 		return;
1692 	}
1693 
1694 retry:
1695 	if (!task_function_call(task, __perf_install_in_context, event))
1696 		return;
1697 
1698 	raw_spin_lock_irq(&ctx->lock);
1699 	/*
1700 	 * If we failed to find a running task, but find the context active now
1701 	 * that we've acquired the ctx->lock, retry.
1702 	 */
1703 	if (ctx->is_active) {
1704 		raw_spin_unlock_irq(&ctx->lock);
1705 		goto retry;
1706 	}
1707 
1708 	/*
1709 	 * Since the task isn't running, it's safe to add the event; our holding
1710 	 * the ctx->lock ensures the task won't get scheduled in.
1711 	 */
1712 	add_event_to_ctx(event, ctx);
1713 	raw_spin_unlock_irq(&ctx->lock);
1714 }
1715 
1716 /*
1717  * Put an event into inactive state and update time fields.
1718  * Enabling the leader of a group effectively enables all
1719  * the group members that aren't explicitly disabled, so we
1720  * have to update their ->tstamp_enabled also.
1721  * Note: this works for group members as well as group leaders
1722  * since the non-leader members' sibling_lists will be empty.
1723  */
1724 static void __perf_event_mark_enabled(struct perf_event *event)
1725 {
1726 	struct perf_event *sub;
1727 	u64 tstamp = perf_event_time(event);
1728 
1729 	event->state = PERF_EVENT_STATE_INACTIVE;
1730 	event->tstamp_enabled = tstamp - event->total_time_enabled;
1731 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
1732 		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1733 			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1734 	}
1735 }
1736 
1737 /*
1738  * Cross CPU call to enable a performance event
1739  */
1740 static int __perf_event_enable(void *info)
1741 {
1742 	struct perf_event *event = info;
1743 	struct perf_event_context *ctx = event->ctx;
1744 	struct perf_event *leader = event->group_leader;
1745 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1746 	int err;
1747 
1748 	/*
1749 	 * There's a time window between the 'ctx->is_active' check
1750 	 * in perf_event_enable() and this place having:
1751 	 *   - IRQs on
1752 	 *   - ctx->lock unlocked
1753 	 *
1754 	 * where the task could be killed and 'ctx' deactivated
1755 	 * by perf_event_exit_task.
1756 	 */
1757 	if (!ctx->is_active)
1758 		return -EINVAL;
1759 
1760 	raw_spin_lock(&ctx->lock);
1761 	update_context_time(ctx);
1762 
1763 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
1764 		goto unlock;
1765 
1766 	/*
1767 	 * set current task's cgroup time reference point
1768 	 */
1769 	perf_cgroup_set_timestamp(current, ctx);
1770 
1771 	__perf_event_mark_enabled(event);
1772 
1773 	if (!event_filter_match(event)) {
1774 		if (is_cgroup_event(event))
1775 			perf_cgroup_defer_enabled(event);
1776 		goto unlock;
1777 	}
1778 
1779 	/*
1780 	 * If the event is in a group and isn't the group leader,
1781 	 * then don't put it on unless the group is on.
1782 	 */
1783 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1784 		goto unlock;
1785 
1786 	if (!group_can_go_on(event, cpuctx, 1)) {
1787 		err = -EEXIST;
1788 	} else {
1789 		if (event == leader)
1790 			err = group_sched_in(event, cpuctx, ctx);
1791 		else
1792 			err = event_sched_in(event, cpuctx, ctx);
1793 	}
1794 
1795 	if (err) {
1796 		/*
1797 		 * If this event can't go on and it's part of a
1798 		 * group, then the whole group has to come off.
1799 		 */
1800 		if (leader != event)
1801 			group_sched_out(leader, cpuctx, ctx);
1802 		if (leader->attr.pinned) {
1803 			update_group_times(leader);
1804 			leader->state = PERF_EVENT_STATE_ERROR;
1805 		}
1806 	}
1807 
1808 unlock:
1809 	raw_spin_unlock(&ctx->lock);
1810 
1811 	return 0;
1812 }
1813 
1814 /*
1815  * Enable an event.
1816  *
1817  * If event->ctx is a cloned context, callers must make sure that
1818  * every task struct that event->ctx->task could possibly point to
1819  * remains valid.  This condition is satisfied when called through
1820  * perf_event_for_each_child or perf_event_for_each as described
1821  * for perf_event_disable.
1822  */
1823 void perf_event_enable(struct perf_event *event)
1824 {
1825 	struct perf_event_context *ctx = event->ctx;
1826 	struct task_struct *task = ctx->task;
1827 
1828 	if (!task) {
1829 		/*
1830 		 * Enable the event on the cpu that it's on
1831 		 */
1832 		cpu_function_call(event->cpu, __perf_event_enable, event);
1833 		return;
1834 	}
1835 
1836 	raw_spin_lock_irq(&ctx->lock);
1837 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
1838 		goto out;
1839 
1840 	/*
1841 	 * If the event is in error state, clear that first.
1842 	 * That way, if we see the event in error state below, we
1843 	 * know that it has gone back into error state, as distinct
1844 	 * from the task having been scheduled away before the
1845 	 * cross-call arrived.
1846 	 */
1847 	if (event->state == PERF_EVENT_STATE_ERROR)
1848 		event->state = PERF_EVENT_STATE_OFF;
1849 
1850 retry:
1851 	if (!ctx->is_active) {
1852 		__perf_event_mark_enabled(event);
1853 		goto out;
1854 	}
1855 
1856 	raw_spin_unlock_irq(&ctx->lock);
1857 
1858 	if (!task_function_call(task, __perf_event_enable, event))
1859 		return;
1860 
1861 	raw_spin_lock_irq(&ctx->lock);
1862 
1863 	/*
1864 	 * If the context is active and the event is still off,
1865 	 * we need to retry the cross-call.
1866 	 */
1867 	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1868 		/*
1869 		 * task could have been flipped by a concurrent
1870 		 * perf_event_context_sched_out()
1871 		 */
1872 		task = ctx->task;
1873 		goto retry;
1874 	}
1875 
1876 out:
1877 	raw_spin_unlock_irq(&ctx->lock);
1878 }
1879 EXPORT_SYMBOL_GPL(perf_event_enable);
1880 
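/*
 * Allow @refresh more overflows before the event throttles itself off again:
 * the refresh count is added to event->event_limit and the event is
 * (re-)enabled. Not supported on inherited events or non-sampling events.
 */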
1881 int perf_event_refresh(struct perf_event *event, int refresh)
1882 {
1883 	/*
1884 	 * not supported on inherited events
1885 	 */
1886 	if (event->attr.inherit || !is_sampling_event(event))
1887 		return -EINVAL;
1888 
1889 	atomic_add(refresh, &event->event_limit);
1890 	perf_event_enable(event);
1891 
1892 	return 0;
1893 }
1894 EXPORT_SYMBOL_GPL(perf_event_refresh);
1895 
1896 static void ctx_sched_out(struct perf_event_context *ctx,
1897 			  struct perf_cpu_context *cpuctx,
1898 			  enum event_type_t event_type)
1899 {
1900 	struct perf_event *event;
1901 	int is_active = ctx->is_active;
1902 
1903 	ctx->is_active &= ~event_type;
1904 	if (likely(!ctx->nr_events))
1905 		return;
1906 
1907 	update_context_time(ctx);
1908 	update_cgrp_time_from_cpuctx(cpuctx);
1909 	if (!ctx->nr_active)
1910 		return;
1911 
1912 	perf_pmu_disable(ctx->pmu);
1913 	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1914 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1915 			group_sched_out(event, cpuctx, ctx);
1916 	}
1917 
1918 	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1919 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1920 			group_sched_out(event, cpuctx, ctx);
1921 	}
1922 	perf_pmu_enable(ctx->pmu);
1923 }
1924 
1925 /*
1926  * Test whether two contexts are equivalent, i.e. whether they
1927  * have both been cloned from the same version of the same context
1928  * and they both have the same number of enabled events.
1929  * If the number of enabled events is the same, then the set
1930  * of enabled events should be the same, because these are both
1931  * inherited contexts, therefore we can't access individual events
1932  * in them directly with an fd; we can only enable/disable all
1933  * events via prctl, or enable/disable all events in a family
1934  * via ioctl, which will have the same effect on both contexts.
1935  */
1936 static int context_equiv(struct perf_event_context *ctx1,
1937 			 struct perf_event_context *ctx2)
1938 {
1939 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1940 		&& ctx1->parent_gen == ctx2->parent_gen
1941 		&& !ctx1->pin_count && !ctx2->pin_count;
1942 }
1943 
1944 static void __perf_event_sync_stat(struct perf_event *event,
1945 				     struct perf_event *next_event)
1946 {
1947 	u64 value;
1948 
1949 	if (!event->attr.inherit_stat)
1950 		return;
1951 
1952 	/*
1953 	 * Update the event value. We cannot use perf_event_read()
1954 	 * because we're in the middle of a context switch and have IRQs
1955 	 * disabled, which upsets smp_call_function_single(); however,
1956 	 * we know the event must be on the current CPU, therefore we
1957 	 * don't need to use it.
1958 	 */
1959 	switch (event->state) {
1960 	case PERF_EVENT_STATE_ACTIVE:
1961 		event->pmu->read(event);
1962 		/* fall-through */
1963 
1964 	case PERF_EVENT_STATE_INACTIVE:
1965 		update_event_times(event);
1966 		break;
1967 
1968 	default:
1969 		break;
1970 	}
1971 
1972 	/*
1973 	 * In order to keep per-task stats reliable we need to flip the event
1974 	 * values when we flip the contexts.
1975 	 */
1976 	value = local64_read(&next_event->count);
1977 	value = local64_xchg(&event->count, value);
1978 	local64_set(&next_event->count, value);
1979 
1980 	swap(event->total_time_enabled, next_event->total_time_enabled);
1981 	swap(event->total_time_running, next_event->total_time_running);
1982 
1983 	/*
1984 	 * Since we swizzled the values, update the user visible data too.
1985 	 */
1986 	perf_event_update_userpage(event);
1987 	perf_event_update_userpage(next_event);
1988 }
1989 
1990 static void perf_event_sync_stat(struct perf_event_context *ctx,
1991 				   struct perf_event_context *next_ctx)
1992 {
1993 	struct perf_event *event, *next_event;
1994 
1995 	if (!ctx->nr_stat)
1996 		return;
1997 
1998 	update_context_time(ctx);
1999 
2000 	event = list_first_entry(&ctx->event_list,
2001 				   struct perf_event, event_entry);
2002 
2003 	next_event = list_first_entry(&next_ctx->event_list,
2004 					struct perf_event, event_entry);
2005 
2006 	while (&event->event_entry != &ctx->event_list &&
2007 	       &next_event->event_entry != &next_ctx->event_list) {
2008 
2009 		__perf_event_sync_stat(event, next_event);
2010 
2011 		event = list_next_entry(event, event_entry);
2012 		next_event = list_next_entry(next_event, event_entry);
2013 	}
2014 }
2015 
2016 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2017 					 struct task_struct *next)
2018 {
2019 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2020 	struct perf_event_context *next_ctx;
2021 	struct perf_event_context *parent;
2022 	struct perf_cpu_context *cpuctx;
2023 	int do_switch = 1;
2024 
2025 	if (likely(!ctx))
2026 		return;
2027 
2028 	cpuctx = __get_cpu_context(ctx);
2029 	if (!cpuctx->task_ctx)
2030 		return;
2031 
2032 	rcu_read_lock();
2033 	parent = rcu_dereference(ctx->parent_ctx);
2034 	next_ctx = next->perf_event_ctxp[ctxn];
2035 	if (parent && next_ctx &&
2036 	    rcu_dereference(next_ctx->parent_ctx) == parent) {
2037 		/*
2038 		 * Looks like the two contexts are clones, so we might be
2039 		 * able to optimize the context switch.  We lock both
2040 		 * contexts and check that they are clones under the
2041 		 * lock (including re-checking that neither has been
2042 		 * uncloned in the meantime).  It doesn't matter which
2043 		 * order we take the locks because no other cpu could
2044 		 * be trying to lock both of these tasks.
2045 		 */
2046 		raw_spin_lock(&ctx->lock);
2047 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2048 		if (context_equiv(ctx, next_ctx)) {
2049 			/*
2050 			 * XXX do we need a memory barrier of sorts
2051 			 * with respect to the rcu_dereference() of perf_event_ctxp?
2052 			 */
2053 			task->perf_event_ctxp[ctxn] = next_ctx;
2054 			next->perf_event_ctxp[ctxn] = ctx;
2055 			ctx->task = next;
2056 			next_ctx->task = task;
2057 			do_switch = 0;
2058 
2059 			perf_event_sync_stat(ctx, next_ctx);
2060 		}
2061 		raw_spin_unlock(&next_ctx->lock);
2062 		raw_spin_unlock(&ctx->lock);
2063 	}
2064 	rcu_read_unlock();
2065 
2066 	if (do_switch) {
2067 		raw_spin_lock(&ctx->lock);
2068 		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2069 		cpuctx->task_ctx = NULL;
2070 		raw_spin_unlock(&ctx->lock);
2071 	}
2072 }
2073 
2074 #define for_each_task_context_nr(ctxn)					\
2075 	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2076 
2077 /*
2078  * Called from scheduler to remove the events of the current task,
2079  * with interrupts disabled.
2080  *
2081  * We stop each event and update the event value in event->count.
2082  *
2083  * This does not protect us against NMIs, but disable()
2084  * sets the disabled bit in the control field of the event _before_
2085  * accessing the event control register. If an NMI hits, then it will
2086  * not restart the event.
2087  */
2088 void __perf_event_task_sched_out(struct task_struct *task,
2089 				 struct task_struct *next)
2090 {
2091 	int ctxn;
2092 
2093 	for_each_task_context_nr(ctxn)
2094 		perf_event_context_sched_out(task, ctxn, next);
2095 
2096 	/*
2097 	 * If cgroup events exist on this CPU, then we need
2098 	 * to check if we have to switch out PMU state.
2099 	 * Cgroup events are system-wide mode only.
2100 	 */
2101 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2102 		perf_cgroup_sched_out(task, next);
2103 }
2104 
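/*
 * Unschedule the given task context from this CPU, if it is the one
 * currently installed as cpuctx->task_ctx.
 */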
2105 static void task_ctx_sched_out(struct perf_event_context *ctx)
2106 {
2107 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2108 
2109 	if (!cpuctx->task_ctx)
2110 		return;
2111 
2112 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2113 		return;
2114 
2115 	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2116 	cpuctx->task_ctx = NULL;
2117 }
2118 
2119 /*
2120  * Called with IRQs disabled
2121  */
2122 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2123 			      enum event_type_t event_type)
2124 {
2125 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2126 }
2127 
2128 static void
2129 ctx_pinned_sched_in(struct perf_event_context *ctx,
2130 		    struct perf_cpu_context *cpuctx)
2131 {
2132 	struct perf_event *event;
2133 
2134 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2135 		if (event->state <= PERF_EVENT_STATE_OFF)
2136 			continue;
2137 		if (!event_filter_match(event))
2138 			continue;
2139 
2140 		/* may need to reset tstamp_enabled */
2141 		if (is_cgroup_event(event))
2142 			perf_cgroup_mark_enabled(event, ctx);
2143 
2144 		if (group_can_go_on(event, cpuctx, 1))
2145 			group_sched_in(event, cpuctx, ctx);
2146 
2147 		/*
2148 		 * If this pinned group hasn't been scheduled,
2149 		 * put it in error state.
2150 		 */
2151 		if (event->state == PERF_EVENT_STATE_INACTIVE) {
2152 			update_group_times(event);
2153 			event->state = PERF_EVENT_STATE_ERROR;
2154 		}
2155 	}
2156 }
2157 
2158 static void
2159 ctx_flexible_sched_in(struct perf_event_context *ctx,
2160 		      struct perf_cpu_context *cpuctx)
2161 {
2162 	struct perf_event *event;
2163 	int can_add_hw = 1;
2164 
2165 	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2166 		/* Ignore events in OFF or ERROR state */
2167 		if (event->state <= PERF_EVENT_STATE_OFF)
2168 			continue;
2169 		/*
2170 		 * Listen to the 'cpu' scheduling filter constraint
2171 		 * of events:
2172 		 */
2173 		if (!event_filter_match(event))
2174 			continue;
2175 
2176 		/* may need to reset tstamp_enabled */
2177 		if (is_cgroup_event(event))
2178 			perf_cgroup_mark_enabled(event, ctx);
2179 
2180 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
2181 			if (group_sched_in(event, cpuctx, ctx))
2182 				can_add_hw = 0;
2183 		}
2184 	}
2185 }
2186 
2187 static void
2188 ctx_sched_in(struct perf_event_context *ctx,
2189 	     struct perf_cpu_context *cpuctx,
2190 	     enum event_type_t event_type,
2191 	     struct task_struct *task)
2192 {
2193 	u64 now;
2194 	int is_active = ctx->is_active;
2195 
2196 	ctx->is_active |= event_type;
2197 	if (likely(!ctx->nr_events))
2198 		return;
2199 
2200 	now = perf_clock();
2201 	ctx->timestamp = now;
2202 	perf_cgroup_set_timestamp(task, ctx);
2203 	/*
2204 	 * First go through the list and put on any pinned groups
2205 	 * in order to give them the best chance of going on.
2206 	 */
2207 	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2208 		ctx_pinned_sched_in(ctx, cpuctx);
2209 
2210 	/* Then walk through the lower prio flexible groups */
2211 	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2212 		ctx_flexible_sched_in(ctx, cpuctx);
2213 }
2214 
2215 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2216 			     enum event_type_t event_type,
2217 			     struct task_struct *task)
2218 {
2219 	struct perf_event_context *ctx = &cpuctx->ctx;
2220 
2221 	ctx_sched_in(ctx, cpuctx, event_type, task);
2222 }
2223 
2224 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2225 					struct task_struct *task)
2226 {
2227 	struct perf_cpu_context *cpuctx;
2228 
2229 	cpuctx = __get_cpu_context(ctx);
2230 	if (cpuctx->task_ctx == ctx)
2231 		return;
2232 
2233 	perf_ctx_lock(cpuctx, ctx);
2234 	perf_pmu_disable(ctx->pmu);
2235 	/*
2236 	 * We want to keep the following priority order:
2237 	 * cpu pinned (that don't need to move), task pinned,
2238 	 * cpu flexible, task flexible.
2239 	 */
2240 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2241 
2242 	if (ctx->nr_events)
2243 		cpuctx->task_ctx = ctx;
2244 
2245 	perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2246 
2247 	perf_pmu_enable(ctx->pmu);
2248 	perf_ctx_unlock(cpuctx, ctx);
2249 
2250 	/*
2251 	 * Since these rotations are per-cpu, we need to ensure the
2252 	 * cpu-context we got scheduled on is actually rotating.
2253 	 */
2254 	perf_pmu_rotate_start(ctx->pmu);
2255 }
2256 
2257 /*
2258  * When sampling the branch stack in system-wide mode, it may be necessary
2259  * to flush the stack on context switch. This happens when the branch
2260  * stack does not tag its entries with the pid of the current task.
2261  * Otherwise it becomes impossible to associate a branch entry with a
2262  * task. This ambiguity is more likely to appear when the branch stack
2263  * supports priv level filtering and the user sets it to monitor only
2264  * at the user level (which could be a useful measurement in system-wide
2265  * mode). In that case, the risk is high of having a branch stack with
2266  * branch from multiple tasks. Flushing may mean dropping the existing
2267  * entries or stashing them somewhere in the PMU specific code layer.
2268  *
2269  * This function provides the context switch callback to the lower code
2270  * layer. It is invoked ONLY when there is at least one system-wide context
2271  * with at least one active event using taken branch sampling.
2272  */
2273 static void perf_branch_stack_sched_in(struct task_struct *prev,
2274 				       struct task_struct *task)
2275 {
2276 	struct perf_cpu_context *cpuctx;
2277 	struct pmu *pmu;
2278 	unsigned long flags;
2279 
2280 	/* no need to flush branch stack if not changing task */
2281 	if (prev == task)
2282 		return;
2283 
2284 	local_irq_save(flags);
2285 
2286 	rcu_read_lock();
2287 
2288 	list_for_each_entry_rcu(pmu, &pmus, entry) {
2289 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2290 
2291 		/*
2292 		 * check if the context has at least one
2293 		 * event using PERF_SAMPLE_BRANCH_STACK
2294 		 */
2295 		if (cpuctx->ctx.nr_branch_stack > 0
2296 		    && pmu->flush_branch_stack) {
2297 
2298 			pmu = cpuctx->ctx.pmu;
2299 
2300 			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2301 
2302 			perf_pmu_disable(pmu);
2303 
2304 			pmu->flush_branch_stack();
2305 
2306 			perf_pmu_enable(pmu);
2307 
2308 			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2309 		}
2310 	}
2311 
2312 	rcu_read_unlock();
2313 
2314 	local_irq_restore(flags);
2315 }
2316 
2317 /*
2318  * Called from scheduler to add the events of the current task
2319  * with interrupts disabled.
2320  *
2321  * We restore the event value and then enable it.
2322  *
2323  * This does not protect us against NMIs, but enable()
2324  * sets the enabled bit in the control field of the event _before_
2325  * accessing the event control register. If an NMI hits, then it will
2326  * keep the event running.
2327  */
2328 void __perf_event_task_sched_in(struct task_struct *prev,
2329 				struct task_struct *task)
2330 {
2331 	struct perf_event_context *ctx;
2332 	int ctxn;
2333 
2334 	for_each_task_context_nr(ctxn) {
2335 		ctx = task->perf_event_ctxp[ctxn];
2336 		if (likely(!ctx))
2337 			continue;
2338 
2339 		perf_event_context_sched_in(ctx, task);
2340 	}
2341 	/*
2342 	 * If cgroup events exist on this CPU, then we need
2343 	 * to check if we have to switch in PMU state.
2344 	 * Cgroup events are system-wide mode only.
2345 	 */
2346 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2347 		perf_cgroup_sched_in(prev, task);
2348 
2349 	/* check for system-wide branch_stack events */
2350 	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2351 		perf_branch_stack_sched_in(prev, task);
2352 }
2353 
2354 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2355 {
2356 	u64 frequency = event->attr.sample_freq;
2357 	u64 sec = NSEC_PER_SEC;
2358 	u64 divisor, dividend;
2359 
2360 	int count_fls, nsec_fls, frequency_fls, sec_fls;
2361 
2362 	count_fls = fls64(count);
2363 	nsec_fls = fls64(nsec);
2364 	frequency_fls = fls64(frequency);
2365 	sec_fls = 30;
2366 
2367 	/*
2368 	 * We got @count in @nsec, with a target of sample_freq HZ
2369 	 * the target period becomes:
2370 	 *
2371 	 *             @count * 10^9
2372 	 * period = -------------------
2373 	 *          @nsec * sample_freq
2374 	 *
2375 	 */
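	/*
	 * For example, with sample_freq = 1000 Hz, counting 2,000,000 events
	 * over 4,000,000 ns (4 ms) gives:
	 *
	 *   period = 2e6 * 1e9 / (4e6 * 1000) = 500,000 events per sample
	 */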
2376 
2377 	/*
2378 	 * Reduce accuracy by one bit such that @a and @b converge
2379 	 * to a similar magnitude.
2380 	 */
2381 #define REDUCE_FLS(a, b)		\
2382 do {					\
2383 	if (a##_fls > b##_fls) {	\
2384 		a >>= 1;		\
2385 		a##_fls--;		\
2386 	} else {			\
2387 		b >>= 1;		\
2388 		b##_fls--;		\
2389 	}				\
2390 } while (0)
2391 
2392 	/*
2393 	 * Reduce accuracy until either term fits in a u64, then proceed with
2394 	 * the other, so that finally we can do a u64/u64 division.
2395 	 */
2396 	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2397 		REDUCE_FLS(nsec, frequency);
2398 		REDUCE_FLS(sec, count);
2399 	}
2400 
2401 	if (count_fls + sec_fls > 64) {
2402 		divisor = nsec * frequency;
2403 
2404 		while (count_fls + sec_fls > 64) {
2405 			REDUCE_FLS(count, sec);
2406 			divisor >>= 1;
2407 		}
2408 
2409 		dividend = count * sec;
2410 	} else {
2411 		dividend = count * sec;
2412 
2413 		while (nsec_fls + frequency_fls > 64) {
2414 			REDUCE_FLS(nsec, frequency);
2415 			dividend >>= 1;
2416 		}
2417 
2418 		divisor = nsec * frequency;
2419 	}
2420 
2421 	if (!divisor)
2422 		return dividend;
2423 
2424 	return div64_u64(dividend, divisor);
2425 }
2426 
2427 static DEFINE_PER_CPU(int, perf_throttled_count);
2428 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2429 
2430 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2431 {
2432 	struct hw_perf_event *hwc = &event->hw;
2433 	s64 period, sample_period;
2434 	s64 delta;
2435 
2436 	period = perf_calculate_period(event, nsec, count);
2437 
2438 	delta = (s64)(period - hwc->sample_period);
2439 	delta = (delta + 7) / 8; /* low pass filter */
2440 
2441 	sample_period = hwc->sample_period + delta;
2442 
2443 	if (!sample_period)
2444 		sample_period = 1;
2445 
2446 	hwc->sample_period = sample_period;
2447 
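	/*
	 * If more than 8 full periods of work are still pending, the old
	 * period estimate was clearly off; drop the backlog so the new
	 * sample period takes effect right away rather than after the
	 * stale remainder drains.
	 */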
2448 	if (local64_read(&hwc->period_left) > 8*sample_period) {
2449 		if (disable)
2450 			event->pmu->stop(event, PERF_EF_UPDATE);
2451 
2452 		local64_set(&hwc->period_left, 0);
2453 
2454 		if (disable)
2455 			event->pmu->start(event, PERF_EF_RELOAD);
2456 	}
2457 }
2458 
2459 /*
2460  * Combine frequency adjustment with unthrottling to avoid two passes over
2461  * the events. At the same time, make sure that having freq events does not
2462  * change the rate of unthrottling, as that would introduce bias.
2463  */
2464 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2465 					   int needs_unthr)
2466 {
2467 	struct perf_event *event;
2468 	struct hw_perf_event *hwc;
2469 	u64 now, period = TICK_NSEC;
2470 	s64 delta;
2471 
2472 	/*
2473 	 * We only need to iterate over all events if:
2474 	 * - the context has events in frequency mode (needs freq adjust), or
2475 	 * - there are events to unthrottle on this CPU
2476 	 */
2477 	if (!(ctx->nr_freq || needs_unthr))
2478 		return;
2479 
2480 	raw_spin_lock(&ctx->lock);
2481 	perf_pmu_disable(ctx->pmu);
2482 
2483 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2484 		if (event->state != PERF_EVENT_STATE_ACTIVE)
2485 			continue;
2486 
2487 		if (!event_filter_match(event))
2488 			continue;
2489 
2490 		hwc = &event->hw;
2491 
2492 		if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2493 			hwc->interrupts = 0;
2494 			perf_log_throttle(event, 1);
2495 			event->pmu->start(event, 0);
2496 		}
2497 
2498 		if (!event->attr.freq || !event->attr.sample_freq)
2499 			continue;
2500 
2501 		/*
2502 		 * stop the event and update event->count
2503 		 */
2504 		event->pmu->stop(event, PERF_EF_UPDATE);
2505 
2506 		now = local64_read(&event->count);
2507 		delta = now - hwc->freq_count_stamp;
2508 		hwc->freq_count_stamp = now;
2509 
2510 		/*
2511 		 * Restart the event; reload only if the value
2512 		 * has changed. We have already stopped the
2513 		 * event, so tell perf_adjust_period() that it
2514 		 * is stopped, to avoid stopping it
2515 		 * twice.
2516 		 */
2517 		if (delta > 0)
2518 			perf_adjust_period(event, period, delta, false);
2519 
2520 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2521 	}
2522 
2523 	perf_pmu_enable(ctx->pmu);
2524 	raw_spin_unlock(&ctx->lock);
2525 }
2526 
2527 /*
2528  * Round-robin a context's events:
2529  */
2530 static void rotate_ctx(struct perf_event_context *ctx)
2531 {
2532 	/*
2533 	 * Rotate the first entry of the non-pinned groups to the end of the
2534 	 * list. Rotation might be disabled by the inheritance code.
2535 	 */
2536 	if (!ctx->rotate_disable)
2537 		list_rotate_left(&ctx->flexible_groups);
2538 }
2539 
2540 /*
2541  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2542  * because they're strictly cpu affine and rotate_start is called with IRQs
2543  * disabled, while rotate_context is called from IRQ context.
2544  */
2545 static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2546 {
2547 	struct perf_event_context *ctx = NULL;
2548 	int rotate = 0, remove = 1;
2549 
2550 	if (cpuctx->ctx.nr_events) {
2551 		remove = 0;
2552 		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2553 			rotate = 1;
2554 	}
2555 
2556 	ctx = cpuctx->task_ctx;
2557 	if (ctx && ctx->nr_events) {
2558 		remove = 0;
2559 		if (ctx->nr_events != ctx->nr_active)
2560 			rotate = 1;
2561 	}
2562 
2563 	if (!rotate)
2564 		goto done;
2565 
2566 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2567 	perf_pmu_disable(cpuctx->ctx.pmu);
2568 
2569 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2570 	if (ctx)
2571 		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2572 
2573 	rotate_ctx(&cpuctx->ctx);
2574 	if (ctx)
2575 		rotate_ctx(ctx);
2576 
2577 	perf_event_sched_in(cpuctx, ctx, current);
2578 
2579 	perf_pmu_enable(cpuctx->ctx.pmu);
2580 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2581 done:
2582 	if (remove)
2583 		list_del_init(&cpuctx->rotation_list);
2584 }
2585 
2586 void perf_event_task_tick(void)
2587 {
2588 	struct list_head *head = &__get_cpu_var(rotation_list);
2589 	struct perf_cpu_context *cpuctx, *tmp;
2590 	struct perf_event_context *ctx;
2591 	int throttled;
2592 
2593 	WARN_ON(!irqs_disabled());
2594 
2595 	__this_cpu_inc(perf_throttled_seq);
2596 	throttled = __this_cpu_xchg(perf_throttled_count, 0);
2597 
2598 	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2599 		ctx = &cpuctx->ctx;
2600 		perf_adjust_freq_unthr_context(ctx, throttled);
2601 
2602 		ctx = cpuctx->task_ctx;
2603 		if (ctx)
2604 			perf_adjust_freq_unthr_context(ctx, throttled);
2605 
2606 		if (cpuctx->jiffies_interval == 1 ||
2607 				!(jiffies % cpuctx->jiffies_interval))
2608 			perf_rotate_context(cpuctx);
2609 	}
2610 }
2611 
2612 static int event_enable_on_exec(struct perf_event *event,
2613 				struct perf_event_context *ctx)
2614 {
2615 	if (!event->attr.enable_on_exec)
2616 		return 0;
2617 
2618 	event->attr.enable_on_exec = 0;
2619 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
2620 		return 0;
2621 
2622 	__perf_event_mark_enabled(event);
2623 
2624 	return 1;
2625 }
2626 
2627 /*
2628  * Enable all of a task's events that have been marked enable-on-exec.
2629  * This expects task == current.
2630  */
2631 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2632 {
2633 	struct perf_event *event;
2634 	unsigned long flags;
2635 	int enabled = 0;
2636 	int ret;
2637 
2638 	local_irq_save(flags);
2639 	if (!ctx || !ctx->nr_events)
2640 		goto out;
2641 
2642 	/*
2643 	 * We must context-switch out cgroup events to avoid a conflict
2644 	 * when invoking perf_event_context_sched_in() later on
2645 	 * in this function. Otherwise we end up trying to
2646 	 * switch in cgroup events which are already scheduled
2647 	 * in.
2648 	 */
2649 	perf_cgroup_sched_out(current, NULL);
2650 
2651 	raw_spin_lock(&ctx->lock);
2652 	task_ctx_sched_out(ctx);
2653 
2654 	list_for_each_entry(event, &ctx->event_list, event_entry) {
2655 		ret = event_enable_on_exec(event, ctx);
2656 		if (ret)
2657 			enabled = 1;
2658 	}
2659 
2660 	/*
2661 	 * Unclone this context if we enabled any event.
2662 	 */
2663 	if (enabled)
2664 		unclone_ctx(ctx);
2665 
2666 	raw_spin_unlock(&ctx->lock);
2667 
2668 	/*
2669 	 * This also switches cgroup events back in, if any:
2670 	 */
2671 	perf_event_context_sched_in(ctx, ctx->task);
2672 out:
2673 	local_irq_restore(flags);
2674 }
2675 
2676 /*
2677  * Cross CPU call to read the hardware event
2678  */
2679 static void __perf_event_read(void *info)
2680 {
2681 	struct perf_event *event = info;
2682 	struct perf_event_context *ctx = event->ctx;
2683 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2684 
2685 	/*
2686 	 * If this is a task context, we need to check whether it is
2687 	 * the current task context of this cpu.  If not it has been
2688 	 * scheduled out before the smp call arrived.  In that case
2689 	 * event->count would have been updated to a recent sample
2690 	 * when the event was scheduled out.
2691 	 */
2692 	if (ctx->task && cpuctx->task_ctx != ctx)
2693 		return;
2694 
2695 	raw_spin_lock(&ctx->lock);
2696 	if (ctx->is_active) {
2697 		update_context_time(ctx);
2698 		update_cgrp_time_from_event(event);
2699 	}
2700 	update_event_times(event);
2701 	if (event->state == PERF_EVENT_STATE_ACTIVE)
2702 		event->pmu->read(event);
2703 	raw_spin_unlock(&ctx->lock);
2704 }
2705 
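/*
 * The count visible to userspace: the locally accumulated value plus
 * whatever has been folded back in from exited child (inherited) events.
 */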
2706 static inline u64 perf_event_count(struct perf_event *event)
2707 {
2708 	return local64_read(&event->count) + atomic64_read(&event->child_count);
2709 }
2710 
2711 static u64 perf_event_read(struct perf_event *event)
2712 {
2713 	/*
2714 	 * If event is enabled and currently active on a CPU, update the
2715 	 * value in the event structure:
2716 	 */
2717 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
2718 		smp_call_function_single(event->oncpu,
2719 					 __perf_event_read, event, 1);
2720 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2721 		struct perf_event_context *ctx = event->ctx;
2722 		unsigned long flags;
2723 
2724 		raw_spin_lock_irqsave(&ctx->lock, flags);
2725 		/*
2726 		 * We may read while the context is not active
2727 		 * (e.g., the thread is blocked); in that case
2728 		 * we cannot update the context time.
2729 		 */
2730 		if (ctx->is_active) {
2731 			update_context_time(ctx);
2732 			update_cgrp_time_from_event(event);
2733 		}
2734 		update_event_times(event);
2735 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
2736 	}
2737 
2738 	return perf_event_count(event);
2739 }
2740 
2741 /*
2742  * Initialize the perf_event context in a task_struct:
2743  */
2744 static void __perf_event_init_context(struct perf_event_context *ctx)
2745 {
2746 	raw_spin_lock_init(&ctx->lock);
2747 	mutex_init(&ctx->mutex);
2748 	INIT_LIST_HEAD(&ctx->pinned_groups);
2749 	INIT_LIST_HEAD(&ctx->flexible_groups);
2750 	INIT_LIST_HEAD(&ctx->event_list);
2751 	atomic_set(&ctx->refcount, 1);
2752 }
2753 
2754 static struct perf_event_context *
2755 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2756 {
2757 	struct perf_event_context *ctx;
2758 
2759 	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2760 	if (!ctx)
2761 		return NULL;
2762 
2763 	__perf_event_init_context(ctx);
2764 	if (task) {
2765 		ctx->task = task;
2766 		get_task_struct(task);
2767 	}
2768 	ctx->pmu = pmu;
2769 
2770 	return ctx;
2771 }
2772 
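/*
 * Resolve @vpid to a task and take a reference on it; vpid == 0 means the
 * calling task. The caller must be allowed to ptrace-read the target.
 */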
2773 static struct task_struct *
2774 find_lively_task_by_vpid(pid_t vpid)
2775 {
2776 	struct task_struct *task;
2777 	int err;
2778 
2779 	rcu_read_lock();
2780 	if (!vpid)
2781 		task = current;
2782 	else
2783 		task = find_task_by_vpid(vpid);
2784 	if (task)
2785 		get_task_struct(task);
2786 	rcu_read_unlock();
2787 
2788 	if (!task)
2789 		return ERR_PTR(-ESRCH);
2790 
2791 	/* Reuse ptrace permission checks for now. */
2792 	err = -EACCES;
2793 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
2794 		goto errout;
2795 
2796 	return task;
2797 errout:
2798 	put_task_struct(task);
2799 	return ERR_PTR(err);
2800 
2801 }
2802 
2803 /*
2804  * Returns a matching context with refcount and pincount.
2805  */
2806 static struct perf_event_context *
2807 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2808 {
2809 	struct perf_event_context *ctx;
2810 	struct perf_cpu_context *cpuctx;
2811 	unsigned long flags;
2812 	int ctxn, err;
2813 
2814 	if (!task) {
2815 		/* Must be root to operate on a CPU event: */
2816 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2817 			return ERR_PTR(-EACCES);
2818 
2819 		/*
2820 		 * We could be clever and allow attaching an event to an
2821 		 * offline CPU and activate it when the CPU comes up, but
2822 		 * that's for later.
2823 		 */
2824 		if (!cpu_online(cpu))
2825 			return ERR_PTR(-ENODEV);
2826 
2827 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2828 		ctx = &cpuctx->ctx;
2829 		get_ctx(ctx);
2830 		++ctx->pin_count;
2831 
2832 		return ctx;
2833 	}
2834 
2835 	err = -EINVAL;
2836 	ctxn = pmu->task_ctx_nr;
2837 	if (ctxn < 0)
2838 		goto errout;
2839 
2840 retry:
2841 	ctx = perf_lock_task_context(task, ctxn, &flags);
2842 	if (ctx) {
2843 		unclone_ctx(ctx);
2844 		++ctx->pin_count;
2845 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
2846 	} else {
2847 		ctx = alloc_perf_context(pmu, task);
2848 		err = -ENOMEM;
2849 		if (!ctx)
2850 			goto errout;
2851 
2852 		err = 0;
2853 		mutex_lock(&task->perf_event_mutex);
2854 		/*
2855 		 * If the task has already passed perf_event_exit_task(),
2856 		 * we must see PF_EXITING; it takes this mutex too.
2857 		 */
2858 		if (task->flags & PF_EXITING)
2859 			err = -ESRCH;
2860 		else if (task->perf_event_ctxp[ctxn])
2861 			err = -EAGAIN;
2862 		else {
2863 			get_ctx(ctx);
2864 			++ctx->pin_count;
2865 			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2866 		}
2867 		mutex_unlock(&task->perf_event_mutex);
2868 
2869 		if (unlikely(err)) {
2870 			put_ctx(ctx);
2871 
2872 			if (err == -EAGAIN)
2873 				goto retry;
2874 			goto errout;
2875 		}
2876 	}
2877 
2878 	return ctx;
2879 
2880 errout:
2881 	return ERR_PTR(err);
2882 }
2883 
2884 static void perf_event_free_filter(struct perf_event *event);
2885 
2886 static void free_event_rcu(struct rcu_head *head)
2887 {
2888 	struct perf_event *event;
2889 
2890 	event = container_of(head, struct perf_event, rcu_head);
2891 	if (event->ns)
2892 		put_pid_ns(event->ns);
2893 	perf_event_free_filter(event);
2894 	kfree(event);
2895 }
2896 
2897 static void ring_buffer_put(struct ring_buffer *rb);
2898 static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
2899 
2900 static void free_event(struct perf_event *event)
2901 {
2902 	irq_work_sync(&event->pending);
2903 
2904 	if (!event->parent) {
2905 		if (event->attach_state & PERF_ATTACH_TASK)
2906 			static_key_slow_dec_deferred(&perf_sched_events);
2907 		if (event->attr.mmap || event->attr.mmap_data)
2908 			atomic_dec(&nr_mmap_events);
2909 		if (event->attr.comm)
2910 			atomic_dec(&nr_comm_events);
2911 		if (event->attr.task)
2912 			atomic_dec(&nr_task_events);
2913 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2914 			put_callchain_buffers();
2915 		if (is_cgroup_event(event)) {
2916 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2917 			static_key_slow_dec_deferred(&perf_sched_events);
2918 		}
2919 
2920 		if (has_branch_stack(event)) {
2921 			static_key_slow_dec_deferred(&perf_sched_events);
2922 			/* is system-wide event */
2923 			if (!(event->attach_state & PERF_ATTACH_TASK)) {
2924 				atomic_dec(&per_cpu(perf_branch_stack_events,
2925 						    event->cpu));
2926 			}
2927 		}
2928 	}
2929 
2930 	if (event->rb) {
2931 		struct ring_buffer *rb;
2932 
2933 		/*
2934 		 * This can happen when we close an event with redirected output.
2935 		 *
2936 		 * Since we have a 0 refcount, perf_mmap_close() will skip
2937 		 * over us, possibly making our ring_buffer_put() the last one.
2938 		 */
2939 		mutex_lock(&event->mmap_mutex);
2940 		rb = event->rb;
2941 		if (rb) {
2942 			rcu_assign_pointer(event->rb, NULL);
2943 			ring_buffer_detach(event, rb);
2944 			ring_buffer_put(rb); /* could be last */
2945 		}
2946 		mutex_unlock(&event->mmap_mutex);
2947 	}
2948 
2949 	if (is_cgroup_event(event))
2950 		perf_detach_cgroup(event);
2951 
2952 	if (event->destroy)
2953 		event->destroy(event);
2954 
2955 	if (event->ctx)
2956 		put_ctx(event->ctx);
2957 
2958 	call_rcu(&event->rcu_head, free_event_rcu);
2959 }
2960 
2961 int perf_event_release_kernel(struct perf_event *event)
2962 {
2963 	struct perf_event_context *ctx = event->ctx;
2964 
2965 	WARN_ON_ONCE(ctx->parent_ctx);
2966 	/*
2967 	 * There are two ways this annotation is useful:
2968 	 *
2969 	 *  1) there is a lock recursion from perf_event_exit_task
2970 	 *     see the comment there.
2971 	 *
2972 	 *  2) there is a lock-inversion with mmap_sem through
2973 	 *     perf_event_read_group(), which takes faults while
2974 	 *     holding ctx->mutex, however this is called after
2975 	 *     the last filedesc died, so there is no possibility
2976 	 *     to trigger the AB-BA case.
2977 	 */
2978 	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2979 	perf_remove_from_context(event, true);
2980 	mutex_unlock(&ctx->mutex);
2981 
2982 	free_event(event);
2983 
2984 	return 0;
2985 }
2986 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2987 
2988 /*
2989  * Called when the last reference to the file is gone.
2990  */
2991 static void put_event(struct perf_event *event)
2992 {
2993 	struct task_struct *owner;
2994 
2995 	if (!atomic_long_dec_and_test(&event->refcount))
2996 		return;
2997 
2998 	rcu_read_lock();
2999 	owner = ACCESS_ONCE(event->owner);
3000 	/*
3001 	 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3002 	 * !owner it means the list deletion is complete and we can indeed
3003 	 * free this event; otherwise we need to serialize on
3004 	 * owner->perf_event_mutex.
3005 	 */
3006 	smp_read_barrier_depends();
3007 	if (owner) {
3008 		/*
3009 		 * Since delayed_put_task_struct() also drops the last
3010 		 * task reference we can safely take a new reference
3011 		 * while holding the rcu_read_lock().
3012 		 */
3013 		get_task_struct(owner);
3014 	}
3015 	rcu_read_unlock();
3016 
3017 	if (owner) {
3018 		mutex_lock(&owner->perf_event_mutex);
3019 		/*
3020 		 * We have to re-check the event->owner field; if it is cleared
3021 		 * we raced with perf_event_exit_task(). Acquiring the mutex
3022 		 * ensures they're done, and we can proceed with freeing the
3023 		 * event.
3024 		 */
3025 		if (event->owner)
3026 			list_del_init(&event->owner_entry);
3027 		mutex_unlock(&owner->perf_event_mutex);
3028 		put_task_struct(owner);
3029 	}
3030 
3031 	perf_event_release_kernel(event);
3032 }
3033 
3034 static int perf_release(struct inode *inode, struct file *file)
3035 {
3036 	put_event(file->private_data);
3037 	return 0;
3038 }
3039 
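/*
 * Read the current value of an event, summed over the event itself and all
 * of its children, and report the aggregated enabled/running times through
 * @enabled and @running.
 */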
3040 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3041 {
3042 	struct perf_event *child;
3043 	u64 total = 0;
3044 
3045 	*enabled = 0;
3046 	*running = 0;
3047 
3048 	mutex_lock(&event->child_mutex);
3049 	total += perf_event_read(event);
3050 	*enabled += event->total_time_enabled +
3051 			atomic64_read(&event->child_total_time_enabled);
3052 	*running += event->total_time_running +
3053 			atomic64_read(&event->child_total_time_running);
3054 
3055 	list_for_each_entry(child, &event->child_list, child_list) {
3056 		total += perf_event_read(child);
3057 		*enabled += child->total_time_enabled;
3058 		*running += child->total_time_running;
3059 	}
3060 	mutex_unlock(&event->child_mutex);
3061 
3062 	return total;
3063 }
3064 EXPORT_SYMBOL_GPL(perf_event_read_value);
3065 
3066 static int perf_event_read_group(struct perf_event *event,
3067 				   u64 read_format, char __user *buf)
3068 {
3069 	struct perf_event *leader = event->group_leader, *sub;
3070 	int n = 0, size = 0, ret = -EFAULT;
3071 	struct perf_event_context *ctx = leader->ctx;
3072 	u64 values[5];
3073 	u64 count, enabled, running;
3074 
3075 	mutex_lock(&ctx->mutex);
3076 	count = perf_event_read_value(leader, &enabled, &running);
3077 
3078 	values[n++] = 1 + leader->nr_siblings;
3079 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3080 		values[n++] = enabled;
3081 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3082 		values[n++] = running;
3083 	values[n++] = count;
3084 	if (read_format & PERF_FORMAT_ID)
3085 		values[n++] = primary_event_id(leader);
3086 
3087 	size = n * sizeof(u64);
3088 
3089 	if (copy_to_user(buf, values, size))
3090 		goto unlock;
3091 
3092 	ret = size;
3093 
3094 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3095 		n = 0;
3096 
3097 		values[n++] = perf_event_read_value(sub, &enabled, &running);
3098 		if (read_format & PERF_FORMAT_ID)
3099 			values[n++] = primary_event_id(sub);
3100 
3101 		size = n * sizeof(u64);
3102 
3103 		if (copy_to_user(buf + ret, values, size)) {
3104 			ret = -EFAULT;
3105 			goto unlock;
3106 		}
3107 
3108 		ret += size;
3109 	}
3110 unlock:
3111 	mutex_unlock(&ctx->mutex);
3112 
3113 	return ret;
3114 }
3115 
3116 static int perf_event_read_one(struct perf_event *event,
3117 				 u64 read_format, char __user *buf)
3118 {
3119 	u64 enabled, running;
3120 	u64 values[4];
3121 	int n = 0;
3122 
3123 	values[n++] = perf_event_read_value(event, &enabled, &running);
3124 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3125 		values[n++] = enabled;
3126 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3127 		values[n++] = running;
3128 	if (read_format & PERF_FORMAT_ID)
3129 		values[n++] = primary_event_id(event);
3130 
3131 	if (copy_to_user(buf, values, n * sizeof(u64)))
3132 		return -EFAULT;
3133 
3134 	return n * sizeof(u64);
3135 }
3136 
3137 /*
3138  * Read the performance event - simple non blocking version for now
3139  */
3140 static ssize_t
3141 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3142 {
3143 	u64 read_format = event->attr.read_format;
3144 	int ret;
3145 
3146 	/*
3147 	 * Return end-of-file for a read on an event that is in
3148 	 * error state (i.e. because it was pinned but it couldn't be
3149 	 * scheduled on to the CPU at some point).
3150 	 */
3151 	if (event->state == PERF_EVENT_STATE_ERROR)
3152 		return 0;
3153 
3154 	if (count < event->read_size)
3155 		return -ENOSPC;
3156 
3157 	WARN_ON_ONCE(event->ctx->parent_ctx);
3158 	if (read_format & PERF_FORMAT_GROUP)
3159 		ret = perf_event_read_group(event, read_format, buf);
3160 	else
3161 		ret = perf_event_read_one(event, read_format, buf);
3162 
3163 	return ret;
3164 }
3165 
3166 static ssize_t
3167 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3168 {
3169 	struct perf_event *event = file->private_data;
3170 
3171 	return perf_read_hw(event, buf, count);
3172 }
3173 
3174 static unsigned int perf_poll(struct file *file, poll_table *wait)
3175 {
3176 	struct perf_event *event = file->private_data;
3177 	struct ring_buffer *rb;
3178 	unsigned int events = POLL_HUP;
3179 
3180 	/*
3181 	 * Pin the event->rb by taking event->mmap_mutex; otherwise
3182 	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3183 	 */
3184 	mutex_lock(&event->mmap_mutex);
3185 	rb = event->rb;
3186 	if (rb)
3187 		events = atomic_xchg(&rb->poll, 0);
3188 	mutex_unlock(&event->mmap_mutex);
3189 
3190 	poll_wait(file, &event->waitq, wait);
3191 
3192 	return events;
3193 }
3194 
3195 static void perf_event_reset(struct perf_event *event)
3196 {
3197 	(void)perf_event_read(event);
3198 	local64_set(&event->count, 0);
3199 	perf_event_update_userpage(event);
3200 }
3201 
3202 /*
3203  * Holding the top-level event's child_mutex means that any
3204  * descendant process that has inherited this event will block
3205  * in sync_child_event if it goes to exit, thus satisfying the
3206  * task existence requirements of perf_event_enable/disable.
3207  */
3208 static void perf_event_for_each_child(struct perf_event *event,
3209 					void (*func)(struct perf_event *))
3210 {
3211 	struct perf_event *child;
3212 
3213 	WARN_ON_ONCE(event->ctx->parent_ctx);
3214 	mutex_lock(&event->child_mutex);
3215 	func(event);
3216 	list_for_each_entry(child, &event->child_list, child_list)
3217 		func(child);
3218 	mutex_unlock(&event->child_mutex);
3219 }
3220 
3221 static void perf_event_for_each(struct perf_event *event,
3222 				  void (*func)(struct perf_event *))
3223 {
3224 	struct perf_event_context *ctx = event->ctx;
3225 	struct perf_event *sibling;
3226 
3227 	WARN_ON_ONCE(ctx->parent_ctx);
3228 	mutex_lock(&ctx->mutex);
3229 	event = event->group_leader;
3230 
3231 	perf_event_for_each_child(event, func);
3232 	func(event);
3233 	list_for_each_entry(sibling, &event->sibling_list, group_entry)
3234 		perf_event_for_each_child(sibling, func);
3235 	mutex_unlock(&ctx->mutex);
3236 }
3237 
3238 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3239 {
3240 	struct perf_event_context *ctx = event->ctx;
3241 	int ret = 0;
3242 	u64 value;
3243 
3244 	if (!is_sampling_event(event))
3245 		return -EINVAL;
3246 
3247 	if (copy_from_user(&value, arg, sizeof(value)))
3248 		return -EFAULT;
3249 
3250 	if (!value)
3251 		return -EINVAL;
3252 
3253 	raw_spin_lock_irq(&ctx->lock);
3254 	if (event->attr.freq) {
3255 		if (value > sysctl_perf_event_sample_rate) {
3256 			ret = -EINVAL;
3257 			goto unlock;
3258 		}
3259 
3260 		event->attr.sample_freq = value;
3261 	} else {
3262 		event->attr.sample_period = value;
3263 		event->hw.sample_period = value;
3264 	}
3265 unlock:
3266 	raw_spin_unlock_irq(&ctx->lock);
3267 
3268 	return ret;
3269 }
3270 
3271 static const struct file_operations perf_fops;
3272 
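/*
 * Resolve @fd to a struct file and verify that it is a perf event file;
 * the caller drops the reference with fput_light() using the returned
 * *fput_needed value.
 */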
3273 static struct file *perf_fget_light(int fd, int *fput_needed)
3274 {
3275 	struct file *file;
3276 
3277 	file = fget_light(fd, fput_needed);
3278 	if (!file)
3279 		return ERR_PTR(-EBADF);
3280 
3281 	if (file->f_op != &perf_fops) {
3282 		fput_light(file, *fput_needed);
3283 		*fput_needed = 0;
3284 		return ERR_PTR(-EBADF);
3285 	}
3286 
3287 	return file;
3288 }
3289 
3290 static int perf_event_set_output(struct perf_event *event,
3291 				 struct perf_event *output_event);
3292 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3293 
3294 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3295 {
3296 	struct perf_event *event = file->private_data;
3297 	void (*func)(struct perf_event *);
3298 	u32 flags = arg;
3299 
3300 	switch (cmd) {
3301 	case PERF_EVENT_IOC_ENABLE:
3302 		func = perf_event_enable;
3303 		break;
3304 	case PERF_EVENT_IOC_DISABLE:
3305 		func = perf_event_disable;
3306 		break;
3307 	case PERF_EVENT_IOC_RESET:
3308 		func = perf_event_reset;
3309 		break;
3310 
3311 	case PERF_EVENT_IOC_REFRESH:
3312 		return perf_event_refresh(event, arg);
3313 
3314 	case PERF_EVENT_IOC_PERIOD:
3315 		return perf_event_period(event, (u64 __user *)arg);
3316 
3317 	case PERF_EVENT_IOC_SET_OUTPUT:
3318 	{
3319 		struct file *output_file = NULL;
3320 		struct perf_event *output_event = NULL;
3321 		int fput_needed = 0;
3322 		int ret;
3323 
3324 		if (arg != -1) {
3325 			output_file = perf_fget_light(arg, &fput_needed);
3326 			if (IS_ERR(output_file))
3327 				return PTR_ERR(output_file);
3328 			output_event = output_file->private_data;
3329 		}
3330 
3331 		ret = perf_event_set_output(event, output_event);
3332 		if (output_event)
3333 			fput_light(output_file, fput_needed);
3334 
3335 		return ret;
3336 	}
3337 
3338 	case PERF_EVENT_IOC_SET_FILTER:
3339 		return perf_event_set_filter(event, (void __user *)arg);
3340 
3341 	default:
3342 		return -ENOTTY;
3343 	}
3344 
3345 	if (flags & PERF_IOC_FLAG_GROUP)
3346 		perf_event_for_each(event, func);
3347 	else
3348 		perf_event_for_each_child(event, func);
3349 
3350 	return 0;
3351 }
3352 
3353 int perf_event_task_enable(void)
3354 {
3355 	struct perf_event *event;
3356 
3357 	mutex_lock(&current->perf_event_mutex);
3358 	list_for_each_entry(event, &current->perf_event_list, owner_entry)
3359 		perf_event_for_each_child(event, perf_event_enable);
3360 	mutex_unlock(&current->perf_event_mutex);
3361 
3362 	return 0;
3363 }
3364 
3365 int perf_event_task_disable(void)
3366 {
3367 	struct perf_event *event;
3368 
3369 	mutex_lock(&current->perf_event_mutex);
3370 	list_for_each_entry(event, &current->perf_event_list, owner_entry)
3371 		perf_event_for_each_child(event, perf_event_disable);
3372 	mutex_unlock(&current->perf_event_mutex);
3373 
3374 	return 0;
3375 }
3376 
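/*
 * Index reported to userspace through the mmap()ed control page for direct
 * counter access; 0 means no usable index (the event is stopped or not
 * currently active).
 */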
3377 static int perf_event_index(struct perf_event *event)
3378 {
3379 	if (event->hw.state & PERF_HES_STOPPED)
3380 		return 0;
3381 
3382 	if (event->state != PERF_EVENT_STATE_ACTIVE)
3383 		return 0;
3384 
3385 	return event->pmu->event_idx(event);
3386 }
3387 
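/*
 * Compute the current enabled/running times from the event's shadow
 * context time, without taking ctx->lock, so that this stays usable from
 * NMI context (see perf_event_update_userpage() below).
 */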
3388 static void calc_timer_values(struct perf_event *event,
3389 				u64 *now,
3390 				u64 *enabled,
3391 				u64 *running)
3392 {
3393 	u64 ctx_time;
3394 
3395 	*now = perf_clock();
3396 	ctx_time = event->shadow_ctx_time + *now;
3397 	*enabled = ctx_time - event->tstamp_enabled;
3398 	*running = ctx_time - event->tstamp_running;
3399 }
3400 
3401 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3402 {
3403 }
3404 
3405 /*
3406  * Callers need to ensure there can be no nesting of this function, otherwise
3407  * the seqlock logic goes bad. We cannot serialize this because the arch
3408  * code calls this from NMI context.
3409  */
3410 void perf_event_update_userpage(struct perf_event *event)
3411 {
3412 	struct perf_event_mmap_page *userpg;
3413 	struct ring_buffer *rb;
3414 	u64 enabled, running, now;
3415 
3416 	rcu_read_lock();
3417 	/*
3418 	 * Compute total_time_enabled and total_time_running
3419 	 * based on snapshot values taken when the event
3420 	 * was last scheduled in.
3421 	 *
3422 	 * We cannot simply call update_context_time()
3423 	 * because of locking issues, as we can be called in
3424 	 * NMI context.
3425 	 */
3426 	calc_timer_values(event, &now, &enabled, &running);
3427 	rb = rcu_dereference(event->rb);
3428 	if (!rb)
3429 		goto unlock;
3430 
3431 	userpg = rb->user_page;
3432 
3433 	/*
3434 	 * Disable preemption so as to not let the corresponding user-space
3435 	 * spin too long if we get preempted.
3436 	 */
3437 	preempt_disable();
3438 	++userpg->lock;
3439 	barrier();
3440 	userpg->index = perf_event_index(event);
3441 	userpg->offset = perf_event_count(event);
3442 	if (userpg->index)
3443 		userpg->offset -= local64_read(&event->hw.prev_count);
3444 
3445 	userpg->time_enabled = enabled +
3446 			atomic64_read(&event->child_total_time_enabled);
3447 
3448 	userpg->time_running = running +
3449 			atomic64_read(&event->child_total_time_running);
3450 
3451 	arch_perf_update_userpage(userpg, now);
3452 
3453 	barrier();
3454 	++userpg->lock;
3455 	preempt_enable();
3456 unlock:
3457 	rcu_read_unlock();
3458 }
3459 
3460 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3461 {
3462 	struct perf_event *event = vma->vm_file->private_data;
3463 	struct ring_buffer *rb;
3464 	int ret = VM_FAULT_SIGBUS;
3465 
3466 	if (vmf->flags & FAULT_FLAG_MKWRITE) {
3467 		if (vmf->pgoff == 0)
3468 			ret = 0;
3469 		return ret;
3470 	}
3471 
3472 	rcu_read_lock();
3473 	rb = rcu_dereference(event->rb);
3474 	if (!rb)
3475 		goto unlock;
3476 
3477 	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3478 		goto unlock;
3479 
3480 	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3481 	if (!vmf->page)
3482 		goto unlock;
3483 
3484 	get_page(vmf->page);
3485 	vmf->page->mapping = vma->vm_file->f_mapping;
3486 	vmf->page->index   = vmf->pgoff;
3487 
3488 	ret = 0;
3489 unlock:
3490 	rcu_read_unlock();
3491 
3492 	return ret;
3493 }
3494 
3495 static void ring_buffer_attach(struct perf_event *event,
3496 			       struct ring_buffer *rb)
3497 {
3498 	unsigned long flags;
3499 
3500 	if (!list_empty(&event->rb_entry))
3501 		return;
3502 
3503 	spin_lock_irqsave(&rb->event_lock, flags);
3504 	if (list_empty(&event->rb_entry))
3505 		list_add(&event->rb_entry, &rb->event_list);
3506 	spin_unlock_irqrestore(&rb->event_lock, flags);
3507 }
3508 
3509 static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
3510 {
3511 	unsigned long flags;
3512 
3513 	if (list_empty(&event->rb_entry))
3514 		return;
3515 
3516 	spin_lock_irqsave(&rb->event_lock, flags);
3517 	list_del_init(&event->rb_entry);
3518 	wake_up_all(&event->waitq);
3519 	spin_unlock_irqrestore(&rb->event_lock, flags);
3520 }
3521 
3522 static void ring_buffer_wakeup(struct perf_event *event)
3523 {
3524 	struct ring_buffer *rb;
3525 
3526 	rcu_read_lock();
3527 	rb = rcu_dereference(event->rb);
3528 	if (rb) {
3529 		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3530 			wake_up_all(&event->waitq);
3531 	}
3532 	rcu_read_unlock();
3533 }
3534 
3535 static void rb_free_rcu(struct rcu_head *rcu_head)
3536 {
3537 	struct ring_buffer *rb;
3538 
3539 	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3540 	rb_free(rb);
3541 }
3542 
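/*
 * Grab a reference on the event's ring buffer; returns NULL if the buffer
 * is already on its way to being freed (its refcount dropped to zero).
 */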
3543 static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3544 {
3545 	struct ring_buffer *rb;
3546 
3547 	rcu_read_lock();
3548 	rb = rcu_dereference(event->rb);
3549 	if (rb) {
3550 		if (!atomic_inc_not_zero(&rb->refcount))
3551 			rb = NULL;
3552 	}
3553 	rcu_read_unlock();
3554 
3555 	return rb;
3556 }
3557 
3558 static void ring_buffer_put(struct ring_buffer *rb)
3559 {
3560 	if (!atomic_dec_and_test(&rb->refcount))
3561 		return;
3562 
3563 	WARN_ON_ONCE(!list_empty(&rb->event_list));
3564 
3565 	call_rcu(&rb->rcu_head, rb_free_rcu);
3566 }
3567 
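/*
 * Another mapping of the buffer is coming into existence; bump the mmap
 * counts on both the event and the ring buffer so perf_mmap_close() can
 * tell when the last mapping goes away.
 */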
3568 static void perf_mmap_open(struct vm_area_struct *vma)
3569 {
3570 	struct perf_event *event = vma->vm_file->private_data;
3571 
3572 	atomic_inc(&event->mmap_count);
3573 	atomic_inc(&event->rb->mmap_count);
3574 }
3575 
3576 /*
3577  * A buffer can be mmap()ed multiple times; either directly through the same
3578  * event, or through other events by use of perf_event_set_output().
3579  *
3580  * In order to undo the VM accounting done by perf_mmap() we need to destroy
3581  * the buffer here, where we still have a VM context. This means we need
3582  * to detach all events redirecting to us.
3583  */
perf_mmap_close(struct vm_area_struct * vma)3584 static void perf_mmap_close(struct vm_area_struct *vma)
3585 {
3586 	struct perf_event *event = vma->vm_file->private_data;
3587 
3588 	struct ring_buffer *rb = event->rb;
3589 	struct user_struct *mmap_user = rb->mmap_user;
3590 	int mmap_locked = rb->mmap_locked;
3591 	unsigned long size = perf_data_size(rb);
3592 
3593 	atomic_dec(&rb->mmap_count);
3594 
3595 	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3596 		return;
3597 
3598 	/* Detach current event from the buffer. */
3599 	rcu_assign_pointer(event->rb, NULL);
3600 	ring_buffer_detach(event, rb);
3601 	mutex_unlock(&event->mmap_mutex);
3602 
3603 	/* If there's still other mmap()s of this buffer, we're done. */
3604 	if (atomic_read(&rb->mmap_count)) {
3605 		ring_buffer_put(rb); /* can't be last */
3606 		return;
3607 	}
3608 
3609 	/*
3610 	 * No other mmap()s, detach from all other events that might redirect
3611 	 * into the now unreachable buffer. Somewhat complicated by the
3612 	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3613 	 */
3614 again:
3615 	rcu_read_lock();
3616 	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3617 		if (!atomic_long_inc_not_zero(&event->refcount)) {
3618 			/*
3619 			 * This event is en-route to free_event() which will
3620 			 * detach it and remove it from the list.
3621 			 */
3622 			continue;
3623 		}
3624 		rcu_read_unlock();
3625 
3626 		mutex_lock(&event->mmap_mutex);
3627 		/*
3628 		 * Check we didn't race with perf_event_set_output() which can
3629 		 * swizzle the rb from under us while we were waiting to
3630 		 * acquire mmap_mutex.
3631 		 *
3632 		 * If we find a different rb, ignore this event; the next
3633 		 * iteration will no longer find it on the list. We still
3634 		 * have to restart the iteration to make sure we're not now
3635 		 * iterating the wrong list.
3636 		 */
3637 		if (event->rb == rb) {
3638 			rcu_assign_pointer(event->rb, NULL);
3639 			ring_buffer_detach(event, rb);
3640 			ring_buffer_put(rb); /* can't be last, we still have one */
3641 		}
3642 		mutex_unlock(&event->mmap_mutex);
3643 		put_event(event);
3644 
3645 		/*
3646 		 * Restart the iteration; either we're on the wrong list or
3647 		 * we destroyed its integrity by doing a deletion.
3648 		 */
3649 		goto again;
3650 	}
3651 	rcu_read_unlock();
3652 
3653 	/*
3654 	 * It could be that there are still a few 0-ref events on the list; they'll
3655 	 * get cleaned up by free_event() -- they'll also still have their
3656 	 * ref on the rb and will free it whenever they are done with it.
3657 	 *
3658 	 * Aside from that, this buffer is 'fully' detached and unmapped,
3659 	 * undo the VM accounting.
3660 	 */
3661 
3662 	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
3663 	vma->vm_mm->pinned_vm -= mmap_locked;
3664 	free_uid(mmap_user);
3665 
3666 	ring_buffer_put(rb); /* could be last */
3667 }
3668 
3669 static const struct vm_operations_struct perf_mmap_vmops = {
3670 	.open		= perf_mmap_open,
3671 	.close		= perf_mmap_close,
3672 	.fault		= perf_mmap_fault,
3673 	.page_mkwrite	= perf_mmap_fault,
3674 };
3675 
perf_mmap(struct file * file,struct vm_area_struct * vma)3676 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3677 {
3678 	struct perf_event *event = file->private_data;
3679 	unsigned long user_locked, user_lock_limit;
3680 	struct user_struct *user = current_user();
3681 	unsigned long locked, lock_limit;
3682 	struct ring_buffer *rb;
3683 	unsigned long vma_size;
3684 	unsigned long nr_pages;
3685 	long user_extra, extra;
3686 	int ret = 0, flags = 0;
3687 
3688 	/*
3689 	 * Don't allow mmap() of inherited per-task counters. This would
3690 	 * create a performance issue due to all children writing to the
3691 	 * same rb.
3692 	 */
3693 	if (event->cpu == -1 && event->attr.inherit)
3694 		return -EINVAL;
3695 
3696 	if (!(vma->vm_flags & VM_SHARED))
3697 		return -EINVAL;
3698 
3699 	vma_size = vma->vm_end - vma->vm_start;
3700 	nr_pages = (vma_size / PAGE_SIZE) - 1;
3701 
3702 	/*
3703 	 * If we have rb pages, ensure their number is a power of two so we
3704 	 * can use bitmasks instead of modulo.
3705 	 */
3706 	if (nr_pages != 0 && !is_power_of_2(nr_pages))
3707 		return -EINVAL;
3708 
3709 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
3710 		return -EINVAL;
3711 
3712 	if (vma->vm_pgoff != 0)
3713 		return -EINVAL;
3714 
3715 	WARN_ON_ONCE(event->ctx->parent_ctx);
3716 again:
3717 	mutex_lock(&event->mmap_mutex);
3718 	if (event->rb) {
3719 		if (event->rb->nr_pages != nr_pages) {
3720 			ret = -EINVAL;
3721 			goto unlock;
3722 		}
3723 
3724 		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
3725 			/*
3726 			 * Raced against perf_mmap_close() through
3727 			 * perf_event_set_output(). Try again, hope for better
3728 			 * luck.
3729 			 */
3730 			mutex_unlock(&event->mmap_mutex);
3731 			goto again;
3732 		}
3733 
3734 		goto unlock;
3735 	}
3736 
3737 	user_extra = nr_pages + 1;
3738 	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3739 
3740 	/*
3741 	 * Increase the limit linearly with more CPUs:
3742 	 */
3743 	user_lock_limit *= num_online_cpus();
3744 
3745 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3746 
3747 	extra = 0;
3748 	if (user_locked > user_lock_limit)
3749 		extra = user_locked - user_lock_limit;
3750 
3751 	lock_limit = rlimit(RLIMIT_MEMLOCK);
3752 	lock_limit >>= PAGE_SHIFT;
3753 	locked = vma->vm_mm->pinned_vm + extra;
3754 
3755 	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3756 		!capable(CAP_IPC_LOCK)) {
3757 		ret = -EPERM;
3758 		goto unlock;
3759 	}
3760 
3761 	WARN_ON(event->rb);
3762 
3763 	if (vma->vm_flags & VM_WRITE)
3764 		flags |= RING_BUFFER_WRITABLE;
3765 
3766 	rb = rb_alloc(nr_pages,
3767 		event->attr.watermark ? event->attr.wakeup_watermark : 0,
3768 		event->cpu, flags);
3769 
3770 	if (!rb) {
3771 		ret = -ENOMEM;
3772 		goto unlock;
3773 	}
3774 
3775 	atomic_set(&rb->mmap_count, 1);
3776 	rb->mmap_locked = extra;
3777 	rb->mmap_user = get_current_user();
3778 
3779 	atomic_long_add(user_extra, &user->locked_vm);
3780 	vma->vm_mm->pinned_vm += extra;
3781 
3782 	ring_buffer_attach(event, rb);
3783 	rcu_assign_pointer(event->rb, rb);
3784 
3785 	perf_event_update_userpage(event);
3786 
3787 unlock:
3788 	if (!ret)
3789 		atomic_inc(&event->mmap_count);
3790 	mutex_unlock(&event->mmap_mutex);
3791 
3792 	/*
3793 	 * Since pinned accounting is per-vm, we cannot allow fork() to copy
3794 	 * our vma.
3795 	 */
3796 	vma->vm_flags |= VM_DONTCOPY | VM_RESERVED;
3797 	vma->vm_ops = &perf_mmap_vmops;
3798 
3799 	return ret;
3800 }
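
/*
 * Illustrative (userspace) sketch of the mapping rules enforced above:
 * the mapping must be MAP_SHARED, start at offset 0, and cover one
 * metadata page plus a power-of-two number of data pages.  Roughly
 * (error handling elided, fd assumed to come from perf_event_open()):
 *
 *	size_t pages = 1 + (1 << 4);	// 1 control page + 16 data pages
 *	void *base = mmap(NULL, pages * sysconf(_SC_PAGESIZE),
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct perf_event_mmap_page *meta = base;
 *
 * Anything else (private mapping, non-power-of-two data pages, nonzero
 * offset) is rejected with -EINVAL by the checks above.
 */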
3801 
perf_fasync(int fd,struct file * filp,int on)3802 static int perf_fasync(int fd, struct file *filp, int on)
3803 {
3804 	struct inode *inode = filp->f_path.dentry->d_inode;
3805 	struct perf_event *event = filp->private_data;
3806 	int retval;
3807 
3808 	mutex_lock(&inode->i_mutex);
3809 	retval = fasync_helper(fd, filp, on, &event->fasync);
3810 	mutex_unlock(&inode->i_mutex);
3811 
3812 	if (retval < 0)
3813 		return retval;
3814 
3815 	return 0;
3816 }
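
/*
 * Userspace reaches this through the usual fasync interface; a hedged
 * sketch of how a consumer would request SIGIO delivery on wakeups
 * (again assuming fd comes from perf_event_open()):
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
 *
 * After that, perf_event_wakeup() below ends up in kill_fasync() and
 * the task receives SIGIO, with the band set to POLL_IN for data or
 * POLL_HUP once the event limit is exhausted (see
 * __perf_event_overflow()).
 */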
3817 
3818 static const struct file_operations perf_fops = {
3819 	.llseek			= no_llseek,
3820 	.release		= perf_release,
3821 	.read			= perf_read,
3822 	.poll			= perf_poll,
3823 	.unlocked_ioctl		= perf_ioctl,
3824 	.compat_ioctl		= perf_ioctl,
3825 	.mmap			= perf_mmap,
3826 	.fasync			= perf_fasync,
3827 };
3828 
3829 /*
3830  * Perf event wakeup
3831  *
3832  * If there's data, ensure we set the poll() state and publish everything
3833  * to user-space before waking everybody up.
3834  */
3835 
perf_event_wakeup(struct perf_event * event)3836 void perf_event_wakeup(struct perf_event *event)
3837 {
3838 	ring_buffer_wakeup(event);
3839 
3840 	if (event->pending_kill) {
3841 		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3842 		event->pending_kill = 0;
3843 	}
3844 }
3845 
perf_pending_event(struct irq_work * entry)3846 static void perf_pending_event(struct irq_work *entry)
3847 {
3848 	struct perf_event *event = container_of(entry,
3849 			struct perf_event, pending);
3850 
3851 	if (event->pending_disable) {
3852 		event->pending_disable = 0;
3853 		__perf_event_disable(event);
3854 	}
3855 
3856 	if (event->pending_wakeup) {
3857 		event->pending_wakeup = 0;
3858 		perf_event_wakeup(event);
3859 	}
3860 }
3861 
3862 /*
3863  * For now we assume KVM is the only user of these callbacks.
3864  * Later on, this could be changed to a list if another
3865  * virtualization implementation also needs the callbacks.
3866  */
3867 struct perf_guest_info_callbacks *perf_guest_cbs;
3868 
perf_register_guest_info_callbacks(struct perf_guest_info_callbacks * cbs)3869 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3870 {
3871 	perf_guest_cbs = cbs;
3872 	return 0;
3873 }
3874 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3875 
perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks * cbs)3876 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3877 {
3878 	perf_guest_cbs = NULL;
3879 	return 0;
3880 }
3881 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3882 
__perf_event_header__init_id(struct perf_event_header * header,struct perf_sample_data * data,struct perf_event * event)3883 static void __perf_event_header__init_id(struct perf_event_header *header,
3884 					 struct perf_sample_data *data,
3885 					 struct perf_event *event)
3886 {
3887 	u64 sample_type = event->attr.sample_type;
3888 
3889 	data->type = sample_type;
3890 	header->size += event->id_header_size;
3891 
3892 	if (sample_type & PERF_SAMPLE_TID) {
3893 		/* namespace issues */
3894 		data->tid_entry.pid = perf_event_pid(event, current);
3895 		data->tid_entry.tid = perf_event_tid(event, current);
3896 	}
3897 
3898 	if (sample_type & PERF_SAMPLE_TIME)
3899 		data->time = perf_clock();
3900 
3901 	if (sample_type & PERF_SAMPLE_ID)
3902 		data->id = primary_event_id(event);
3903 
3904 	if (sample_type & PERF_SAMPLE_STREAM_ID)
3905 		data->stream_id = event->id;
3906 
3907 	if (sample_type & PERF_SAMPLE_CPU) {
3908 		data->cpu_entry.cpu	 = raw_smp_processor_id();
3909 		data->cpu_entry.reserved = 0;
3910 	}
3911 }
3912 
perf_event_header__init_id(struct perf_event_header * header,struct perf_sample_data * data,struct perf_event * event)3913 void perf_event_header__init_id(struct perf_event_header *header,
3914 				struct perf_sample_data *data,
3915 				struct perf_event *event)
3916 {
3917 	if (event->attr.sample_id_all)
3918 		__perf_event_header__init_id(header, data, event);
3919 }
3920 
__perf_event__output_id_sample(struct perf_output_handle * handle,struct perf_sample_data * data)3921 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3922 					   struct perf_sample_data *data)
3923 {
3924 	u64 sample_type = data->type;
3925 
3926 	if (sample_type & PERF_SAMPLE_TID)
3927 		perf_output_put(handle, data->tid_entry);
3928 
3929 	if (sample_type & PERF_SAMPLE_TIME)
3930 		perf_output_put(handle, data->time);
3931 
3932 	if (sample_type & PERF_SAMPLE_ID)
3933 		perf_output_put(handle, data->id);
3934 
3935 	if (sample_type & PERF_SAMPLE_STREAM_ID)
3936 		perf_output_put(handle, data->stream_id);
3937 
3938 	if (sample_type & PERF_SAMPLE_CPU)
3939 		perf_output_put(handle, data->cpu_entry);
3940 }
3941 
perf_event__output_id_sample(struct perf_event * event,struct perf_output_handle * handle,struct perf_sample_data * sample)3942 void perf_event__output_id_sample(struct perf_event *event,
3943 				  struct perf_output_handle *handle,
3944 				  struct perf_sample_data *sample)
3945 {
3946 	if (event->attr.sample_id_all)
3947 		__perf_event__output_id_sample(handle, sample);
3948 }
3949 
perf_output_read_one(struct perf_output_handle * handle,struct perf_event * event,u64 enabled,u64 running)3950 static void perf_output_read_one(struct perf_output_handle *handle,
3951 				 struct perf_event *event,
3952 				 u64 enabled, u64 running)
3953 {
3954 	u64 read_format = event->attr.read_format;
3955 	u64 values[4];
3956 	int n = 0;
3957 
3958 	values[n++] = perf_event_count(event);
3959 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3960 		values[n++] = enabled +
3961 			atomic64_read(&event->child_total_time_enabled);
3962 	}
3963 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3964 		values[n++] = running +
3965 			atomic64_read(&event->child_total_time_running);
3966 	}
3967 	if (read_format & PERF_FORMAT_ID)
3968 		values[n++] = primary_event_id(event);
3969 
3970 	__output_copy(handle, values, n * sizeof(u64));
3971 }
3972 
3973 /*
3974  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3975  */
perf_output_read_group(struct perf_output_handle * handle,struct perf_event * event,u64 enabled,u64 running)3976 static void perf_output_read_group(struct perf_output_handle *handle,
3977 			    struct perf_event *event,
3978 			    u64 enabled, u64 running)
3979 {
3980 	struct perf_event *leader = event->group_leader, *sub;
3981 	u64 read_format = event->attr.read_format;
3982 	u64 values[5];
3983 	int n = 0;
3984 
3985 	values[n++] = 1 + leader->nr_siblings;
3986 
3987 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3988 		values[n++] = enabled;
3989 
3990 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3991 		values[n++] = running;
3992 
3993 	if (leader != event)
3994 		leader->pmu->read(leader);
3995 
3996 	values[n++] = perf_event_count(leader);
3997 	if (read_format & PERF_FORMAT_ID)
3998 		values[n++] = primary_event_id(leader);
3999 
4000 	__output_copy(handle, values, n * sizeof(u64));
4001 
4002 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4003 		n = 0;
4004 
4005 		if (sub != event)
4006 			sub->pmu->read(sub);
4007 
4008 		values[n++] = perf_event_count(sub);
4009 		if (read_format & PERF_FORMAT_ID)
4010 			values[n++] = primary_event_id(sub);
4011 
4012 		__output_copy(handle, values, n * sizeof(u64));
4013 	}
4014 }
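
/*
 * For reference, a sketch of the layout the two helpers above emit;
 * field order follows the read_format bits (this is an illustration,
 * not a uapi definition):
 *
 *	struct read_format {
 *		u64 nr;				// PERF_FORMAT_GROUP only
 *		u64 time_enabled;		// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		u64 time_running;		// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		struct { u64 value, id; } cnt[];// id only with PERF_FORMAT_ID
 *	};
 *
 * perf_output_read_one() emits a single { value, time_enabled,
 * time_running, id } tuple instead.
 */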
4015 
4016 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4017 				 PERF_FORMAT_TOTAL_TIME_RUNNING)
4018 
perf_output_read(struct perf_output_handle * handle,struct perf_event * event)4019 static void perf_output_read(struct perf_output_handle *handle,
4020 			     struct perf_event *event)
4021 {
4022 	u64 enabled = 0, running = 0, now;
4023 	u64 read_format = event->attr.read_format;
4024 
4025 	/*
4026 	 * compute total_time_enabled, total_time_running
4027 	 * based on snapshot values taken when the event
4028 	 * was last scheduled in.
4029 	 *
4030 	 * We cannot simply call update_context_time(),
4031 	 * because of locking issues: we may be called from
4032 	 * NMI context.
4033 	 */
4034 	if (read_format & PERF_FORMAT_TOTAL_TIMES)
4035 		calc_timer_values(event, &now, &enabled, &running);
4036 
4037 	if (event->attr.read_format & PERF_FORMAT_GROUP)
4038 		perf_output_read_group(handle, event, enabled, running);
4039 	else
4040 		perf_output_read_one(handle, event, enabled, running);
4041 }
4042 
perf_output_sample(struct perf_output_handle * handle,struct perf_event_header * header,struct perf_sample_data * data,struct perf_event * event)4043 void perf_output_sample(struct perf_output_handle *handle,
4044 			struct perf_event_header *header,
4045 			struct perf_sample_data *data,
4046 			struct perf_event *event)
4047 {
4048 	u64 sample_type = data->type;
4049 
4050 	perf_output_put(handle, *header);
4051 
4052 	if (sample_type & PERF_SAMPLE_IP)
4053 		perf_output_put(handle, data->ip);
4054 
4055 	if (sample_type & PERF_SAMPLE_TID)
4056 		perf_output_put(handle, data->tid_entry);
4057 
4058 	if (sample_type & PERF_SAMPLE_TIME)
4059 		perf_output_put(handle, data->time);
4060 
4061 	if (sample_type & PERF_SAMPLE_ADDR)
4062 		perf_output_put(handle, data->addr);
4063 
4064 	if (sample_type & PERF_SAMPLE_ID)
4065 		perf_output_put(handle, data->id);
4066 
4067 	if (sample_type & PERF_SAMPLE_STREAM_ID)
4068 		perf_output_put(handle, data->stream_id);
4069 
4070 	if (sample_type & PERF_SAMPLE_CPU)
4071 		perf_output_put(handle, data->cpu_entry);
4072 
4073 	if (sample_type & PERF_SAMPLE_PERIOD)
4074 		perf_output_put(handle, data->period);
4075 
4076 	if (sample_type & PERF_SAMPLE_READ)
4077 		perf_output_read(handle, event);
4078 
4079 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4080 		if (data->callchain) {
4081 			int size = 1 + data->callchain->nr;
4085 
4086 			size *= sizeof(u64);
4087 
4088 			__output_copy(handle, data->callchain, size);
4089 		} else {
4090 			u64 nr = 0;
4091 			perf_output_put(handle, nr);
4092 		}
4093 	}
4094 
4095 	if (sample_type & PERF_SAMPLE_RAW) {
4096 		if (data->raw) {
4097 			perf_output_put(handle, data->raw->size);
4098 			__output_copy(handle, data->raw->data,
4099 					   data->raw->size);
4100 		} else {
4101 			struct {
4102 				u32	size;
4103 				u32	data;
4104 			} raw = {
4105 				.size = sizeof(u32),
4106 				.data = 0,
4107 			};
4108 			perf_output_put(handle, raw);
4109 		}
4110 	}
4111 
4112 	if (!event->attr.watermark) {
4113 		int wakeup_events = event->attr.wakeup_events;
4114 
4115 		if (wakeup_events) {
4116 			struct ring_buffer *rb = handle->rb;
4117 			int events = local_inc_return(&rb->events);
4118 
4119 			if (events >= wakeup_events) {
4120 				local_sub(wakeup_events, &rb->events);
4121 				local_inc(&rb->wakeup);
4122 			}
4123 		}
4124 	}
4125 
4126 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4127 		if (data->br_stack) {
4128 			size_t size;
4129 
4130 			size = data->br_stack->nr
4131 			     * sizeof(struct perf_branch_entry);
4132 
4133 			perf_output_put(handle, data->br_stack->nr);
4134 			perf_output_copy(handle, data->br_stack->entries, size);
4135 		} else {
4136 			/*
4137 			 * we always store at least the value of nr
4138 			 */
4139 			u64 nr = 0;
4140 			perf_output_put(handle, nr);
4141 		}
4142 	}
4143 }
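
/*
 * Putting the above together: a PERF_RECORD_SAMPLE body is simply the
 * selected sample_type fields concatenated in the order emitted above.
 * An illustrative (non-uapi) parser-side view for
 * sample_type = IP | TID | TIME | PERIOD:
 *
 *	struct sample {
 *		struct perf_event_header header;
 *		u64 ip;			// PERF_SAMPLE_IP
 *		u32 pid, tid;		// PERF_SAMPLE_TID
 *		u64 time;		// PERF_SAMPLE_TIME
 *		u64 period;		// PERF_SAMPLE_PERIOD
 *	};
 *
 * Variable-size parts (callchain, raw data, branch stack) are always
 * preceded by their element count or byte size, as emitted above.
 */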
4144 
perf_prepare_sample(struct perf_event_header * header,struct perf_sample_data * data,struct perf_event * event,struct pt_regs * regs)4145 void perf_prepare_sample(struct perf_event_header *header,
4146 			 struct perf_sample_data *data,
4147 			 struct perf_event *event,
4148 			 struct pt_regs *regs)
4149 {
4150 	u64 sample_type = event->attr.sample_type;
4151 
4152 	header->type = PERF_RECORD_SAMPLE;
4153 	header->size = sizeof(*header) + event->header_size;
4154 
4155 	header->misc = 0;
4156 	header->misc |= perf_misc_flags(regs);
4157 
4158 	__perf_event_header__init_id(header, data, event);
4159 
4160 	if (sample_type & PERF_SAMPLE_IP)
4161 		data->ip = perf_instruction_pointer(regs);
4162 
4163 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4164 		int size = 1;
4165 
4166 		data->callchain = perf_callchain(regs);
4167 
4168 		if (data->callchain)
4169 			size += data->callchain->nr;
4170 
4171 		header->size += size * sizeof(u64);
4172 	}
4173 
4174 	if (sample_type & PERF_SAMPLE_RAW) {
4175 		int size = sizeof(u32);
4176 
4177 		if (data->raw)
4178 			size += data->raw->size;
4179 		else
4180 			size += sizeof(u32);
4181 
4182 		WARN_ON_ONCE(size & (sizeof(u64)-1));
4183 		header->size += size;
4184 	}
4185 
4186 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4187 		int size = sizeof(u64); /* nr */
4188 		if (data->br_stack) {
4189 			size += data->br_stack->nr
4190 			      * sizeof(struct perf_branch_entry);
4191 		}
4192 		header->size += size;
4193 	}
4194 }
4195 
perf_event_output(struct perf_event * event,struct perf_sample_data * data,struct pt_regs * regs)4196 static void perf_event_output(struct perf_event *event,
4197 				struct perf_sample_data *data,
4198 				struct pt_regs *regs)
4199 {
4200 	struct perf_output_handle handle;
4201 	struct perf_event_header header;
4202 
4203 	/* protect the callchain buffers */
4204 	rcu_read_lock();
4205 
4206 	perf_prepare_sample(&header, data, event, regs);
4207 
4208 	if (perf_output_begin(&handle, event, header.size))
4209 		goto exit;
4210 
4211 	perf_output_sample(&handle, &header, data, event);
4212 
4213 	perf_output_end(&handle);
4214 
4215 exit:
4216 	rcu_read_unlock();
4217 }
4218 
4219 /*
4220  * read event_id
4221  */
4222 
4223 struct perf_read_event {
4224 	struct perf_event_header	header;
4225 
4226 	u32				pid;
4227 	u32				tid;
4228 };
4229 
4230 static void
perf_event_read_event(struct perf_event * event,struct task_struct * task)4231 perf_event_read_event(struct perf_event *event,
4232 			struct task_struct *task)
4233 {
4234 	struct perf_output_handle handle;
4235 	struct perf_sample_data sample;
4236 	struct perf_read_event read_event = {
4237 		.header = {
4238 			.type = PERF_RECORD_READ,
4239 			.misc = 0,
4240 			.size = sizeof(read_event) + event->read_size,
4241 		},
4242 		.pid = perf_event_pid(event, task),
4243 		.tid = perf_event_tid(event, task),
4244 	};
4245 	int ret;
4246 
4247 	perf_event_header__init_id(&read_event.header, &sample, event);
4248 	ret = perf_output_begin(&handle, event, read_event.header.size);
4249 	if (ret)
4250 		return;
4251 
4252 	perf_output_put(&handle, read_event);
4253 	perf_output_read(&handle, event);
4254 	perf_event__output_id_sample(event, &handle, &sample);
4255 
4256 	perf_output_end(&handle);
4257 }
4258 
4259 /*
4260  * task tracking -- fork/exit
4261  *
4262  * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
4263  */
4264 
4265 struct perf_task_event {
4266 	struct task_struct		*task;
4267 	struct perf_event_context	*task_ctx;
4268 
4269 	struct {
4270 		struct perf_event_header	header;
4271 
4272 		u32				pid;
4273 		u32				ppid;
4274 		u32				tid;
4275 		u32				ptid;
4276 		u64				time;
4277 	} event_id;
4278 };
4279 
perf_event_task_output(struct perf_event * event,struct perf_task_event * task_event)4280 static void perf_event_task_output(struct perf_event *event,
4281 				     struct perf_task_event *task_event)
4282 {
4283 	struct perf_output_handle handle;
4284 	struct perf_sample_data	sample;
4285 	struct task_struct *task = task_event->task;
4286 	int ret, size = task_event->event_id.header.size;
4287 
4288 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4289 
4290 	ret = perf_output_begin(&handle, event,
4291 				task_event->event_id.header.size);
4292 	if (ret)
4293 		goto out;
4294 
4295 	task_event->event_id.pid = perf_event_pid(event, task);
4296 	task_event->event_id.ppid = perf_event_pid(event, current);
4297 
4298 	task_event->event_id.tid = perf_event_tid(event, task);
4299 	task_event->event_id.ptid = perf_event_tid(event, current);
4300 
4301 	perf_output_put(&handle, task_event->event_id);
4302 
4303 	perf_event__output_id_sample(event, &handle, &sample);
4304 
4305 	perf_output_end(&handle);
4306 out:
4307 	task_event->event_id.header.size = size;
4308 }
4309 
perf_event_task_match(struct perf_event * event)4310 static int perf_event_task_match(struct perf_event *event)
4311 {
4312 	if (event->state < PERF_EVENT_STATE_INACTIVE)
4313 		return 0;
4314 
4315 	if (!event_filter_match(event))
4316 		return 0;
4317 
4318 	if (event->attr.comm || event->attr.mmap ||
4319 	    event->attr.mmap_data || event->attr.task)
4320 		return 1;
4321 
4322 	return 0;
4323 }
4324 
perf_event_task_ctx(struct perf_event_context * ctx,struct perf_task_event * task_event)4325 static void perf_event_task_ctx(struct perf_event_context *ctx,
4326 				  struct perf_task_event *task_event)
4327 {
4328 	struct perf_event *event;
4329 
4330 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4331 		if (perf_event_task_match(event))
4332 			perf_event_task_output(event, task_event);
4333 	}
4334 }
4335 
perf_event_task_event(struct perf_task_event * task_event)4336 static void perf_event_task_event(struct perf_task_event *task_event)
4337 {
4338 	struct perf_cpu_context *cpuctx;
4339 	struct perf_event_context *ctx;
4340 	struct pmu *pmu;
4341 	int ctxn;
4342 
4343 	rcu_read_lock();
4344 	list_for_each_entry_rcu(pmu, &pmus, entry) {
4345 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4346 		if (cpuctx->unique_pmu != pmu)
4347 			goto next;
4348 		perf_event_task_ctx(&cpuctx->ctx, task_event);
4349 
4350 		ctx = task_event->task_ctx;
4351 		if (!ctx) {
4352 			ctxn = pmu->task_ctx_nr;
4353 			if (ctxn < 0)
4354 				goto next;
4355 			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4356 		}
4357 		if (ctx)
4358 			perf_event_task_ctx(ctx, task_event);
4359 next:
4360 		put_cpu_ptr(pmu->pmu_cpu_context);
4361 	}
4362 	rcu_read_unlock();
4363 }
4364 
perf_event_task(struct task_struct * task,struct perf_event_context * task_ctx,int new)4365 static void perf_event_task(struct task_struct *task,
4366 			      struct perf_event_context *task_ctx,
4367 			      int new)
4368 {
4369 	struct perf_task_event task_event;
4370 
4371 	if (!atomic_read(&nr_comm_events) &&
4372 	    !atomic_read(&nr_mmap_events) &&
4373 	    !atomic_read(&nr_task_events))
4374 		return;
4375 
4376 	task_event = (struct perf_task_event){
4377 		.task	  = task,
4378 		.task_ctx = task_ctx,
4379 		.event_id    = {
4380 			.header = {
4381 				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4382 				.misc = 0,
4383 				.size = sizeof(task_event.event_id),
4384 			},
4385 			/* .pid  */
4386 			/* .ppid */
4387 			/* .tid  */
4388 			/* .ptid */
4389 			.time = perf_clock(),
4390 		},
4391 	};
4392 
4393 	perf_event_task_event(&task_event);
4394 }
4395 
perf_event_fork(struct task_struct * task)4396 void perf_event_fork(struct task_struct *task)
4397 {
4398 	perf_event_task(task, NULL, 1);
4399 }
4400 
4401 /*
4402  * comm tracking
4403  */
4404 
4405 struct perf_comm_event {
4406 	struct task_struct	*task;
4407 	char			*comm;
4408 	int			comm_size;
4409 
4410 	struct {
4411 		struct perf_event_header	header;
4412 
4413 		u32				pid;
4414 		u32				tid;
4415 	} event_id;
4416 };
4417 
perf_event_comm_output(struct perf_event * event,struct perf_comm_event * comm_event)4418 static void perf_event_comm_output(struct perf_event *event,
4419 				     struct perf_comm_event *comm_event)
4420 {
4421 	struct perf_output_handle handle;
4422 	struct perf_sample_data sample;
4423 	int size = comm_event->event_id.header.size;
4424 	int ret;
4425 
4426 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4427 	ret = perf_output_begin(&handle, event,
4428 				comm_event->event_id.header.size);
4429 
4430 	if (ret)
4431 		goto out;
4432 
4433 	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4434 	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4435 
4436 	perf_output_put(&handle, comm_event->event_id);
4437 	__output_copy(&handle, comm_event->comm,
4438 				   comm_event->comm_size);
4439 
4440 	perf_event__output_id_sample(event, &handle, &sample);
4441 
4442 	perf_output_end(&handle);
4443 out:
4444 	comm_event->event_id.header.size = size;
4445 }
4446 
perf_event_comm_match(struct perf_event * event)4447 static int perf_event_comm_match(struct perf_event *event)
4448 {
4449 	if (event->state < PERF_EVENT_STATE_INACTIVE)
4450 		return 0;
4451 
4452 	if (!event_filter_match(event))
4453 		return 0;
4454 
4455 	if (event->attr.comm)
4456 		return 1;
4457 
4458 	return 0;
4459 }
4460 
perf_event_comm_ctx(struct perf_event_context * ctx,struct perf_comm_event * comm_event)4461 static void perf_event_comm_ctx(struct perf_event_context *ctx,
4462 				  struct perf_comm_event *comm_event)
4463 {
4464 	struct perf_event *event;
4465 
4466 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4467 		if (perf_event_comm_match(event))
4468 			perf_event_comm_output(event, comm_event);
4469 	}
4470 }
4471 
perf_event_comm_event(struct perf_comm_event * comm_event)4472 static void perf_event_comm_event(struct perf_comm_event *comm_event)
4473 {
4474 	struct perf_cpu_context *cpuctx;
4475 	struct perf_event_context *ctx;
4476 	char comm[TASK_COMM_LEN];
4477 	unsigned int size;
4478 	struct pmu *pmu;
4479 	int ctxn;
4480 
4481 	memset(comm, 0, sizeof(comm));
4482 	strlcpy(comm, comm_event->task->comm, sizeof(comm));
4483 	size = ALIGN(strlen(comm)+1, sizeof(u64));
4484 
4485 	comm_event->comm = comm;
4486 	comm_event->comm_size = size;
4487 
4488 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4489 	rcu_read_lock();
4490 	list_for_each_entry_rcu(pmu, &pmus, entry) {
4491 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4492 		if (cpuctx->unique_pmu != pmu)
4493 			goto next;
4494 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4495 
4496 		ctxn = pmu->task_ctx_nr;
4497 		if (ctxn < 0)
4498 			goto next;
4499 
4500 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4501 		if (ctx)
4502 			perf_event_comm_ctx(ctx, comm_event);
4503 next:
4504 		put_cpu_ptr(pmu->pmu_cpu_context);
4505 	}
4506 	rcu_read_unlock();
4507 }
4508 
perf_event_comm(struct task_struct * task)4509 void perf_event_comm(struct task_struct *task)
4510 {
4511 	struct perf_comm_event comm_event;
4512 	struct perf_event_context *ctx;
4513 	int ctxn;
4514 
4515 	for_each_task_context_nr(ctxn) {
4516 		ctx = task->perf_event_ctxp[ctxn];
4517 		if (!ctx)
4518 			continue;
4519 
4520 		perf_event_enable_on_exec(ctx);
4521 	}
4522 
4523 	if (!atomic_read(&nr_comm_events))
4524 		return;
4525 
4526 	comm_event = (struct perf_comm_event){
4527 		.task	= task,
4528 		/* .comm      */
4529 		/* .comm_size */
4530 		.event_id  = {
4531 			.header = {
4532 				.type = PERF_RECORD_COMM,
4533 				.misc = 0,
4534 				/* .size */
4535 			},
4536 			/* .pid */
4537 			/* .tid */
4538 		},
4539 	};
4540 
4541 	perf_event_comm_event(&comm_event);
4542 }
4543 
4544 /*
4545  * mmap tracking
4546  */
4547 
4548 struct perf_mmap_event {
4549 	struct vm_area_struct	*vma;
4550 
4551 	const char		*file_name;
4552 	int			file_size;
4553 
4554 	struct {
4555 		struct perf_event_header	header;
4556 
4557 		u32				pid;
4558 		u32				tid;
4559 		u64				start;
4560 		u64				len;
4561 		u64				pgoff;
4562 	} event_id;
4563 };
4564 
perf_event_mmap_output(struct perf_event * event,struct perf_mmap_event * mmap_event)4565 static void perf_event_mmap_output(struct perf_event *event,
4566 				     struct perf_mmap_event *mmap_event)
4567 {
4568 	struct perf_output_handle handle;
4569 	struct perf_sample_data sample;
4570 	int size = mmap_event->event_id.header.size;
4571 	int ret;
4572 
4573 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4574 	ret = perf_output_begin(&handle, event,
4575 				mmap_event->event_id.header.size);
4576 	if (ret)
4577 		goto out;
4578 
4579 	mmap_event->event_id.pid = perf_event_pid(event, current);
4580 	mmap_event->event_id.tid = perf_event_tid(event, current);
4581 
4582 	perf_output_put(&handle, mmap_event->event_id);
4583 	__output_copy(&handle, mmap_event->file_name,
4584 				   mmap_event->file_size);
4585 
4586 	perf_event__output_id_sample(event, &handle, &sample);
4587 
4588 	perf_output_end(&handle);
4589 out:
4590 	mmap_event->event_id.header.size = size;
4591 }
4592 
perf_event_mmap_match(struct perf_event * event,struct perf_mmap_event * mmap_event,int executable)4593 static int perf_event_mmap_match(struct perf_event *event,
4594 				   struct perf_mmap_event *mmap_event,
4595 				   int executable)
4596 {
4597 	if (event->state < PERF_EVENT_STATE_INACTIVE)
4598 		return 0;
4599 
4600 	if (!event_filter_match(event))
4601 		return 0;
4602 
4603 	if ((!executable && event->attr.mmap_data) ||
4604 	    (executable && event->attr.mmap))
4605 		return 1;
4606 
4607 	return 0;
4608 }
4609 
perf_event_mmap_ctx(struct perf_event_context * ctx,struct perf_mmap_event * mmap_event,int executable)4610 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4611 				  struct perf_mmap_event *mmap_event,
4612 				  int executable)
4613 {
4614 	struct perf_event *event;
4615 
4616 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4617 		if (perf_event_mmap_match(event, mmap_event, executable))
4618 			perf_event_mmap_output(event, mmap_event);
4619 	}
4620 }
4621 
perf_event_mmap_event(struct perf_mmap_event * mmap_event)4622 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4623 {
4624 	struct perf_cpu_context *cpuctx;
4625 	struct perf_event_context *ctx;
4626 	struct vm_area_struct *vma = mmap_event->vma;
4627 	struct file *file = vma->vm_file;
4628 	unsigned int size;
4629 	char tmp[16];
4630 	char *buf = NULL;
4631 	const char *name;
4632 	struct pmu *pmu;
4633 	int ctxn;
4634 
4635 	memset(tmp, 0, sizeof(tmp));
4636 
4637 	if (file) {
4638 		/*
4639 		 * d_path() works from the end of the buffer backwards, so we
4640 		 * need to add enough zero bytes after the string to handle
4641 		 * the 64bit alignment we do later.
4642 		 */
4643 		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4644 		if (!buf) {
4645 			name = strncpy(tmp, "//enomem", sizeof(tmp));
4646 			goto got_name;
4647 		}
4648 		name = d_path(&file->f_path, buf, PATH_MAX);
4649 		if (IS_ERR(name)) {
4650 			name = strncpy(tmp, "//toolong", sizeof(tmp));
4651 			goto got_name;
4652 		}
4653 	} else {
4654 		if (arch_vma_name(mmap_event->vma)) {
4655 			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4656 				       sizeof(tmp));
4657 			goto got_name;
4658 		}
4659 
4660 		if (!vma->vm_mm) {
4661 			name = strncpy(tmp, "[vdso]", sizeof(tmp));
4662 			goto got_name;
4663 		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
4664 				vma->vm_end >= vma->vm_mm->brk) {
4665 			name = strncpy(tmp, "[heap]", sizeof(tmp));
4666 			goto got_name;
4667 		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
4668 				vma->vm_end >= vma->vm_mm->start_stack) {
4669 			name = strncpy(tmp, "[stack]", sizeof(tmp));
4670 			goto got_name;
4671 		}
4672 
4673 		name = strncpy(tmp, "//anon", sizeof(tmp));
4674 		goto got_name;
4675 	}
4676 
4677 got_name:
4678 	size = ALIGN(strlen(name)+1, sizeof(u64));
4679 
4680 	mmap_event->file_name = name;
4681 	mmap_event->file_size = size;
4682 
4683 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4684 
4685 	rcu_read_lock();
4686 	list_for_each_entry_rcu(pmu, &pmus, entry) {
4687 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4688 		if (cpuctx->unique_pmu != pmu)
4689 			goto next;
4690 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4691 					vma->vm_flags & VM_EXEC);
4692 
4693 		ctxn = pmu->task_ctx_nr;
4694 		if (ctxn < 0)
4695 			goto next;
4696 
4697 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4698 		if (ctx) {
4699 			perf_event_mmap_ctx(ctx, mmap_event,
4700 					vma->vm_flags & VM_EXEC);
4701 		}
4702 next:
4703 		put_cpu_ptr(pmu->pmu_cpu_context);
4704 	}
4705 	rcu_read_unlock();
4706 
4707 	kfree(buf);
4708 }
4709 
perf_event_mmap(struct vm_area_struct * vma)4710 void perf_event_mmap(struct vm_area_struct *vma)
4711 {
4712 	struct perf_mmap_event mmap_event;
4713 
4714 	if (!atomic_read(&nr_mmap_events))
4715 		return;
4716 
4717 	mmap_event = (struct perf_mmap_event){
4718 		.vma	= vma,
4719 		/* .file_name */
4720 		/* .file_size */
4721 		.event_id  = {
4722 			.header = {
4723 				.type = PERF_RECORD_MMAP,
4724 				.misc = PERF_RECORD_MISC_USER,
4725 				/* .size */
4726 			},
4727 			/* .pid */
4728 			/* .tid */
4729 			.start  = vma->vm_start,
4730 			.len    = vma->vm_end - vma->vm_start,
4731 			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
4732 		},
4733 	};
4734 
4735 	perf_event_mmap_event(&mmap_event);
4736 }
4737 
4738 /*
4739  * IRQ throttle logging
4740  */
4741 
perf_log_throttle(struct perf_event * event,int enable)4742 static void perf_log_throttle(struct perf_event *event, int enable)
4743 {
4744 	struct perf_output_handle handle;
4745 	struct perf_sample_data sample;
4746 	int ret;
4747 
4748 	struct {
4749 		struct perf_event_header	header;
4750 		u64				time;
4751 		u64				id;
4752 		u64				stream_id;
4753 	} throttle_event = {
4754 		.header = {
4755 			.type = PERF_RECORD_THROTTLE,
4756 			.misc = 0,
4757 			.size = sizeof(throttle_event),
4758 		},
4759 		.time		= perf_clock(),
4760 		.id		= primary_event_id(event),
4761 		.stream_id	= event->id,
4762 	};
4763 
4764 	if (enable)
4765 		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4766 
4767 	perf_event_header__init_id(&throttle_event.header, &sample, event);
4768 
4769 	ret = perf_output_begin(&handle, event,
4770 				throttle_event.header.size);
4771 	if (ret)
4772 		return;
4773 
4774 	perf_output_put(&handle, throttle_event);
4775 	perf_event__output_id_sample(event, &handle, &sample);
4776 	perf_output_end(&handle);
4777 }
4778 
4779 /*
4780  * Generic event overflow handling, sampling.
4781  */
4782 
__perf_event_overflow(struct perf_event * event,int throttle,struct perf_sample_data * data,struct pt_regs * regs)4783 static int __perf_event_overflow(struct perf_event *event,
4784 				   int throttle, struct perf_sample_data *data,
4785 				   struct pt_regs *regs)
4786 {
4787 	int events = atomic_read(&event->event_limit);
4788 	struct hw_perf_event *hwc = &event->hw;
4789 	u64 seq;
4790 	int ret = 0;
4791 
4792 	/*
4793 	 * Non-sampling counters might still use the PMI to fold short
4794 	 * hardware counters; ignore those.
4795 	 */
4796 	if (unlikely(!is_sampling_event(event)))
4797 		return 0;
4798 
4799 	seq = __this_cpu_read(perf_throttled_seq);
4800 	if (seq != hwc->interrupts_seq) {
4801 		hwc->interrupts_seq = seq;
4802 		hwc->interrupts = 1;
4803 	} else {
4804 		hwc->interrupts++;
4805 		if (unlikely(throttle
4806 			     && hwc->interrupts >= max_samples_per_tick)) {
4807 			__this_cpu_inc(perf_throttled_count);
4808 			hwc->interrupts = MAX_INTERRUPTS;
4809 			perf_log_throttle(event, 0);
4810 			ret = 1;
4811 		}
4812 	}
4813 
4814 	if (event->attr.freq) {
4815 		u64 now = perf_clock();
4816 		s64 delta = now - hwc->freq_time_stamp;
4817 
4818 		hwc->freq_time_stamp = now;
4819 
4820 		if (delta > 0 && delta < 2*TICK_NSEC)
4821 			perf_adjust_period(event, delta, hwc->last_period, true);
4822 	}
4823 
4824 	/*
4825 	 * XXX event_limit might not quite work as expected on inherited
4826 	 * events
4827 	 */
4828 
4829 	event->pending_kill = POLL_IN;
4830 	if (events && atomic_dec_and_test(&event->event_limit)) {
4831 		ret = 1;
4832 		event->pending_kill = POLL_HUP;
4833 		event->pending_disable = 1;
4834 		irq_work_queue(&event->pending);
4835 	}
4836 
4837 	if (event->overflow_handler)
4838 		event->overflow_handler(event, data, regs);
4839 	else
4840 		perf_event_output(event, data, regs);
4841 
4842 	if (event->fasync && event->pending_kill) {
4843 		event->pending_wakeup = 1;
4844 		irq_work_queue(&event->pending);
4845 	}
4846 
4847 	return ret;
4848 }
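
/*
 * The event_limit counted down above is armed from userspace; a hedged
 * usage sketch (fd from perf_event_open(), SIGIO wired up via fcntl()
 * as described near perf_fasync() above):
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);
 *
 * After one more overflow the event disables itself and the wakeup is
 * delivered with POLL_HUP, which self-profiling code can use as a
 * one-shot notification.
 */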
4849 
perf_event_overflow(struct perf_event * event,struct perf_sample_data * data,struct pt_regs * regs)4850 int perf_event_overflow(struct perf_event *event,
4851 			  struct perf_sample_data *data,
4852 			  struct pt_regs *regs)
4853 {
4854 	return __perf_event_overflow(event, 1, data, regs);
4855 }
4856 
4857 /*
4858  * Generic software event infrastructure
4859  */
4860 
4861 struct swevent_htable {
4862 	struct swevent_hlist		*swevent_hlist;
4863 	struct mutex			hlist_mutex;
4864 	int				hlist_refcount;
4865 
4866 	/* Recursion avoidance in each context */
4867 	int				recursion[PERF_NR_CONTEXTS];
4868 
4869 	/* Keeps track of cpu being initialized/exited */
4870 	bool				online;
4871 };
4872 
4873 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4874 
4875 /*
4876  * We directly increment event->count and keep a second value in
4877  * event->hw.period_left to count intervals. This period counter
4878  * is kept in the range [-sample_period, 0] so that we can use the
4879  * sign as a trigger.
4880  */
4881 
perf_swevent_set_period(struct perf_event * event)4882 static u64 perf_swevent_set_period(struct perf_event *event)
4883 {
4884 	struct hw_perf_event *hwc = &event->hw;
4885 	u64 period = hwc->last_period;
4886 	u64 nr, offset;
4887 	s64 old, val;
4888 
4889 	hwc->last_period = hwc->sample_period;
4890 
4891 again:
4892 	old = val = local64_read(&hwc->period_left);
4893 	if (val < 0)
4894 		return 0;
4895 
4896 	nr = div64_u64(period + val, period);
4897 	offset = nr * period;
4898 	val -= offset;
4899 	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4900 		goto again;
4901 
4902 	return nr;
4903 }
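
/*
 * A worked example of the arithmetic above: with sample_period = 100
 * and period_left = 30 (i.e. 30 events past the last overflow point),
 * nr = (100 + 30) / 100 = 1 overflow is reported and period_left is
 * rewound to 30 - 100 = -70, so the next overflow triggers after
 * another 70 events.
 */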
4904 
perf_swevent_overflow(struct perf_event * event,u64 overflow,struct perf_sample_data * data,struct pt_regs * regs)4905 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4906 				    struct perf_sample_data *data,
4907 				    struct pt_regs *regs)
4908 {
4909 	struct hw_perf_event *hwc = &event->hw;
4910 	int throttle = 0;
4911 
4912 	if (!overflow)
4913 		overflow = perf_swevent_set_period(event);
4914 
4915 	if (hwc->interrupts == MAX_INTERRUPTS)
4916 		return;
4917 
4918 	for (; overflow; overflow--) {
4919 		if (__perf_event_overflow(event, throttle,
4920 					    data, regs)) {
4921 			/*
4922 			 * We inhibit the overflow from happening when
4923 			 * hwc->interrupts == MAX_INTERRUPTS.
4924 			 */
4925 			break;
4926 		}
4927 		throttle = 1;
4928 	}
4929 }
4930 
perf_swevent_event(struct perf_event * event,u64 nr,struct perf_sample_data * data,struct pt_regs * regs)4931 static void perf_swevent_event(struct perf_event *event, u64 nr,
4932 			       struct perf_sample_data *data,
4933 			       struct pt_regs *regs)
4934 {
4935 	struct hw_perf_event *hwc = &event->hw;
4936 
4937 	local64_add(nr, &event->count);
4938 
4939 	if (!regs)
4940 		return;
4941 
4942 	if (!is_sampling_event(event))
4943 		return;
4944 
4945 	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4946 		data->period = nr;
4947 		return perf_swevent_overflow(event, 1, data, regs);
4948 	} else
4949 		data->period = event->hw.last_period;
4950 
4951 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4952 		return perf_swevent_overflow(event, 1, data, regs);
4953 
4954 	if (local64_add_negative(nr, &hwc->period_left))
4955 		return;
4956 
4957 	perf_swevent_overflow(event, 0, data, regs);
4958 }
4959 
perf_exclude_event(struct perf_event * event,struct pt_regs * regs)4960 static int perf_exclude_event(struct perf_event *event,
4961 			      struct pt_regs *regs)
4962 {
4963 	if (event->hw.state & PERF_HES_STOPPED)
4964 		return 1;
4965 
4966 	if (regs) {
4967 		if (event->attr.exclude_user && user_mode(regs))
4968 			return 1;
4969 
4970 		if (event->attr.exclude_kernel && !user_mode(regs))
4971 			return 1;
4972 	}
4973 
4974 	return 0;
4975 }
4976 
perf_swevent_match(struct perf_event * event,enum perf_type_id type,u32 event_id,struct perf_sample_data * data,struct pt_regs * regs)4977 static int perf_swevent_match(struct perf_event *event,
4978 				enum perf_type_id type,
4979 				u32 event_id,
4980 				struct perf_sample_data *data,
4981 				struct pt_regs *regs)
4982 {
4983 	if (event->attr.type != type)
4984 		return 0;
4985 
4986 	if (event->attr.config != event_id)
4987 		return 0;
4988 
4989 	if (perf_exclude_event(event, regs))
4990 		return 0;
4991 
4992 	return 1;
4993 }
4994 
swevent_hash(u64 type,u32 event_id)4995 static inline u64 swevent_hash(u64 type, u32 event_id)
4996 {
4997 	u64 val = event_id | (type << 32);
4998 
4999 	return hash_64(val, SWEVENT_HLIST_BITS);
5000 }
5001 
5002 static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist * hlist,u64 type,u32 event_id)5003 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
5004 {
5005 	u64 hash = swevent_hash(type, event_id);
5006 
5007 	return &hlist->heads[hash];
5008 }
5009 
5010 /* For the read side: events when they trigger */
5011 static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable * swhash,u64 type,u32 event_id)5012 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
5013 {
5014 	struct swevent_hlist *hlist;
5015 
5016 	hlist = rcu_dereference(swhash->swevent_hlist);
5017 	if (!hlist)
5018 		return NULL;
5019 
5020 	return __find_swevent_head(hlist, type, event_id);
5021 }
5022 
5023 /* For the event head insertion and removal in the hlist */
5024 static inline struct hlist_head *
find_swevent_head(struct swevent_htable * swhash,struct perf_event * event)5025 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5026 {
5027 	struct swevent_hlist *hlist;
5028 	u32 event_id = event->attr.config;
5029 	u64 type = event->attr.type;
5030 
5031 	/*
5032 	 * Event scheduling is always serialized against hlist allocation
5033 	 * and release, which makes the protected version suitable here.
5034 	 * The context lock guarantees that.
5035 	 */
5036 	hlist = rcu_dereference_protected(swhash->swevent_hlist,
5037 					  lockdep_is_held(&event->ctx->lock));
5038 	if (!hlist)
5039 		return NULL;
5040 
5041 	return __find_swevent_head(hlist, type, event_id);
5042 }
5043 
do_perf_sw_event(enum perf_type_id type,u32 event_id,u64 nr,struct perf_sample_data * data,struct pt_regs * regs)5044 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5045 				    u64 nr,
5046 				    struct perf_sample_data *data,
5047 				    struct pt_regs *regs)
5048 {
5049 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5050 	struct perf_event *event;
5051 	struct hlist_node *node;
5052 	struct hlist_head *head;
5053 
5054 	rcu_read_lock();
5055 	head = find_swevent_head_rcu(swhash, type, event_id);
5056 	if (!head)
5057 		goto end;
5058 
5059 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5060 		if (perf_swevent_match(event, type, event_id, data, regs))
5061 			perf_swevent_event(event, nr, data, regs);
5062 	}
5063 end:
5064 	rcu_read_unlock();
5065 }
5066 
perf_swevent_get_recursion_context(void)5067 int perf_swevent_get_recursion_context(void)
5068 {
5069 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5070 
5071 	return get_recursion_context(swhash->recursion);
5072 }
5073 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5074 
perf_swevent_put_recursion_context(int rctx)5075 inline void perf_swevent_put_recursion_context(int rctx)
5076 {
5077 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5078 
5079 	put_recursion_context(swhash->recursion, rctx);
5080 }
5081 
__perf_sw_event(u32 event_id,u64 nr,struct pt_regs * regs,u64 addr)5082 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5083 {
5084 	struct perf_sample_data data;
5085 	int rctx;
5086 
5087 	preempt_disable_notrace();
5088 	rctx = perf_swevent_get_recursion_context();
5089 	if (rctx < 0)
5090 		goto fail;
5091 
5092 	perf_sample_data_init(&data, addr);
5093 
5094 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5095 
5096 	perf_swevent_put_recursion_context(rctx);
fail:
5097 	preempt_enable_notrace();
5098 }
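
/*
 * Typical in-kernel usage goes through the perf_sw_event() wrapper
 * around this function; callers count a software occurrence against
 * whatever events are listening, e.g. the page fault path does
 * (roughly):
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 *
 * which ends up here with the recursion context held and preemption
 * disabled for the duration of the hash-list walk.
 */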
5099 
perf_swevent_read(struct perf_event * event)5100 static void perf_swevent_read(struct perf_event *event)
5101 {
5102 }
5103 
perf_swevent_add(struct perf_event * event,int flags)5104 static int perf_swevent_add(struct perf_event *event, int flags)
5105 {
5106 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5107 	struct hw_perf_event *hwc = &event->hw;
5108 	struct hlist_head *head;
5109 
5110 	if (is_sampling_event(event)) {
5111 		hwc->last_period = hwc->sample_period;
5112 		perf_swevent_set_period(event);
5113 	}
5114 
5115 	hwc->state = !(flags & PERF_EF_START);
5116 
5117 	head = find_swevent_head(swhash, event);
5118 	if (!head) {
5119 		/*
5120 		 * We can race with cpu hotplug code. Do not
5121 		 * WARN if the cpu just got unplugged.
5122 		 */
5123 		WARN_ON_ONCE(swhash->online);
5124 		return -EINVAL;
5125 	}
5126 
5127 	hlist_add_head_rcu(&event->hlist_entry, head);
5128 
5129 	return 0;
5130 }
5131 
perf_swevent_del(struct perf_event * event,int flags)5132 static void perf_swevent_del(struct perf_event *event, int flags)
5133 {
5134 	hlist_del_rcu(&event->hlist_entry);
5135 }
5136 
perf_swevent_start(struct perf_event * event,int flags)5137 static void perf_swevent_start(struct perf_event *event, int flags)
5138 {
5139 	event->hw.state = 0;
5140 }
5141 
perf_swevent_stop(struct perf_event * event,int flags)5142 static void perf_swevent_stop(struct perf_event *event, int flags)
5143 {
5144 	event->hw.state = PERF_HES_STOPPED;
5145 }
5146 
5147 /* Deref the hlist from the update side */
5148 static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable * swhash)5149 swevent_hlist_deref(struct swevent_htable *swhash)
5150 {
5151 	return rcu_dereference_protected(swhash->swevent_hlist,
5152 					 lockdep_is_held(&swhash->hlist_mutex));
5153 }
5154 
swevent_hlist_release(struct swevent_htable * swhash)5155 static void swevent_hlist_release(struct swevent_htable *swhash)
5156 {
5157 	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5158 
5159 	if (!hlist)
5160 		return;
5161 
5162 	rcu_assign_pointer(swhash->swevent_hlist, NULL);
5163 	kfree_rcu(hlist, rcu_head);
5164 }
5165 
swevent_hlist_put_cpu(struct perf_event * event,int cpu)5166 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5167 {
5168 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5169 
5170 	mutex_lock(&swhash->hlist_mutex);
5171 
5172 	if (!--swhash->hlist_refcount)
5173 		swevent_hlist_release(swhash);
5174 
5175 	mutex_unlock(&swhash->hlist_mutex);
5176 }
5177 
swevent_hlist_put(struct perf_event * event)5178 static void swevent_hlist_put(struct perf_event *event)
5179 {
5180 	int cpu;
5181 
5182 	if (event->cpu != -1) {
5183 		swevent_hlist_put_cpu(event, event->cpu);
5184 		return;
5185 	}
5186 
5187 	for_each_possible_cpu(cpu)
5188 		swevent_hlist_put_cpu(event, cpu);
5189 }
5190 
swevent_hlist_get_cpu(struct perf_event * event,int cpu)5191 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5192 {
5193 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5194 	int err = 0;
5195 
5196 	mutex_lock(&swhash->hlist_mutex);
5197 
5198 	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5199 		struct swevent_hlist *hlist;
5200 
5201 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5202 		if (!hlist) {
5203 			err = -ENOMEM;
5204 			goto exit;
5205 		}
5206 		rcu_assign_pointer(swhash->swevent_hlist, hlist);
5207 	}
5208 	swhash->hlist_refcount++;
5209 exit:
5210 	mutex_unlock(&swhash->hlist_mutex);
5211 
5212 	return err;
5213 }
5214 
swevent_hlist_get(struct perf_event * event)5215 static int swevent_hlist_get(struct perf_event *event)
5216 {
5217 	int err;
5218 	int cpu, failed_cpu;
5219 
5220 	if (event->cpu != -1)
5221 		return swevent_hlist_get_cpu(event, event->cpu);
5222 
5223 	get_online_cpus();
5224 	for_each_possible_cpu(cpu) {
5225 		err = swevent_hlist_get_cpu(event, cpu);
5226 		if (err) {
5227 			failed_cpu = cpu;
5228 			goto fail;
5229 		}
5230 	}
5231 	put_online_cpus();
5232 
5233 	return 0;
5234 fail:
5235 	for_each_possible_cpu(cpu) {
5236 		if (cpu == failed_cpu)
5237 			break;
5238 		swevent_hlist_put_cpu(event, cpu);
5239 	}
5240 
5241 	put_online_cpus();
5242 	return err;
5243 }
5244 
5245 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5246 
sw_perf_event_destroy(struct perf_event * event)5247 static void sw_perf_event_destroy(struct perf_event *event)
5248 {
5249 	u64 event_id = event->attr.config;
5250 
5251 	WARN_ON(event->parent);
5252 
5253 	static_key_slow_dec(&perf_swevent_enabled[event_id]);
5254 	swevent_hlist_put(event);
5255 }
5256 
perf_swevent_init(struct perf_event * event)5257 static int perf_swevent_init(struct perf_event *event)
5258 {
5259 	u64 event_id = event->attr.config;
5260 
5261 	if (event->attr.type != PERF_TYPE_SOFTWARE)
5262 		return -ENOENT;
5263 
5264 	/*
5265 	 * no branch sampling for software events
5266 	 */
5267 	if (has_branch_stack(event))
5268 		return -EOPNOTSUPP;
5269 
5270 	switch (event_id) {
5271 	case PERF_COUNT_SW_CPU_CLOCK:
5272 	case PERF_COUNT_SW_TASK_CLOCK:
5273 		return -ENOENT;
5274 
5275 	default:
5276 		break;
5277 	}
5278 
5279 	if (event_id >= PERF_COUNT_SW_MAX)
5280 		return -ENOENT;
5281 
5282 	if (!event->parent) {
5283 		int err;
5284 
5285 		err = swevent_hlist_get(event);
5286 		if (err)
5287 			return err;
5288 
5289 		static_key_slow_inc(&perf_swevent_enabled[event_id]);
5290 		event->destroy = sw_perf_event_destroy;
5291 	}
5292 
5293 	return 0;
5294 }
5295 
perf_swevent_event_idx(struct perf_event * event)5296 static int perf_swevent_event_idx(struct perf_event *event)
5297 {
5298 	return 0;
5299 }
5300 
5301 static struct pmu perf_swevent = {
5302 	.task_ctx_nr	= perf_sw_context,
5303 
5304 	.event_init	= perf_swevent_init,
5305 	.add		= perf_swevent_add,
5306 	.del		= perf_swevent_del,
5307 	.start		= perf_swevent_start,
5308 	.stop		= perf_swevent_stop,
5309 	.read		= perf_swevent_read,
5310 
5311 	.event_idx	= perf_swevent_event_idx,
5312 };
5313 
5314 #ifdef CONFIG_EVENT_TRACING
5315 
perf_tp_filter_match(struct perf_event * event,struct perf_sample_data * data)5316 static int perf_tp_filter_match(struct perf_event *event,
5317 				struct perf_sample_data *data)
5318 {
5319 	void *record = data->raw->data;
5320 
5321 	if (likely(!event->filter) || filter_match_preds(event->filter, record))
5322 		return 1;
5323 	return 0;
5324 }
5325 
perf_tp_event_match(struct perf_event * event,struct perf_sample_data * data,struct pt_regs * regs)5326 static int perf_tp_event_match(struct perf_event *event,
5327 				struct perf_sample_data *data,
5328 				struct pt_regs *regs)
5329 {
5330 	if (event->hw.state & PERF_HES_STOPPED)
5331 		return 0;
5332 	/*
5333 	 * All tracepoints are from kernel-space.
5334 	 */
5335 	if (event->attr.exclude_kernel)
5336 		return 0;
5337 
5338 	if (!perf_tp_filter_match(event, data))
5339 		return 0;
5340 
5341 	return 1;
5342 }
5343 
perf_tp_event(u64 addr,u64 count,void * record,int entry_size,struct pt_regs * regs,struct hlist_head * head,int rctx)5344 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5345 		   struct pt_regs *regs, struct hlist_head *head, int rctx)
5346 {
5347 	struct perf_sample_data data;
5348 	struct perf_event *event;
5349 	struct hlist_node *node;
5350 
5351 	struct perf_raw_record raw = {
5352 		.size = entry_size,
5353 		.data = record,
5354 	};
5355 
5356 	perf_sample_data_init(&data, addr);
5357 	data.raw = &raw;
5358 
5359 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5360 		if (perf_tp_event_match(event, &data, regs))
5361 			perf_swevent_event(event, count, &data, regs);
5362 	}
5363 
5364 	perf_swevent_put_recursion_context(rctx);
5365 }
5366 EXPORT_SYMBOL_GPL(perf_tp_event);
5367 
tp_perf_event_destroy(struct perf_event * event)5368 static void tp_perf_event_destroy(struct perf_event *event)
5369 {
5370 	perf_trace_destroy(event);
5371 }
5372 
perf_tp_event_init(struct perf_event * event)5373 static int perf_tp_event_init(struct perf_event *event)
5374 {
5375 	int err;
5376 
5377 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
5378 		return -ENOENT;
5379 
5380 	/*
5381 	 * no branch sampling for tracepoint events
5382 	 */
5383 	if (has_branch_stack(event))
5384 		return -EOPNOTSUPP;
5385 
5386 	err = perf_trace_init(event);
5387 	if (err)
5388 		return err;
5389 
5390 	event->destroy = tp_perf_event_destroy;
5391 
5392 	return 0;
5393 }
5394 
5395 static struct pmu perf_tracepoint = {
5396 	.task_ctx_nr	= perf_sw_context,
5397 
5398 	.event_init	= perf_tp_event_init,
5399 	.add		= perf_trace_add,
5400 	.del		= perf_trace_del,
5401 	.start		= perf_swevent_start,
5402 	.stop		= perf_swevent_stop,
5403 	.read		= perf_swevent_read,
5404 
5405 	.event_idx	= perf_swevent_event_idx,
5406 };
5407 
perf_tp_register(void)5408 static inline void perf_tp_register(void)
5409 {
5410 	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5411 }
5412 
perf_event_set_filter(struct perf_event * event,void __user * arg)5413 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5414 {
5415 	char *filter_str;
5416 	int ret;
5417 
5418 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
5419 		return -EINVAL;
5420 
5421 	filter_str = strndup_user(arg, PAGE_SIZE);
5422 	if (IS_ERR(filter_str))
5423 		return PTR_ERR(filter_str);
5424 
5425 	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5426 
5427 	kfree(filter_str);
5428 	return ret;
5429 }
5430 
perf_event_free_filter(struct perf_event * event)5431 static void perf_event_free_filter(struct perf_event *event)
5432 {
5433 	ftrace_profile_free_filter(event);
5434 }
5435 
5436 #else
5437 
perf_tp_register(void)5438 static inline void perf_tp_register(void)
5439 {
5440 }
5441 
perf_event_set_filter(struct perf_event * event,void __user * arg)5442 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5443 {
5444 	return -ENOENT;
5445 }
5446 
perf_event_free_filter(struct perf_event * event)5447 static void perf_event_free_filter(struct perf_event *event)
5448 {
5449 }
5450 
5451 #endif /* CONFIG_EVENT_TRACING */
5452 
5453 #ifdef CONFIG_HAVE_HW_BREAKPOINT
5454 void perf_bp_event(struct perf_event *bp, void *data)
5455 {
5456 	struct perf_sample_data sample;
5457 	struct pt_regs *regs = data;
5458 
5459 	perf_sample_data_init(&sample, bp->attr.bp_addr);
5460 
5461 	if (!bp->hw.state && !perf_exclude_event(bp, regs))
5462 		perf_swevent_event(bp, 1, &sample, regs);
5463 }
5464 #endif
5465 
5466 /*
5467  * hrtimer based swevent callback
5468  */
5469 
5470 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5471 {
5472 	enum hrtimer_restart ret = HRTIMER_RESTART;
5473 	struct perf_sample_data data;
5474 	struct pt_regs *regs;
5475 	struct perf_event *event;
5476 	u64 period;
5477 
5478 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5479 
5480 	if (event->state != PERF_EVENT_STATE_ACTIVE)
5481 		return HRTIMER_NORESTART;
5482 
5483 	event->pmu->read(event);
5484 
5485 	perf_sample_data_init(&data, 0);
5486 	data.period = event->hw.last_period;
5487 	regs = get_irq_regs();
5488 
5489 	if (regs && !perf_exclude_event(event, regs)) {
5490 		if (!(event->attr.exclude_idle && is_idle_task(current)))
5491 			if (perf_event_overflow(event, &data, regs))
5492 				ret = HRTIMER_NORESTART;
5493 	}
5494 
5495 	period = max_t(u64, 10000, event->hw.sample_period);
5496 	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5497 
5498 	return ret;
5499 }
5500 
5501 static void perf_swevent_start_hrtimer(struct perf_event *event)
5502 {
5503 	struct hw_perf_event *hwc = &event->hw;
5504 	s64 period;
5505 
5506 	if (!is_sampling_event(event))
5507 		return;
5508 
5509 	period = local64_read(&hwc->period_left);
5510 	if (period) {
5511 		if (period < 0)
5512 			period = 10000;
5513 
5514 		local64_set(&hwc->period_left, 0);
5515 	} else {
5516 		period = max_t(u64, 10000, hwc->sample_period);
5517 	}
5518 	__hrtimer_start_range_ns(&hwc->hrtimer,
5519 				ns_to_ktime(period), 0,
5520 				HRTIMER_MODE_REL_PINNED, 0);
5521 }
5522 
5523 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5524 {
5525 	struct hw_perf_event *hwc = &event->hw;
5526 
5527 	if (is_sampling_event(event)) {
5528 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5529 		local64_set(&hwc->period_left, ktime_to_ns(remaining));
5530 
5531 		hrtimer_cancel(&hwc->hrtimer);
5532 	}
5533 }
5534 
5535 static void perf_swevent_init_hrtimer(struct perf_event *event)
5536 {
5537 	struct hw_perf_event *hwc = &event->hw;
5538 
5539 	if (!is_sampling_event(event))
5540 		return;
5541 
5542 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5543 	hwc->hrtimer.function = perf_swevent_hrtimer;
5544 
5545 	/*
5546 	 * Since hrtimers have a fixed rate, we can do a static freq->period
5547 	 * mapping and avoid the whole period adjust feedback stuff.
5548 	 */
5549 	if (event->attr.freq) {
5550 		long freq = event->attr.sample_freq;
5551 
5552 		event->attr.sample_period = NSEC_PER_SEC / freq;
5553 		hwc->sample_period = event->attr.sample_period;
5554 		local64_set(&hwc->period_left, hwc->sample_period);
5555 		event->attr.freq = 0;
5556 	}
5557 }
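/*
 * Worked example of the static freq->period mapping above (illustrative
 * numbers only): with attr.freq = 1 and attr.sample_freq = 4000, the
 * period becomes NSEC_PER_SEC / 4000 = 250000ns, i.e. one hrtimer expiry
 * every 250us; attr.freq is then cleared so the adaptive period-adjust
 * machinery is never consulted for this event.
 */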
5558 
5559 /*
5560  * Software event: cpu wall time clock
5561  */
5562 
5563 static void cpu_clock_event_update(struct perf_event *event)
5564 {
5565 	s64 prev;
5566 	u64 now;
5567 
5568 	now = local_clock();
5569 	prev = local64_xchg(&event->hw.prev_count, now);
5570 	local64_add(now - prev, &event->count);
5571 }
5572 
5573 static void cpu_clock_event_start(struct perf_event *event, int flags)
5574 {
5575 	local64_set(&event->hw.prev_count, local_clock());
5576 	perf_swevent_start_hrtimer(event);
5577 }
5578 
5579 static void cpu_clock_event_stop(struct perf_event *event, int flags)
5580 {
5581 	perf_swevent_cancel_hrtimer(event);
5582 	cpu_clock_event_update(event);
5583 }
5584 
5585 static int cpu_clock_event_add(struct perf_event *event, int flags)
5586 {
5587 	if (flags & PERF_EF_START)
5588 		cpu_clock_event_start(event, flags);
5589 
5590 	return 0;
5591 }
5592 
5593 static void cpu_clock_event_del(struct perf_event *event, int flags)
5594 {
5595 	cpu_clock_event_stop(event, flags);
5596 }
5597 
5598 static void cpu_clock_event_read(struct perf_event *event)
5599 {
5600 	cpu_clock_event_update(event);
5601 }
5602 
5603 static int cpu_clock_event_init(struct perf_event *event)
5604 {
5605 	if (event->attr.type != PERF_TYPE_SOFTWARE)
5606 		return -ENOENT;
5607 
5608 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5609 		return -ENOENT;
5610 
5611 	/*
5612 	 * no branch sampling for software events
5613 	 */
5614 	if (has_branch_stack(event))
5615 		return -EOPNOTSUPP;
5616 
5617 	perf_swevent_init_hrtimer(event);
5618 
5619 	return 0;
5620 }
5621 
5622 static struct pmu perf_cpu_clock = {
5623 	.task_ctx_nr	= perf_sw_context,
5624 
5625 	.event_init	= cpu_clock_event_init,
5626 	.add		= cpu_clock_event_add,
5627 	.del		= cpu_clock_event_del,
5628 	.start		= cpu_clock_event_start,
5629 	.stop		= cpu_clock_event_stop,
5630 	.read		= cpu_clock_event_read,
5631 
5632 	.event_idx	= perf_swevent_event_idx,
5633 };
5634 
5635 /*
5636  * Software event: task time clock
5637  */
5638 
5639 static void task_clock_event_update(struct perf_event *event, u64 now)
5640 {
5641 	u64 prev;
5642 	s64 delta;
5643 
5644 	prev = local64_xchg(&event->hw.prev_count, now);
5645 	delta = now - prev;
5646 	local64_add(delta, &event->count);
5647 }
5648 
5649 static void task_clock_event_start(struct perf_event *event, int flags)
5650 {
5651 	local64_set(&event->hw.prev_count, event->ctx->time);
5652 	perf_swevent_start_hrtimer(event);
5653 }
5654 
5655 static void task_clock_event_stop(struct perf_event *event, int flags)
5656 {
5657 	perf_swevent_cancel_hrtimer(event);
5658 	task_clock_event_update(event, event->ctx->time);
5659 }
5660 
5661 static int task_clock_event_add(struct perf_event *event, int flags)
5662 {
5663 	if (flags & PERF_EF_START)
5664 		task_clock_event_start(event, flags);
5665 
5666 	return 0;
5667 }
5668 
5669 static void task_clock_event_del(struct perf_event *event, int flags)
5670 {
5671 	task_clock_event_stop(event, PERF_EF_UPDATE);
5672 }
5673 
5674 static void task_clock_event_read(struct perf_event *event)
5675 {
5676 	u64 now = perf_clock();
5677 	u64 delta = now - event->ctx->timestamp;
5678 	u64 time = event->ctx->time + delta;
5679 
5680 	task_clock_event_update(event, time);
5681 }
5682 
5683 static int task_clock_event_init(struct perf_event *event)
5684 {
5685 	if (event->attr.type != PERF_TYPE_SOFTWARE)
5686 		return -ENOENT;
5687 
5688 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5689 		return -ENOENT;
5690 
5691 	/*
5692 	 * no branch sampling for software events
5693 	 */
5694 	if (has_branch_stack(event))
5695 		return -EOPNOTSUPP;
5696 
5697 	perf_swevent_init_hrtimer(event);
5698 
5699 	return 0;
5700 }
5701 
5702 static struct pmu perf_task_clock = {
5703 	.task_ctx_nr	= perf_sw_context,
5704 
5705 	.event_init	= task_clock_event_init,
5706 	.add		= task_clock_event_add,
5707 	.del		= task_clock_event_del,
5708 	.start		= task_clock_event_start,
5709 	.stop		= task_clock_event_stop,
5710 	.read		= task_clock_event_read,
5711 
5712 	.event_idx	= perf_swevent_event_idx,
5713 };
5714 
5715 static void perf_pmu_nop_void(struct pmu *pmu)
5716 {
5717 }
5718 
5719 static int perf_pmu_nop_int(struct pmu *pmu)
5720 {
5721 	return 0;
5722 }
5723 
5724 static void perf_pmu_start_txn(struct pmu *pmu)
5725 {
5726 	perf_pmu_disable(pmu);
5727 }
5728 
5729 static int perf_pmu_commit_txn(struct pmu *pmu)
5730 {
5731 	perf_pmu_enable(pmu);
5732 	return 0;
5733 }
5734 
5735 static void perf_pmu_cancel_txn(struct pmu *pmu)
5736 {
5737 	perf_pmu_enable(pmu);
5738 }
5739 
5740 static int perf_event_idx_default(struct perf_event *event)
5741 {
5742 	return event->hw.idx + 1;
5743 }
5744 
5745 /*
5746  * Ensures all contexts with the same task_ctx_nr have the same
5747  * pmu_cpu_context too.
5748  */
5749 static void *find_pmu_context(int ctxn)
5750 {
5751 	struct pmu *pmu;
5752 
5753 	if (ctxn < 0)
5754 		return NULL;
5755 
5756 	list_for_each_entry(pmu, &pmus, entry) {
5757 		if (pmu->task_ctx_nr == ctxn)
5758 			return pmu->pmu_cpu_context;
5759 	}
5760 
5761 	return NULL;
5762 }
5763 
5764 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5765 {
5766 	int cpu;
5767 
5768 	for_each_possible_cpu(cpu) {
5769 		struct perf_cpu_context *cpuctx;
5770 
5771 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5772 
5773 		if (cpuctx->unique_pmu == old_pmu)
5774 			cpuctx->unique_pmu = pmu;
5775 	}
5776 }
5777 
5778 static void free_pmu_context(struct pmu *pmu)
5779 {
5780 	struct pmu *i;
5781 
5782 	mutex_lock(&pmus_lock);
5783 	/*
5784 	 * Like a crude refcount: only free the cpu context once no other pmu shares it.
5785 	 */
5786 	list_for_each_entry(i, &pmus, entry) {
5787 		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5788 			update_pmu_context(i, pmu);
5789 			goto out;
5790 		}
5791 	}
5792 
5793 	free_percpu(pmu->pmu_cpu_context);
5794 out:
5795 	mutex_unlock(&pmus_lock);
5796 }
5797 static struct idr pmu_idr;
5798 
5799 static ssize_t
5800 type_show(struct device *dev, struct device_attribute *attr, char *page)
5801 {
5802 	struct pmu *pmu = dev_get_drvdata(dev);
5803 
5804 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5805 }
5806 
5807 static struct device_attribute pmu_dev_attrs[] = {
5808        __ATTR_RO(type),
5809        __ATTR_NULL,
5810 };
5811 
5812 static int pmu_bus_running;
5813 static struct bus_type pmu_bus = {
5814 	.name		= "event_source",
5815 	.dev_attrs	= pmu_dev_attrs,
5816 };
5817 
5818 static void pmu_dev_release(struct device *dev)
5819 {
5820 	kfree(dev);
5821 }
5822 
5823 static int pmu_dev_alloc(struct pmu *pmu)
5824 {
5825 	int ret = -ENOMEM;
5826 
5827 	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5828 	if (!pmu->dev)
5829 		goto out;
5830 
5831 	pmu->dev->groups = pmu->attr_groups;
5832 	device_initialize(pmu->dev);
5833 	ret = dev_set_name(pmu->dev, "%s", pmu->name);
5834 	if (ret)
5835 		goto free_dev;
5836 
5837 	dev_set_drvdata(pmu->dev, pmu);
5838 	pmu->dev->bus = &pmu_bus;
5839 	pmu->dev->release = pmu_dev_release;
5840 	ret = device_add(pmu->dev);
5841 	if (ret)
5842 		goto free_dev;
5843 
5844 out:
5845 	return ret;
5846 
5847 free_dev:
5848 	put_device(pmu->dev);
5849 	goto out;
5850 }
5851 
5852 static struct lock_class_key cpuctx_mutex;
5853 static struct lock_class_key cpuctx_lock;
5854 
5855 int perf_pmu_register(struct pmu *pmu, char *name, int type)
5856 {
5857 	int cpu, ret;
5858 
5859 	mutex_lock(&pmus_lock);
5860 	ret = -ENOMEM;
5861 	pmu->pmu_disable_count = alloc_percpu(int);
5862 	if (!pmu->pmu_disable_count)
5863 		goto unlock;
5864 
5865 	pmu->type = -1;
5866 	if (!name)
5867 		goto skip_type;
5868 	pmu->name = name;
5869 
5870 	if (type < 0) {
5871 		int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5872 		if (!err)
5873 			goto free_pdc;
5874 
5875 		err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5876 		if (err) {
5877 			ret = err;
5878 			goto free_pdc;
5879 		}
5880 	}
5881 	pmu->type = type;
5882 
5883 	if (pmu_bus_running) {
5884 		ret = pmu_dev_alloc(pmu);
5885 		if (ret)
5886 			goto free_idr;
5887 	}
5888 
5889 skip_type:
5890 	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5891 	if (pmu->pmu_cpu_context)
5892 		goto got_cpu_context;
5893 
5894 	ret = -ENOMEM;
5895 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5896 	if (!pmu->pmu_cpu_context)
5897 		goto free_dev;
5898 
5899 	for_each_possible_cpu(cpu) {
5900 		struct perf_cpu_context *cpuctx;
5901 
5902 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5903 		__perf_event_init_context(&cpuctx->ctx);
5904 		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5905 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
5906 		cpuctx->ctx.type = cpu_context;
5907 		cpuctx->ctx.pmu = pmu;
5908 		cpuctx->jiffies_interval = 1;
5909 		INIT_LIST_HEAD(&cpuctx->rotation_list);
5910 		cpuctx->unique_pmu = pmu;
5911 	}
5912 
5913 got_cpu_context:
5914 	if (!pmu->start_txn) {
5915 		if (pmu->pmu_enable) {
5916 			/*
5917 			 * If we have pmu_enable/pmu_disable calls, install
5918 			 * transaction stubs that use that to try and batch
5919 			 * hardware accesses.
5920 			 */
5921 			pmu->start_txn  = perf_pmu_start_txn;
5922 			pmu->commit_txn = perf_pmu_commit_txn;
5923 			pmu->cancel_txn = perf_pmu_cancel_txn;
5924 		} else {
5925 			pmu->start_txn  = perf_pmu_nop_void;
5926 			pmu->commit_txn = perf_pmu_nop_int;
5927 			pmu->cancel_txn = perf_pmu_nop_void;
5928 		}
5929 	}
5930 
5931 	if (!pmu->pmu_enable) {
5932 		pmu->pmu_enable  = perf_pmu_nop_void;
5933 		pmu->pmu_disable = perf_pmu_nop_void;
5934 	}
5935 
5936 	if (!pmu->event_idx)
5937 		pmu->event_idx = perf_event_idx_default;
5938 
5939 	list_add_rcu(&pmu->entry, &pmus);
5940 	ret = 0;
5941 unlock:
5942 	mutex_unlock(&pmus_lock);
5943 
5944 	return ret;
5945 
5946 free_dev:
5947 	device_del(pmu->dev);
5948 	put_device(pmu->dev);
5949 
5950 free_idr:
5951 	if (pmu->type >= PERF_TYPE_MAX)
5952 		idr_remove(&pmu_idr, pmu->type);
5953 
5954 free_pdc:
5955 	free_percpu(pmu->pmu_disable_count);
5956 	goto unlock;
5957 }
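/*
 * Minimal registration sketch (kept out of the build; the names and the
 * empty callbacks are assumptions): what an external PMU driver asking for
 * a dynamically allocated type id would look like.  Passing -1 as the type
 * makes perf_pmu_register() hand out an id above PERF_TYPE_MAX via the idr.
 */
#if 0
static int example_event_init(struct perf_event *event)
{
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static int  example_event_add(struct perf_event *event, int flags)	{ return 0; }
static void example_event_del(struct perf_event *event, int flags)	{ }
static void example_event_start(struct perf_event *event, int flags)	{ }
static void example_event_stop(struct perf_event *event, int flags)	{ }
static void example_event_read(struct perf_event *event)		{ }

static struct pmu example_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= example_event_init,
	.add		= example_event_add,
	.del		= example_event_del,
	.start		= example_event_start,
	.stop		= example_event_stop,
	.read		= example_event_read,
};

static int __init example_pmu_init(void)
{
	return perf_pmu_register(&example_pmu, "example", -1);
}
#endif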
5958 
5959 void perf_pmu_unregister(struct pmu *pmu)
5960 {
5961 	mutex_lock(&pmus_lock);
5962 	list_del_rcu(&pmu->entry);
5963 	mutex_unlock(&pmus_lock);
5964 
5965 	/*
5966 	 * We dereference the pmu list under both SRCU and regular RCU, so
5967 	 * synchronize against both of those.
5968 	 */
5969 	synchronize_srcu(&pmus_srcu);
5970 	synchronize_rcu();
5971 
5972 	free_percpu(pmu->pmu_disable_count);
5973 	if (pmu->type >= PERF_TYPE_MAX)
5974 		idr_remove(&pmu_idr, pmu->type);
5975 	device_del(pmu->dev);
5976 	put_device(pmu->dev);
5977 	free_pmu_context(pmu);
5978 }
5979 
5980 struct pmu *perf_init_event(struct perf_event *event)
5981 {
5982 	struct pmu *pmu = NULL;
5983 	int idx;
5984 	int ret;
5985 
5986 	idx = srcu_read_lock(&pmus_srcu);
5987 
5988 	rcu_read_lock();
5989 	pmu = idr_find(&pmu_idr, event->attr.type);
5990 	rcu_read_unlock();
5991 	if (pmu) {
5992 		event->pmu = pmu;
5993 		ret = pmu->event_init(event);
5994 		if (ret)
5995 			pmu = ERR_PTR(ret);
5996 		goto unlock;
5997 	}
5998 
5999 	list_for_each_entry_rcu(pmu, &pmus, entry) {
6000 		event->pmu = pmu;
6001 		ret = pmu->event_init(event);
6002 		if (!ret)
6003 			goto unlock;
6004 
6005 		if (ret != -ENOENT) {
6006 			pmu = ERR_PTR(ret);
6007 			goto unlock;
6008 		}
6009 	}
6010 	pmu = ERR_PTR(-ENOENT);
6011 unlock:
6012 	srcu_read_unlock(&pmus_srcu, idx);
6013 
6014 	return pmu;
6015 }
6016 
6017 /*
6018  * Allocate and initialize an event structure
6019  */
6020 static struct perf_event *
6021 perf_event_alloc(struct perf_event_attr *attr, int cpu,
6022 		 struct task_struct *task,
6023 		 struct perf_event *group_leader,
6024 		 struct perf_event *parent_event,
6025 		 perf_overflow_handler_t overflow_handler,
6026 		 void *context)
6027 {
6028 	struct pmu *pmu;
6029 	struct perf_event *event;
6030 	struct hw_perf_event *hwc;
6031 	long err;
6032 
6033 	if ((unsigned)cpu >= nr_cpu_ids) {
6034 		if (!task || cpu != -1)
6035 			return ERR_PTR(-EINVAL);
6036 	}
6037 
6038 	event = kzalloc(sizeof(*event), GFP_KERNEL);
6039 	if (!event)
6040 		return ERR_PTR(-ENOMEM);
6041 
6042 	/*
6043 	 * Single events are their own group leaders, with an
6044 	 * empty sibling list:
6045 	 */
6046 	if (!group_leader)
6047 		group_leader = event;
6048 
6049 	mutex_init(&event->child_mutex);
6050 	INIT_LIST_HEAD(&event->child_list);
6051 
6052 	INIT_LIST_HEAD(&event->group_entry);
6053 	INIT_LIST_HEAD(&event->event_entry);
6054 	INIT_LIST_HEAD(&event->sibling_list);
6055 	INIT_LIST_HEAD(&event->rb_entry);
6056 
6057 	init_waitqueue_head(&event->waitq);
6058 	init_irq_work(&event->pending, perf_pending_event);
6059 
6060 	mutex_init(&event->mmap_mutex);
6061 
6062 	atomic_long_set(&event->refcount, 1);
6063 	event->cpu		= cpu;
6064 	event->attr		= *attr;
6065 	event->group_leader	= group_leader;
6066 	event->pmu		= NULL;
6067 	event->oncpu		= -1;
6068 
6069 	event->parent		= parent_event;
6070 
6071 	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
6072 	event->id		= atomic64_inc_return(&perf_event_id);
6073 
6074 	event->state		= PERF_EVENT_STATE_INACTIVE;
6075 
6076 	if (task) {
6077 		event->attach_state = PERF_ATTACH_TASK;
6078 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6079 		/*
6080 		 * hw_breakpoint is a bit difficult here..
6081 		 */
6082 		if (attr->type == PERF_TYPE_BREAKPOINT)
6083 			event->hw.bp_target = task;
6084 #endif
6085 	}
6086 
6087 	if (!overflow_handler && parent_event) {
6088 		overflow_handler = parent_event->overflow_handler;
6089 		context = parent_event->overflow_handler_context;
6090 	}
6091 
6092 	event->overflow_handler	= overflow_handler;
6093 	event->overflow_handler_context = context;
6094 
6095 	perf_event__state_init(event);
6096 
6097 	pmu = NULL;
6098 
6099 	hwc = &event->hw;
6100 	hwc->sample_period = attr->sample_period;
6101 	if (attr->freq && attr->sample_freq)
6102 		hwc->sample_period = 1;
6103 	hwc->last_period = hwc->sample_period;
6104 
6105 	local64_set(&hwc->period_left, hwc->sample_period);
6106 
6107 	/*
6108 	 * we currently do not support PERF_FORMAT_GROUP on inherited events
6109 	 */
6110 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6111 		goto done;
6112 
6113 	pmu = perf_init_event(event);
6114 
6115 done:
6116 	err = 0;
6117 	if (!pmu)
6118 		err = -EINVAL;
6119 	else if (IS_ERR(pmu))
6120 		err = PTR_ERR(pmu);
6121 
6122 	if (err) {
6123 		if (event->ns)
6124 			put_pid_ns(event->ns);
6125 		kfree(event);
6126 		return ERR_PTR(err);
6127 	}
6128 
6129 	if (!event->parent) {
6130 		if (event->attach_state & PERF_ATTACH_TASK)
6131 			static_key_slow_inc(&perf_sched_events.key);
6132 		if (event->attr.mmap || event->attr.mmap_data)
6133 			atomic_inc(&nr_mmap_events);
6134 		if (event->attr.comm)
6135 			atomic_inc(&nr_comm_events);
6136 		if (event->attr.task)
6137 			atomic_inc(&nr_task_events);
6138 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6139 			err = get_callchain_buffers();
6140 			if (err) {
6141 				free_event(event);
6142 				return ERR_PTR(err);
6143 			}
6144 		}
6145 		if (has_branch_stack(event)) {
6146 			static_key_slow_inc(&perf_sched_events.key);
6147 			if (!(event->attach_state & PERF_ATTACH_TASK))
6148 				atomic_inc(&per_cpu(perf_branch_stack_events,
6149 						    event->cpu));
6150 		}
6151 	}
6152 
6153 	return event;
6154 }
6155 
6156 static int perf_copy_attr(struct perf_event_attr __user *uattr,
6157 			  struct perf_event_attr *attr)
6158 {
6159 	u32 size;
6160 	int ret;
6161 
6162 	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6163 		return -EFAULT;
6164 
6165 	/*
6166 	 * Zero the full structure, so that a short copy leaves the remaining fields zeroed.
6167 	 */
6168 	memset(attr, 0, sizeof(*attr));
6169 
6170 	ret = get_user(size, &uattr->size);
6171 	if (ret)
6172 		return ret;
6173 
6174 	if (size > PAGE_SIZE)	/* silly large */
6175 		goto err_size;
6176 
6177 	if (!size)		/* abi compat */
6178 		size = PERF_ATTR_SIZE_VER0;
6179 
6180 	if (size < PERF_ATTR_SIZE_VER0)
6181 		goto err_size;
6182 
6183 	/*
6184 	 * If we're handed a bigger struct than we know of,
6185 	 * ensure all the unknown bits are 0 - i.e. new
6186 	 * user-space does not rely on any kernel feature
6187 	 * extensions we don't know about yet.
6188 	 */
6189 	if (size > sizeof(*attr)) {
6190 		unsigned char __user *addr;
6191 		unsigned char __user *end;
6192 		unsigned char val;
6193 
6194 		addr = (void __user *)uattr + sizeof(*attr);
6195 		end  = (void __user *)uattr + size;
6196 
6197 		for (; addr < end; addr++) {
6198 			ret = get_user(val, addr);
6199 			if (ret)
6200 				return ret;
6201 			if (val)
6202 				goto err_size;
6203 		}
6204 		size = sizeof(*attr);
6205 	}
6206 
6207 	ret = copy_from_user(attr, uattr, size);
6208 	if (ret)
6209 		return -EFAULT;
6210 
6211 	if (attr->__reserved_1)
6212 		return -EINVAL;
6213 
6214 	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6215 		return -EINVAL;
6216 
6217 	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6218 		return -EINVAL;
6219 
6220 	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6221 		u64 mask = attr->branch_sample_type;
6222 
6223 		/* only using defined bits */
6224 		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6225 			return -EINVAL;
6226 
6227 		/* at least one branch bit must be set */
6228 		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6229 			return -EINVAL;
6230 
6231 		/* kernel level capture: check permissions */
6232 		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6233 		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6234 			return -EACCES;
6235 
6236 		/* propagate priv level, when not set for branch */
6237 		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6238 
6239 			/* exclude_kernel checked on syscall entry */
6240 			if (!attr->exclude_kernel)
6241 				mask |= PERF_SAMPLE_BRANCH_KERNEL;
6242 
6243 			if (!attr->exclude_user)
6244 				mask |= PERF_SAMPLE_BRANCH_USER;
6245 
6246 			if (!attr->exclude_hv)
6247 				mask |= PERF_SAMPLE_BRANCH_HV;
6248 			/*
6249 			 * adjust user setting (for HW filter setup)
6250 			 */
6251 			attr->branch_sample_type = mask;
6252 		}
6253 	}
6254 out:
6255 	return ret;
6256 
6257 err_size:
6258 	put_user(sizeof(*attr), &uattr->size);
6259 	ret = -E2BIG;
6260 	goto out;
6261 }
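/*
 * Userspace view of the size handshake above (sketch only; a real program
 * needs <string.h>, <unistd.h>, <errno.h>, <sys/syscall.h> and
 * <linux/perf_event.h>).  Older binaries pass a smaller attr.size and the
 * kernel zero-extends; a binary whose attr is larger than the kernel's and
 * carries non-zero bytes the kernel does not understand gets -E2BIG, with
 * uattr->size rewritten to the kernel's sizeof(attr) so it can shrink and
 * retry.
 */
#if 0
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);	/* whatever this build was compiled against */
	attr.type   = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0 && errno == E2BIG) {
		/* attr.size now holds the struct size this kernel accepts. */
	}
#endif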
6262 
6263 static int
6264 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6265 {
6266 	struct ring_buffer *rb = NULL, *old_rb = NULL;
6267 	int ret = -EINVAL;
6268 
6269 	if (!output_event)
6270 		goto set;
6271 
6272 	/* don't allow circular references */
6273 	if (event == output_event)
6274 		goto out;
6275 
6276 	/*
6277 	 * Don't allow cross-cpu buffers
6278 	 */
6279 	if (output_event->cpu != event->cpu)
6280 		goto out;
6281 
6282 	/*
6283 	 * If it's not a per-cpu rb, it must be the same task.
6284 	 */
6285 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6286 		goto out;
6287 
6288 set:
6289 	mutex_lock(&event->mmap_mutex);
6290 	/* Can't redirect output if we've got an active mmap() */
6291 	if (atomic_read(&event->mmap_count))
6292 		goto unlock;
6293 
6294 	old_rb = event->rb;
6295 
6296 	if (output_event) {
6297 		/* get the rb we want to redirect to */
6298 		rb = ring_buffer_get(output_event);
6299 		if (!rb)
6300 			goto unlock;
6301 	}
6302 
6303 	if (old_rb)
6304 		ring_buffer_detach(event, old_rb);
6305 
6306 	if (rb)
6307 		ring_buffer_attach(event, rb);
6308 
6309 	rcu_assign_pointer(event->rb, rb);
6310 
6311 	if (old_rb) {
6312 		ring_buffer_put(old_rb);
6313 		/*
6314 		 * Since we detached the old rb before attaching the new one,
6315 		 * we could have missed a wakeup in between.
6316 		 * Provide it now.
6317 		 */
6318 		wake_up_all(&event->waitq);
6319 	}
6320 
6321 	ret = 0;
6322 unlock:
6323 	mutex_unlock(&event->mmap_mutex);
6324 
6325 out:
6326 	return ret;
6327 }
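/*
 * Usage sketch (userspace, kept out of the build): both the
 * PERF_FLAG_FD_OUTPUT open flag and the PERF_EVENT_IOC_SET_OUTPUT ioctl
 * funnel into perf_event_set_output().  Per the checks above, the event
 * being redirected must not be mmap()ed itself and must share the cpu (or,
 * for cpu == -1 events, the task context) of the target.  "event_fd" and
 * "leader_fd" are assumed perf fds owned by the caller; leader_fd is the
 * one whose ring buffer has been mmap()ed.
 */
#if 0
	if (ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, leader_fd) < 0)
		perror("PERF_EVENT_IOC_SET_OUTPUT");
#endif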
6328 
6329 /**
6330  * sys_perf_event_open - open a performance event, associate it to a task/cpu
6331  *
6332  * @attr_uptr:	event_id type attributes for monitoring/sampling
6333  * @pid:		target pid
6334  * @cpu:		target cpu
6335  * @group_fd:		group leader event fd
6336  */
6337 SYSCALL_DEFINE5(perf_event_open,
6338 		struct perf_event_attr __user *, attr_uptr,
6339 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6340 {
6341 	struct perf_event *group_leader = NULL, *output_event = NULL;
6342 	struct perf_event *event, *sibling;
6343 	struct perf_event_attr attr;
6344 	struct perf_event_context *ctx;
6345 	struct file *event_file = NULL;
6346 	struct file *group_file = NULL;
6347 	struct task_struct *task = NULL;
6348 	struct pmu *pmu;
6349 	int event_fd;
6350 	int move_group = 0;
6351 	int fput_needed = 0;
6352 	int err;
6353 
6354 	/* for future expandability... */
6355 	if (flags & ~PERF_FLAG_ALL)
6356 		return -EINVAL;
6357 
6358 	err = perf_copy_attr(attr_uptr, &attr);
6359 	if (err)
6360 		return err;
6361 
6362 	if (!attr.exclude_kernel) {
6363 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6364 			return -EACCES;
6365 	}
6366 
6367 	if (attr.freq) {
6368 		if (attr.sample_freq > sysctl_perf_event_sample_rate)
6369 			return -EINVAL;
6370 	} else {
6371 		if (attr.sample_period & (1ULL << 63))
6372 			return -EINVAL;
6373 	}
6374 
6375 	/*
6376 	 * In cgroup mode, the pid argument is used to pass the fd
6377 	 * opened to the cgroup directory in cgroupfs. The cpu argument
6378 	 * designates the cpu on which to monitor threads from that
6379 	 * cgroup.
6380 	 */
6381 	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6382 		return -EINVAL;
6383 
6384 	event_fd = get_unused_fd_flags(O_RDWR);
6385 	if (event_fd < 0)
6386 		return event_fd;
6387 
6388 	if (group_fd != -1) {
6389 		group_file = perf_fget_light(group_fd, &fput_needed);
6390 		if (IS_ERR(group_file)) {
6391 			err = PTR_ERR(group_file);
6392 			goto err_fd;
6393 		}
6394 		group_leader = group_file->private_data;
6395 		if (flags & PERF_FLAG_FD_OUTPUT)
6396 			output_event = group_leader;
6397 		if (flags & PERF_FLAG_FD_NO_GROUP)
6398 			group_leader = NULL;
6399 	}
6400 
6401 	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
6402 		task = find_lively_task_by_vpid(pid);
6403 		if (IS_ERR(task)) {
6404 			err = PTR_ERR(task);
6405 			goto err_group_fd;
6406 		}
6407 	}
6408 
6409 	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6410 				 NULL, NULL);
6411 	if (IS_ERR(event)) {
6412 		err = PTR_ERR(event);
6413 		goto err_task;
6414 	}
6415 
6416 	if (flags & PERF_FLAG_PID_CGROUP) {
6417 		err = perf_cgroup_connect(pid, event, &attr, group_leader);
6418 		if (err)
6419 			goto err_alloc;
6420 		/*
6421 		 * one more event:
6422 		 * - that has cgroup constraint on event->cpu
6423 		 * - that may need work on context switch
6424 		 */
6425 		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6426 		static_key_slow_inc(&perf_sched_events.key);
6427 	}
6428 
6429 	/*
6430 	 * Special case software events and allow them to be part of
6431 	 * any hardware group.
6432 	 */
6433 	pmu = event->pmu;
6434 
6435 	if (group_leader &&
6436 	    (is_software_event(event) != is_software_event(group_leader))) {
6437 		if (is_software_event(event)) {
6438 			/*
6439 			 * If event and group_leader are not both a software
6440 			 * event, and event is, then group leader is not.
6441 			 *
6442 			 * Allow the addition of software events to !software
6443 			 * groups; this is safe because software events never
6444 			 * fail to schedule.
6445 			 */
6446 			pmu = group_leader->pmu;
6447 		} else if (is_software_event(group_leader) &&
6448 			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6449 			/*
6450 			 * In case the group is a pure software group, and we
6451 			 * try to add a hardware event, move the whole group to
6452 			 * the hardware context.
6453 			 */
6454 			move_group = 1;
6455 		}
6456 	}
6457 
6458 	/*
6459 	 * Get the target context (task or percpu):
6460 	 */
6461 	ctx = find_get_context(pmu, task, cpu);
6462 	if (IS_ERR(ctx)) {
6463 		err = PTR_ERR(ctx);
6464 		goto err_alloc;
6465 	}
6466 
6467 	if (task) {
6468 		put_task_struct(task);
6469 		task = NULL;
6470 	}
6471 
6472 	/*
6473 	 * Look up the group leader (we will attach this event to it):
6474 	 */
6475 	if (group_leader) {
6476 		err = -EINVAL;
6477 
6478 		/*
6479 		 * Do not allow a recursive hierarchy (this new sibling
6480 		 * becoming part of another group-sibling):
6481 		 */
6482 		if (group_leader->group_leader != group_leader)
6483 			goto err_context;
6484 		/*
6485 		 * Do not allow to attach to a group in a different
6486 		 * task or CPU context:
6487 		 */
6488 		if (move_group) {
6489 			if (group_leader->ctx->type != ctx->type)
6490 				goto err_context;
6491 		} else {
6492 			if (group_leader->ctx != ctx)
6493 				goto err_context;
6494 		}
6495 
6496 		/*
6497 		 * Only a group leader can be exclusive or pinned
6498 		 */
6499 		if (attr.exclusive || attr.pinned)
6500 			goto err_context;
6501 	}
6502 
6503 	if (output_event) {
6504 		err = perf_event_set_output(event, output_event);
6505 		if (err)
6506 			goto err_context;
6507 	}
6508 
6509 	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6510 	if (IS_ERR(event_file)) {
6511 		err = PTR_ERR(event_file);
6512 		goto err_context;
6513 	}
6514 
6515 	if (move_group) {
6516 		struct perf_event_context *gctx = group_leader->ctx;
6517 
6518 		mutex_lock(&gctx->mutex);
6519 		perf_remove_from_context(group_leader, false);
6520 
6521 		/*
6522 		 * Removing it from the context leaves the event disabled.
6523 		 * What we want here is an event in its initial startup
6524 		 * state, ready to be added into the new context.
6525 		 */
6526 		perf_event__state_init(group_leader);
6527 		list_for_each_entry(sibling, &group_leader->sibling_list,
6528 				    group_entry) {
6529 			perf_remove_from_context(sibling, false);
6530 			perf_event__state_init(sibling);
6531 			put_ctx(gctx);
6532 		}
6533 		mutex_unlock(&gctx->mutex);
6534 		put_ctx(gctx);
6535 	}
6536 
6537 	WARN_ON_ONCE(ctx->parent_ctx);
6538 	mutex_lock(&ctx->mutex);
6539 
6540 	if (move_group) {
6541 		perf_install_in_context(ctx, group_leader, cpu);
6542 		get_ctx(ctx);
6543 		list_for_each_entry(sibling, &group_leader->sibling_list,
6544 				    group_entry) {
6545 			perf_install_in_context(ctx, sibling, cpu);
6546 			get_ctx(ctx);
6547 		}
6548 	}
6549 
6550 	perf_install_in_context(ctx, event, cpu);
6551 	++ctx->generation;
6552 	perf_unpin_context(ctx);
6553 	mutex_unlock(&ctx->mutex);
6554 
6555 	event->owner = current;
6556 
6557 	mutex_lock(&current->perf_event_mutex);
6558 	list_add_tail(&event->owner_entry, &current->perf_event_list);
6559 	mutex_unlock(&current->perf_event_mutex);
6560 
6561 	/*
6562 	 * Precalculate sample_data sizes
6563 	 */
6564 	perf_event__header_size(event);
6565 	perf_event__id_header_size(event);
6566 
6567 	/*
6568 	 * Drop the reference on the group_event after placing the
6569 	 * new event on the sibling_list. This ensures destruction
6570 	 * of the group leader will find the pointer to itself in
6571 	 * perf_group_detach().
6572 	 */
6573 	fput_light(group_file, fput_needed);
6574 	fd_install(event_fd, event_file);
6575 	return event_fd;
6576 
6577 err_context:
6578 	perf_unpin_context(ctx);
6579 	put_ctx(ctx);
6580 err_alloc:
6581 	free_event(event);
6582 err_task:
6583 	if (task)
6584 		put_task_struct(task);
6585 err_group_fd:
6586 	fput_light(group_file, fput_needed);
6587 err_fd:
6588 	put_unused_fd(event_fd);
6589 	return err;
6590 }
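/*
 * Userspace usage sketch for the syscall above (kept out of the build; a
 * real program needs <unistd.h>, <string.h>, <sys/ioctl.h>,
 * <sys/syscall.h> and <linux/perf_event.h>).  Counts task-clock for the
 * calling thread on any cpu; error handling trimmed for brevity.
 */
#if 0
static long long example_count_task_clock(void)
{
	struct perf_event_attr attr;
	long long count = -1;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size	= sizeof(attr);
	attr.type	= PERF_TYPE_SOFTWARE;
	attr.config	= PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled	= 1;		/* start stopped, enable explicitly below */
	attr.exclude_kernel = 1;	/* sidesteps the paranoid-kernel check */

	/* pid = 0: this task, cpu = -1: any cpu, no group leader, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	read(fd, &count, sizeof(count));	/* default read_format: one u64 */
	close(fd);
	return count;
}
#endif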
6591 
6592 /**
6593  * perf_event_create_kernel_counter
6594  *
6595  * @attr: attributes of the counter to create
6596  * @cpu: cpu to which the counter is bound
6597  * @task: task to profile (NULL for percpu)
6598  */
6599 struct perf_event *
6600 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6601 				 struct task_struct *task,
6602 				 perf_overflow_handler_t overflow_handler,
6603 				 void *context)
6604 {
6605 	struct perf_event_context *ctx;
6606 	struct perf_event *event;
6607 	int err;
6608 
6609 	/*
6610 	 * Get the target context (task or percpu):
6611 	 */
6612 
6613 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6614 				 overflow_handler, context);
6615 	if (IS_ERR(event)) {
6616 		err = PTR_ERR(event);
6617 		goto err;
6618 	}
6619 
6620 	ctx = find_get_context(event->pmu, task, cpu);
6621 	if (IS_ERR(ctx)) {
6622 		err = PTR_ERR(ctx);
6623 		goto err_free;
6624 	}
6625 
6626 	WARN_ON_ONCE(ctx->parent_ctx);
6627 	mutex_lock(&ctx->mutex);
6628 	perf_install_in_context(ctx, event, cpu);
6629 	++ctx->generation;
6630 	perf_unpin_context(ctx);
6631 	mutex_unlock(&ctx->mutex);
6632 
6633 	return event;
6634 
6635 err_free:
6636 	free_event(event);
6637 err:
6638 	return ERR_PTR(err);
6639 }
6640 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
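/*
 * In-kernel usage sketch (kept out of the build; the names and numbers are
 * assumptions): roughly how callers such as the hardlockup watchdog use
 * this helper - build an attr, create a per-cpu counter with an overflow
 * callback matching perf_overflow_handler_t, and release it with
 * perf_event_release_kernel() when done.
 */
#if 0
static void example_overflow(struct perf_event *event,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	/* Runs from IRQ/NMI context each time the counter overflows. */
}

static struct perf_event *example_counter_on(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.size		= sizeof(attr),
		.sample_period	= NSEC_PER_SEC / 10,	/* ~10 overflows/sec */
	};

	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						example_overflow, NULL);
}
#endif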
6641 
6642 static void sync_child_event(struct perf_event *child_event,
6643 			       struct task_struct *child)
6644 {
6645 	struct perf_event *parent_event = child_event->parent;
6646 	u64 child_val;
6647 
6648 	if (child_event->attr.inherit_stat)
6649 		perf_event_read_event(child_event, child);
6650 
6651 	child_val = perf_event_count(child_event);
6652 
6653 	/*
6654 	 * Add back the child's count to the parent's count:
6655 	 */
6656 	atomic64_add(child_val, &parent_event->child_count);
6657 	atomic64_add(child_event->total_time_enabled,
6658 		     &parent_event->child_total_time_enabled);
6659 	atomic64_add(child_event->total_time_running,
6660 		     &parent_event->child_total_time_running);
6661 
6662 	/*
6663 	 * Remove this event from the parent's list
6664 	 */
6665 	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6666 	mutex_lock(&parent_event->child_mutex);
6667 	list_del_init(&child_event->child_list);
6668 	mutex_unlock(&parent_event->child_mutex);
6669 
6670 	/*
6671 	 * Release the parent event, if this was the last
6672 	 * reference to it.
6673 	 */
6674 	put_event(parent_event);
6675 }
6676 
6677 static void
6678 __perf_event_exit_task(struct perf_event *child_event,
6679 			 struct perf_event_context *child_ctx,
6680 			 struct task_struct *child)
6681 {
6682 	perf_remove_from_context(child_event, !!child_event->parent);
6683 
6684 	/*
6685 	 * It can happen that the parent exits first, and has events
6686 	 * that are still around due to the child reference. These
6687 	 * events need to be zapped.
6688 	 */
6689 	if (child_event->parent) {
6690 		sync_child_event(child_event, child);
6691 		free_event(child_event);
6692 	}
6693 }
6694 
6695 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6696 {
6697 	struct perf_event *child_event, *tmp;
6698 	struct perf_event_context *child_ctx;
6699 	unsigned long flags;
6700 
6701 	if (likely(!child->perf_event_ctxp[ctxn])) {
6702 		perf_event_task(child, NULL, 0);
6703 		return;
6704 	}
6705 
6706 	local_irq_save(flags);
6707 	/*
6708 	 * We can't reschedule here because interrupts are disabled,
6709 	 * and either child is current or it is a task that can't be
6710 	 * scheduled, so we are now safe from rescheduling changing
6711 	 * our context.
6712 	 */
6713 	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6714 
6715 	/*
6716 	 * Take the context lock here so that if find_get_context is
6717 	 * reading child->perf_event_ctxp, we wait until it has
6718 	 * incremented the context's refcount before we do put_ctx below.
6719 	 */
6720 	raw_spin_lock(&child_ctx->lock);
6721 	task_ctx_sched_out(child_ctx);
6722 	child->perf_event_ctxp[ctxn] = NULL;
6723 	/*
6724 	 * If this context is a clone; unclone it so it can't get
6725 	 * swapped to another process while we're removing all
6726 	 * the events from it.
6727 	 */
6728 	unclone_ctx(child_ctx);
6729 	update_context_time(child_ctx);
6730 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6731 
6732 	/*
6733 	 * Report the task dead after unscheduling the events so that we
6734 	 * won't get any samples after PERF_RECORD_EXIT. We can however still
6735 	 * get a few PERF_RECORD_READ events.
6736 	 */
6737 	perf_event_task(child, child_ctx, 0);
6738 
6739 	/*
6740 	 * We can recurse on the same lock type through:
6741 	 *
6742 	 *   __perf_event_exit_task()
6743 	 *     sync_child_event()
6744 	 *       put_event()
6745 	 *         mutex_lock(&ctx->mutex)
6746 	 *
6747 	 * But since it's the parent context it won't be the same instance.
6748 	 */
6749 	mutex_lock(&child_ctx->mutex);
6750 
6751 again:
6752 	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6753 				 group_entry)
6754 		__perf_event_exit_task(child_event, child_ctx, child);
6755 
6756 	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
6757 				 group_entry)
6758 		__perf_event_exit_task(child_event, child_ctx, child);
6759 
6760 	/*
6761 	 * If the last event was a group event, it will have appended all
6762 	 * its siblings to the list, but we obtained 'tmp' before that which
6763 	 * will still point to the list head terminating the iteration.
6764 	 */
6765 	if (!list_empty(&child_ctx->pinned_groups) ||
6766 	    !list_empty(&child_ctx->flexible_groups))
6767 		goto again;
6768 
6769 	mutex_unlock(&child_ctx->mutex);
6770 
6771 	put_ctx(child_ctx);
6772 }
6773 
6774 /*
6775  * When a child task exits, feed back event values to parent events.
6776  */
6777 void perf_event_exit_task(struct task_struct *child)
6778 {
6779 	struct perf_event *event, *tmp;
6780 	int ctxn;
6781 
6782 	mutex_lock(&child->perf_event_mutex);
6783 	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6784 				 owner_entry) {
6785 		list_del_init(&event->owner_entry);
6786 
6787 		/*
6788 		 * Ensure the list deletion is visible before we clear
6789 		 * the owner, closes a race against perf_release() where
6790 		 * we need to serialize on the owner->perf_event_mutex.
6791 		 */
6792 		smp_wmb();
6793 		event->owner = NULL;
6794 	}
6795 	mutex_unlock(&child->perf_event_mutex);
6796 
6797 	for_each_task_context_nr(ctxn)
6798 		perf_event_exit_task_context(child, ctxn);
6799 }
6800 
6801 static void perf_free_event(struct perf_event *event,
6802 			    struct perf_event_context *ctx)
6803 {
6804 	struct perf_event *parent = event->parent;
6805 
6806 	if (WARN_ON_ONCE(!parent))
6807 		return;
6808 
6809 	mutex_lock(&parent->child_mutex);
6810 	list_del_init(&event->child_list);
6811 	mutex_unlock(&parent->child_mutex);
6812 
6813 	put_event(parent);
6814 
6815 	perf_group_detach(event);
6816 	list_del_event(event, ctx);
6817 	free_event(event);
6818 }
6819 
6820 /*
6821  * free an unexposed, unused context as created by inheritance by
6822  * perf_event_init_task below, used by fork() in case of fail.
6823  * perf_event_init_task below, used by fork() in case of failure.
6824 void perf_event_free_task(struct task_struct *task)
6825 {
6826 	struct perf_event_context *ctx;
6827 	struct perf_event *event, *tmp;
6828 	int ctxn;
6829 
6830 	for_each_task_context_nr(ctxn) {
6831 		ctx = task->perf_event_ctxp[ctxn];
6832 		if (!ctx)
6833 			continue;
6834 
6835 		mutex_lock(&ctx->mutex);
6836 again:
6837 		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6838 				group_entry)
6839 			perf_free_event(event, ctx);
6840 
6841 		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6842 				group_entry)
6843 			perf_free_event(event, ctx);
6844 
6845 		if (!list_empty(&ctx->pinned_groups) ||
6846 				!list_empty(&ctx->flexible_groups))
6847 			goto again;
6848 
6849 		mutex_unlock(&ctx->mutex);
6850 
6851 		put_ctx(ctx);
6852 	}
6853 }
6854 
6855 void perf_event_delayed_put(struct task_struct *task)
6856 {
6857 	int ctxn;
6858 
6859 	for_each_task_context_nr(ctxn)
6860 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6861 }
6862 
6863 /*
6864  * inherit an event from parent task to child task:
6865  */
6866 static struct perf_event *
6867 inherit_event(struct perf_event *parent_event,
6868 	      struct task_struct *parent,
6869 	      struct perf_event_context *parent_ctx,
6870 	      struct task_struct *child,
6871 	      struct perf_event *group_leader,
6872 	      struct perf_event_context *child_ctx)
6873 {
6874 	struct perf_event *child_event;
6875 	unsigned long flags;
6876 
6877 	/*
6878 	 * Instead of creating recursive hierarchies of events,
6879 	 * we link inherited events back to the original parent,
6880 	 * which is guaranteed to have a filp that we use as the
6881 	 * reference count:
6882 	 */
6883 	if (parent_event->parent)
6884 		parent_event = parent_event->parent;
6885 
6886 	child_event = perf_event_alloc(&parent_event->attr,
6887 					   parent_event->cpu,
6888 					   child,
6889 					   group_leader, parent_event,
6890 				           NULL, NULL);
6891 	if (IS_ERR(child_event))
6892 		return child_event;
6893 
6894 	if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
6895 		free_event(child_event);
6896 		return NULL;
6897 	}
6898 
6899 	get_ctx(child_ctx);
6900 
6901 	/*
6902 	 * Make the child state follow the state of the parent event,
6903 	 * not its attr.disabled bit.  We hold the parent's mutex,
6904 	 * so we won't race with perf_event_{en, dis}able_family.
6905 	 */
6906 	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6907 		child_event->state = PERF_EVENT_STATE_INACTIVE;
6908 	else
6909 		child_event->state = PERF_EVENT_STATE_OFF;
6910 
6911 	if (parent_event->attr.freq) {
6912 		u64 sample_period = parent_event->hw.sample_period;
6913 		struct hw_perf_event *hwc = &child_event->hw;
6914 
6915 		hwc->sample_period = sample_period;
6916 		hwc->last_period   = sample_period;
6917 
6918 		local64_set(&hwc->period_left, sample_period);
6919 	}
6920 
6921 	child_event->ctx = child_ctx;
6922 	child_event->overflow_handler = parent_event->overflow_handler;
6923 	child_event->overflow_handler_context
6924 		= parent_event->overflow_handler_context;
6925 
6926 	/*
6927 	 * Precalculate sample_data sizes
6928 	 */
6929 	perf_event__header_size(child_event);
6930 	perf_event__id_header_size(child_event);
6931 
6932 	/*
6933 	 * Link it up in the child's context:
6934 	 */
6935 	raw_spin_lock_irqsave(&child_ctx->lock, flags);
6936 	add_event_to_ctx(child_event, child_ctx);
6937 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6938 
6939 	/*
6940 	 * Link this into the parent event's child list
6941 	 */
6942 	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6943 	mutex_lock(&parent_event->child_mutex);
6944 	list_add_tail(&child_event->child_list, &parent_event->child_list);
6945 	mutex_unlock(&parent_event->child_mutex);
6946 
6947 	return child_event;
6948 }
6949 
6950 static int inherit_group(struct perf_event *parent_event,
6951 	      struct task_struct *parent,
6952 	      struct perf_event_context *parent_ctx,
6953 	      struct task_struct *child,
6954 	      struct perf_event_context *child_ctx)
6955 {
6956 	struct perf_event *leader;
6957 	struct perf_event *sub;
6958 	struct perf_event *child_ctr;
6959 
6960 	leader = inherit_event(parent_event, parent, parent_ctx,
6961 				 child, NULL, child_ctx);
6962 	if (IS_ERR(leader))
6963 		return PTR_ERR(leader);
6964 	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6965 		child_ctr = inherit_event(sub, parent, parent_ctx,
6966 					    child, leader, child_ctx);
6967 		if (IS_ERR(child_ctr))
6968 			return PTR_ERR(child_ctr);
6969 	}
6970 	return 0;
6971 }
6972 
6973 static int
6974 inherit_task_group(struct perf_event *event, struct task_struct *parent,
6975 		   struct perf_event_context *parent_ctx,
6976 		   struct task_struct *child, int ctxn,
6977 		   int *inherited_all)
6978 {
6979 	int ret;
6980 	struct perf_event_context *child_ctx;
6981 
6982 	if (!event->attr.inherit) {
6983 		*inherited_all = 0;
6984 		return 0;
6985 	}
6986 
6987 	child_ctx = child->perf_event_ctxp[ctxn];
6988 	if (!child_ctx) {
6989 		/*
6990 		 * This is executed from the parent task context, so
6991 		 * inherit events that have been marked for cloning.
6992 		 * First allocate and initialize a context for the
6993 		 * child.
6994 		 */
6995 
6996 		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
6997 		if (!child_ctx)
6998 			return -ENOMEM;
6999 
7000 		child->perf_event_ctxp[ctxn] = child_ctx;
7001 	}
7002 
7003 	ret = inherit_group(event, parent, parent_ctx,
7004 			    child, child_ctx);
7005 
7006 	if (ret)
7007 		*inherited_all = 0;
7008 
7009 	return ret;
7010 }
7011 
7012 /*
7013  * Initialize the perf_event context in task_struct
7014  */
7015 int perf_event_init_context(struct task_struct *child, int ctxn)
7016 {
7017 	struct perf_event_context *child_ctx, *parent_ctx;
7018 	struct perf_event_context *cloned_ctx;
7019 	struct perf_event *event;
7020 	struct task_struct *parent = current;
7021 	int inherited_all = 1;
7022 	unsigned long flags;
7023 	int ret = 0;
7024 
7025 	if (likely(!parent->perf_event_ctxp[ctxn]))
7026 		return 0;
7027 
7028 	/*
7029 	 * If the parent's context is a clone, pin it so it won't get
7030 	 * swapped under us.
7031 	 */
7032 	parent_ctx = perf_pin_task_context(parent, ctxn);
7033 
7034 	/*
7035 	 * No need to check if parent_ctx != NULL here; since we saw
7036 	 * it non-NULL earlier, the only reason for it to become NULL
7037 	 * is if we exit, and since we're currently in the middle of
7038 	 * a fork we can't be exiting at the same time.
7039 	 */
7040 
7041 	/*
7042 	 * Lock the parent list. No need to lock the child - not PID
7043 	 * hashed yet and not running, so nobody can access it.
7044 	 */
7045 	mutex_lock(&parent_ctx->mutex);
7046 
7047 	/*
7048 	 * We don't have to disable NMIs - we are only looking at
7049 	 * the list, not manipulating it:
7050 	 */
7051 	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
7052 		ret = inherit_task_group(event, parent, parent_ctx,
7053 					 child, ctxn, &inherited_all);
7054 		if (ret)
7055 			break;
7056 	}
7057 
7058 	/*
7059 	 * We can't hold ctx->lock when iterating the ->flexible_group list due
7060 	 * to allocations, but we need to prevent rotation because
7061 	 * rotate_ctx() will change the list from interrupt context.
7062 	 */
7063 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7064 	parent_ctx->rotate_disable = 1;
7065 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7066 
7067 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
7068 		ret = inherit_task_group(event, parent, parent_ctx,
7069 					 child, ctxn, &inherited_all);
7070 		if (ret)
7071 			break;
7072 	}
7073 
7074 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7075 	parent_ctx->rotate_disable = 0;
7076 
7077 	child_ctx = child->perf_event_ctxp[ctxn];
7078 
7079 	if (child_ctx && inherited_all) {
7080 		/*
7081 		 * Mark the child context as a clone of the parent
7082 		 * context, or of whatever the parent is a clone of.
7083 		 *
7084 		 * Note that if the parent is a clone, the holding of
7085 		 * parent_ctx->lock avoids it from being uncloned.
7086 		 */
7087 		cloned_ctx = parent_ctx->parent_ctx;
7088 		if (cloned_ctx) {
7089 			child_ctx->parent_ctx = cloned_ctx;
7090 			child_ctx->parent_gen = parent_ctx->parent_gen;
7091 		} else {
7092 			child_ctx->parent_ctx = parent_ctx;
7093 			child_ctx->parent_gen = parent_ctx->generation;
7094 		}
7095 		get_ctx(child_ctx->parent_ctx);
7096 	}
7097 
7098 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7099 	mutex_unlock(&parent_ctx->mutex);
7100 
7101 	perf_unpin_context(parent_ctx);
7102 	put_ctx(parent_ctx);
7103 
7104 	return ret;
7105 }
7106 
7107 /*
7108  * Initialize the perf_event context in task_struct
7109  */
7110 int perf_event_init_task(struct task_struct *child)
7111 {
7112 	int ctxn, ret;
7113 
7114 	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7115 	mutex_init(&child->perf_event_mutex);
7116 	INIT_LIST_HEAD(&child->perf_event_list);
7117 
7118 	for_each_task_context_nr(ctxn) {
7119 		ret = perf_event_init_context(child, ctxn);
7120 		if (ret)
7121 			return ret;
7122 	}
7123 
7124 	return 0;
7125 }
7126 
7127 static void __init perf_event_init_all_cpus(void)
7128 {
7129 	struct swevent_htable *swhash;
7130 	int cpu;
7131 
7132 	for_each_possible_cpu(cpu) {
7133 		swhash = &per_cpu(swevent_htable, cpu);
7134 		mutex_init(&swhash->hlist_mutex);
7135 		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
7136 	}
7137 }
7138 
7139 static void __cpuinit perf_event_init_cpu(int cpu)
7140 {
7141 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7142 
7143 	mutex_lock(&swhash->hlist_mutex);
7144 	swhash->online = true;
7145 	if (swhash->hlist_refcount > 0) {
7146 		struct swevent_hlist *hlist;
7147 
7148 		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7149 		WARN_ON(!hlist);
7150 		rcu_assign_pointer(swhash->swevent_hlist, hlist);
7151 	}
7152 	mutex_unlock(&swhash->hlist_mutex);
7153 }
7154 
7155 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
7156 static void perf_pmu_rotate_stop(struct pmu *pmu)
7157 {
7158 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7159 
7160 	WARN_ON(!irqs_disabled());
7161 
7162 	list_del_init(&cpuctx->rotation_list);
7163 }
7164 
7165 static void __perf_event_exit_context(void *__info)
7166 {
7167 	struct remove_event re = { .detach_group = false };
7168 	struct perf_event_context *ctx = __info;
7169 
7170 	perf_pmu_rotate_stop(ctx->pmu);
7171 
7172 	rcu_read_lock();
7173 	list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7174 		__perf_remove_from_context(&re);
7175 	rcu_read_unlock();
7176 }
7177 
7178 static void perf_event_exit_cpu_context(int cpu)
7179 {
7180 	struct perf_event_context *ctx;
7181 	struct pmu *pmu;
7182 	int idx;
7183 
7184 	idx = srcu_read_lock(&pmus_srcu);
7185 	list_for_each_entry_rcu(pmu, &pmus, entry) {
7186 		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7187 
7188 		mutex_lock(&ctx->mutex);
7189 		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7190 		mutex_unlock(&ctx->mutex);
7191 	}
7192 	srcu_read_unlock(&pmus_srcu, idx);
7193 }
7194 
7195 static void perf_event_exit_cpu(int cpu)
7196 {
7197 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7198 
7199 	perf_event_exit_cpu_context(cpu);
7200 
7201 	mutex_lock(&swhash->hlist_mutex);
7202 	swhash->online = false;
7203 	swevent_hlist_release(swhash);
7204 	mutex_unlock(&swhash->hlist_mutex);
7205 }
7206 #else
7207 static inline void perf_event_exit_cpu(int cpu) { }
7208 #endif
7209 
7210 static int
7211 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7212 {
7213 	int cpu;
7214 
7215 	for_each_online_cpu(cpu)
7216 		perf_event_exit_cpu(cpu);
7217 
7218 	return NOTIFY_OK;
7219 }
7220 
7221 /*
7222  * Run the perf reboot notifier at the very last possible moment so that
7223  * the generic watchdog code runs as long as possible.
7224  */
7225 static struct notifier_block perf_reboot_notifier = {
7226 	.notifier_call = perf_reboot,
7227 	.priority = INT_MIN,
7228 };
7229 
7230 static int __cpuinit
7231 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7232 {
7233 	unsigned int cpu = (long)hcpu;
7234 
7235 	switch (action & ~CPU_TASKS_FROZEN) {
7236 
7237 	case CPU_UP_PREPARE:
7238 	case CPU_DOWN_FAILED:
7239 		perf_event_init_cpu(cpu);
7240 		break;
7241 
7242 	case CPU_UP_CANCELED:
7243 	case CPU_DOWN_PREPARE:
7244 		perf_event_exit_cpu(cpu);
7245 		break;
7246 
7247 	default:
7248 		break;
7249 	}
7250 
7251 	return NOTIFY_OK;
7252 }
7253 
7254 void __init perf_event_init(void)
7255 {
7256 	int ret;
7257 
7258 	idr_init(&pmu_idr);
7259 
7260 	perf_event_init_all_cpus();
7261 	init_srcu_struct(&pmus_srcu);
7262 	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7263 	perf_pmu_register(&perf_cpu_clock, NULL, -1);
7264 	perf_pmu_register(&perf_task_clock, NULL, -1);
7265 	perf_tp_register();
7266 	perf_cpu_notifier(perf_cpu_notify);
7267 	register_reboot_notifier(&perf_reboot_notifier);
7268 
7269 	ret = init_hw_breakpoint();
7270 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7271 
7272 	/* do not patch jump label more than once per second */
7273 	jump_label_rate_limit(&perf_sched_events, HZ);
7274 
7275 	/*
7276 	 * Build time assertion that we keep the data_head at the intended
7277 	 * location.  IOW, validation we got the __reserved[] size right.
7278 	 */
7279 	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7280 		     != 1024);
7281 }
7282 
7283 static int __init perf_event_sysfs_init(void)
7284 {
7285 	struct pmu *pmu;
7286 	int ret;
7287 
7288 	mutex_lock(&pmus_lock);
7289 
7290 	ret = bus_register(&pmu_bus);
7291 	if (ret)
7292 		goto unlock;
7293 
7294 	list_for_each_entry(pmu, &pmus, entry) {
7295 		if (!pmu->name || pmu->type < 0)
7296 			continue;
7297 
7298 		ret = pmu_dev_alloc(pmu);
7299 		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7300 	}
7301 	pmu_bus_running = 1;
7302 	ret = 0;
7303 
7304 unlock:
7305 	mutex_unlock(&pmus_lock);
7306 
7307 	return ret;
7308 }
7309 device_initcall(perf_event_sysfs_init);
7310 
7311 #ifdef CONFIG_CGROUP_PERF
7312 static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7313 {
7314 	struct perf_cgroup *jc;
7315 
7316 	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7317 	if (!jc)
7318 		return ERR_PTR(-ENOMEM);
7319 
7320 	jc->info = alloc_percpu(struct perf_cgroup_info);
7321 	if (!jc->info) {
7322 		kfree(jc);
7323 		return ERR_PTR(-ENOMEM);
7324 	}
7325 
7326 	return &jc->css;
7327 }
7328 
7329 static void perf_cgroup_destroy(struct cgroup *cont)
7330 {
7331 	struct perf_cgroup *jc;
7332 	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7333 			  struct perf_cgroup, css);
7334 	free_percpu(jc->info);
7335 	kfree(jc);
7336 }
7337 
7338 static int __perf_cgroup_move(void *info)
7339 {
7340 	struct task_struct *task = info;
7341 	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7342 	return 0;
7343 }
7344 
7345 static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
7346 {
7347 	struct task_struct *task;
7348 
7349 	cgroup_taskset_for_each(task, cgrp, tset)
7350 		task_function_call(task, __perf_cgroup_move, task);
7351 }
7352 
7353 static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7354 			     struct task_struct *task)
7355 {
7356 	/*
7357 	 * cgroup_exit() is called in the copy_process() failure path.
7358 	 * Ignore this case since the task hasn't run yet; this avoids
7359 	 * trying to poke a half freed task state from generic code.
7360 	 */
7361 	if (!(task->flags & PF_EXITING))
7362 		return;
7363 
7364 	task_function_call(task, __perf_cgroup_move, task);
7365 }
7366 
7367 struct cgroup_subsys perf_subsys = {
7368 	.name		= "perf_event",
7369 	.subsys_id	= perf_subsys_id,
7370 	.create		= perf_cgroup_create,
7371 	.destroy	= perf_cgroup_destroy,
7372 	.exit		= perf_cgroup_exit,
7373 	.attach		= perf_cgroup_attach,
7374 };
7375 #endif /* CONFIG_CGROUP_PERF */
7376