// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. better handle wakeups from external interrupts. currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that, for external interrupts that need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases, clamping down the cpu
 *              does help reduce irqs as well; we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control, perhaps based on
 *              get_cpu_iowait_time_us().
 *
 *	     2. synchronization with other hw blocks
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;

static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

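/*
 * Setter for the "duration" module parameter: parse the requested forced idle
 * duration in milliseconds, warn and return -EINVAL when it falls outside the
 * recommended 6-25 ms range, and store the clamped value.
 */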
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that gets
				    * incremented each time a clamping period
				    * completes without extra wakeups. once the
				    * counter reaches a given level, the
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

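/*
 * Setter for the "window_size" module parameter: parse the requested sliding
 * window length in clamping cycles, warn and return -EINVAL when it is
 * outside the recommended 2-10 range, and store the clamped value.
 */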
static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

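/*
 * Query the CPUID MWAIT leaf and record the deepest supported C-state as an
 * MWAIT hint in target_mwait. Bails out early if the MWAIT extensions or the
 * interrupt-break capability are not available.
 */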
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

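/*
 * Return true if at least one package C-state residency MSR in pkg_cstates
 * can be read on this CPU, i.e. package C-state accounting is available.
 */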
static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

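/*
 * Sum the residency counters of all readable package C-state MSRs. MSRs that
 * fail to read are marked to be skipped on subsequent calls.
 */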
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

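/*
 * Look up the calibrated steady-state compensation for a target ratio. The
 * value is the average over the ratio and its neighbors, and is only used
 * when all of those entries have reached CONFIDENCE_OK. The result is capped
 * so that ratio + compensation stays below MAX_TARGET_RATIO.
 */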
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

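/*
 * Update the calibration entry for a target ratio based on the deviation
 * observed in the last window. Only small, non-negative deviations are taken
 * into account; each accepted sample raises the confidence counter.
 */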
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

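/*
 * Evaluate the last control window: compute the achieved package C-state
 * residency as a percentage of elapsed TSC cycles, feed it into the
 * calibration data, and return true when the achieved ratio already exceeds
 * target + guard so the next idle injection should be skipped.
 */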
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

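/*
 * Balancing work, run on each clamped CPU: refresh the per-CPU control
 * parameters from the user-visible settings, derive the injection interval
 * for the compensated target ratio, and schedule the idle injection work at
 * the next interval-aligned jiffy.
 */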
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

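/*
 * Idle injection work, run on each clamped CPU: the controlling CPU updates
 * the global statistics once per window, then idle is injected for the
 * configured duration via play_idle() unless the last window already
 * overshot the target. Finally the balancing work is requeued.
 */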
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

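/*
 * Create and start the per-CPU idle injection kthread worker: pin it to the
 * CPU, give it FIFO scheduling priority, mark the CPU in cpu_clamping_mask
 * and kick off the first balancing work.
 */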
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

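/*
 * Stop the idle injection worker on a CPU: clear the per-CPU clamping flag,
 * cancel any pending work and destroy the kthread worker.
 */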
static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all work queued after this point sees clamping
	 * disabled. The counterpart barrier is not needed because there
	 * is an implicit memory barrier when the queued work is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread worker will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

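/*
 * Start clamping: elect the first online CPU as the controlling CPU, enable
 * the 1 Hz package C-state poll and spawn one idle injection worker per
 * online CPU, with CPU hotplug blocked while the workers are created.
 */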
static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = cpumask_first(cpu_online_mask);

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

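/*
 * Stop clamping: clear the global clamping flag so no new work is requeued,
 * then tear down the worker on every CPU that still has one.
 */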
static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

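/* CPU hotplug online callback: start a worker on the new CPU if clamping. */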
static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

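/*
 * CPU hotplug pre-down callback: stop the worker on the departing CPU and,
 * if it was the controlling CPU, hand control to another online CPU.
 */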
static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

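/*
 * Probe: verify that the CPU advertises MWAIT and exposes at least one
 * package C-state residency counter, then look up the deepest MWAIT hint.
 */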
static int __init powerclamp_probe(void)
{

	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");