/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

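/*
 * With .mult = 1 << XEN_SHIFT and .shift = XEN_SHIFT below, the
 * clocksource's cycle-to-ns conversion is an identity, so pvclock
 * readings (already in nanoseconds) pass through unchanged.
 */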
#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
static DEFINE_PER_CPU(u64, xen_residual_blocked);

/* return a consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(xen_runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}

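/*
 * Register this cpu's runstate area with the hypervisor so it keeps
 * xen_runstate up to date whenever the vcpu changes state.
 */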
void xen_setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(xen_runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

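/*
 * Account stolen (runnable + offline) and blocked time accumulated since
 * the previous snapshot, carrying any sub-tick remainder forward in the
 * per-cpu residual counters.
 */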
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(xen_runstate_snapshot);

	/* work out how much time the VCPU has not been running */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time. */
	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__this_cpu_write(xen_residual_stolen, stolen);
	account_steal_ticks(ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time. */
	blocked += __this_cpu_read(xen_residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__this_cpu_write(xen_residual_blocked, blocked);
	account_idle_ticks(ticks);
}

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	return pvclock_tsc_khz(info);
}

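/* Read the current time, in nanoseconds, from this cpu's pvclock time info. */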
cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}

static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

static int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, sharing the
   same event channel is a 100Hz tick which is delivered while the
   vcpu is running.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (ie, at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};


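/*
 * vcpu_op-based clockevent ops: unlike the timer_op interface, these can
 * explicitly stop the hypervisor's periodic tick and single-shot timer.
 */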
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

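/*
 * Default to the timer_op interface; xen_time_init() switches this to the
 * vcpu_op interface if the hypervisor accepts VCPUOP_stop_periodic_timer.
 */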
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

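/*
 * Bind the given cpu's VIRQ_TIMER to xen_timer_interrupt and initialise
 * its clock_event_device from the chosen clockevent template.
 */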
void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|
				      IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}

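/*
 * Undo xen_setup_timer() for a secondary cpu; the boot cpu's timer is
 * never torn down.
 */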
void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	BUG_ON(cpu == 0);
	evt = &per_cpu(xen_clock_events, cpu);
	unbind_from_irqhandler(evt->irq, NULL);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

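/*
 * Resume handling: notify pvclock of the resume and, when the vcpu_op
 * interface is in use, make sure the hypervisor's periodic tick stays
 * disabled on every online cpu.
 */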
void xen_timer_resume(void)
{
	int cpu;

	pvclock_resume();

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

static const struct pv_time_ops xen_time_ops __initdata = {
	.sched_clock = xen_clocksource_read,
};

static __init void xen_time_init(void)
{
	int cpu = smp_processor_id();
	struct timespec tp;

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday(&tp);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

__init void xen_init_time_ops(void)
{
	pv_time_ops = xen_time_ops;

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();
	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

__init void xen_hvm_init_time_ops(void)
{
	/* vector callback is needed otherwise we cannot receive interrupts
	 * on cpu > 0 and at this point we don't know how many cpus are
	 * available */
	if (!xen_have_vector_callback)
		return;
	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		printk(KERN_INFO "Xen doesn't support pvclock on HVM, "
				 "disable pv timer\n");
		return;
	}

	pv_time_ops = xen_time_ops;
	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;
}
#endif