1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Avi Kivity   <avi@qumranet.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20 
21 #include "irq.h"
22 #include "mmu.h"
23 #include "x86.h"
24 #include "kvm_cache_regs.h"
26 
27 #include <linux/kvm_host.h>
28 #include <linux/types.h>
29 #include <linux/string.h>
30 #include <linux/mm.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/swap.h>
34 #include <linux/hugetlb.h>
35 #include <linux/compiler.h>
36 #include <linux/srcu.h>
37 #include <linux/slab.h>
38 #include <linux/uaccess.h>
39 
40 #include <asm/page.h>
41 #include <asm/cmpxchg.h>
42 #include <asm/io.h>
43 #include <asm/vmx.h>
44 
/*
 * When set to true, this variable enables Two-Dimensional Paging, where
 * the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1, it also walks guest-physical to host-physical
 * If the hardware supports that, we don't need to do shadow paging.
 */
52 bool tdp_enabled = false;
53 
54 enum {
55 	AUDIT_PRE_PAGE_FAULT,
56 	AUDIT_POST_PAGE_FAULT,
57 	AUDIT_PRE_PTE_WRITE,
58 	AUDIT_POST_PTE_WRITE,
59 	AUDIT_PRE_SYNC,
60 	AUDIT_POST_SYNC
61 };
62 
63 char *audit_point_name[] = {
64 	"pre page fault",
65 	"post page fault",
66 	"pre pte write",
67 	"post pte write",
68 	"pre sync",
69 	"post sync"
70 };
71 
72 #undef MMU_DEBUG
73 
74 #ifdef MMU_DEBUG
75 
76 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
77 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
78 
79 #else
80 
81 #define pgprintk(x...) do { } while (0)
82 #define rmap_printk(x...) do { } while (0)
83 
84 #endif
85 
86 #ifdef MMU_DEBUG
87 static int dbg = 0;
88 module_param(dbg, bool, 0644);
89 #endif
90 
91 static int oos_shadow = 1;
92 module_param(oos_shadow, bool, 0644);
93 
94 #ifndef MMU_DEBUG
95 #define ASSERT(x) do { } while (0)
96 #else
97 #define ASSERT(x)							\
98 	if (!(x)) {							\
99 		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
100 		       __FILE__, __LINE__, #x);				\
101 	}
102 #endif
103 
104 #define PTE_PREFETCH_NUM		8
105 
106 #define PT_FIRST_AVAIL_BITS_SHIFT 9
107 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
108 
109 #define PT64_LEVEL_BITS 9
110 
111 #define PT64_LEVEL_SHIFT(level) \
112 		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113 
114 #define PT64_INDEX(address, level)\
115 	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
116 
117 
118 #define PT32_LEVEL_BITS 10
119 
120 #define PT32_LEVEL_SHIFT(level) \
121 		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
122 
123 #define PT32_LVL_OFFSET_MASK(level) \
124 	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
125 						* PT32_LEVEL_BITS))) - 1))
126 
127 #define PT32_INDEX(address, level)\
128 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
129 
130 
131 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
132 #define PT64_DIR_BASE_ADDR_MASK \
133 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
134 #define PT64_LVL_ADDR_MASK(level) \
135 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
136 						* PT64_LEVEL_BITS))) - 1))
137 #define PT64_LVL_OFFSET_MASK(level) \
138 	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
139 						* PT64_LEVEL_BITS))) - 1))
140 
141 #define PT32_BASE_ADDR_MASK PAGE_MASK
142 #define PT32_DIR_BASE_ADDR_MASK \
143 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
144 #define PT32_LVL_ADDR_MASK(level) \
145 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
146 					    * PT32_LEVEL_BITS))) - 1))
147 
148 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149 			| PT64_NX_MASK)
150 
151 #define RMAP_EXT 4
152 
153 #define ACC_EXEC_MASK    1
154 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
155 #define ACC_USER_MASK    PT_USER_MASK
156 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
157 
158 #include <trace/events/kvm.h>
159 
160 #define CREATE_TRACE_POINTS
161 #include "mmutrace.h"
162 
163 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
164 
165 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166 
167 struct kvm_rmap_desc {
168 	u64 *sptes[RMAP_EXT];
169 	struct kvm_rmap_desc *more;
170 };
171 
172 struct kvm_shadow_walk_iterator {
173 	u64 addr;
174 	hpa_t shadow_addr;
175 	int level;
176 	u64 *sptep;
177 	unsigned index;
178 };
179 
180 #define for_each_shadow_entry(_vcpu, _addr, _walker)    \
181 	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
182 	     shadow_walk_okay(&(_walker));			\
183 	     shadow_walk_next(&(_walker)))
184 
185 typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
186 
187 static struct kmem_cache *pte_chain_cache;
188 static struct kmem_cache *rmap_desc_cache;
189 static struct kmem_cache *mmu_page_header_cache;
190 static struct percpu_counter kvm_total_used_mmu_pages;
191 
192 static u64 __read_mostly shadow_trap_nonpresent_pte;
193 static u64 __read_mostly shadow_notrap_nonpresent_pte;
194 static u64 __read_mostly shadow_nx_mask;
195 static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
196 static u64 __read_mostly shadow_user_mask;
197 static u64 __read_mostly shadow_accessed_mask;
198 static u64 __read_mostly shadow_dirty_mask;
199 
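/*
 * Build a mask with bits s..e (inclusive) set, e.g. rsvd_bits(52, 62)
 * yields a mask covering bits 52 through 62.  Used when constructing
 * reserved-bit masks for guest page table entries.
 */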
static inline u64 rsvd_bits(int s, int e)
201 {
202 	return ((1ULL << (e - s + 1)) - 1) << s;
203 }
204 
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
206 {
207 	shadow_trap_nonpresent_pte = trap_pte;
208 	shadow_notrap_nonpresent_pte = notrap_pte;
209 }
210 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211 
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 		u64 dirty_mask, u64 nx_mask, u64 x_mask)
214 {
215 	shadow_user_mask = user_mask;
216 	shadow_accessed_mask = accessed_mask;
217 	shadow_dirty_mask = dirty_mask;
218 	shadow_nx_mask = nx_mask;
219 	shadow_x_mask = x_mask;
220 }
221 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222 
static bool is_write_protection(struct kvm_vcpu *vcpu)
224 {
225 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226 }
227 
static int is_cpuid_PSE36(void)
229 {
230 	return 1;
231 }
232 
static int is_nx(struct kvm_vcpu *vcpu)
234 {
235 	return vcpu->arch.efer & EFER_NX;
236 }
237 
static int is_shadow_present_pte(u64 pte)
239 {
240 	return pte != shadow_trap_nonpresent_pte
241 		&& pte != shadow_notrap_nonpresent_pte;
242 }
243 
static int is_large_pte(u64 pte)
245 {
246 	return pte & PT_PAGE_SIZE_MASK;
247 }
248 
static int is_writable_pte(unsigned long pte)
250 {
251 	return pte & PT_WRITABLE_MASK;
252 }
253 
static int is_dirty_gpte(unsigned long pte)
255 {
256 	return pte & PT_DIRTY_MASK;
257 }
258 
static int is_rmap_spte(u64 pte)
260 {
261 	return is_shadow_present_pte(pte);
262 }
263 
static int is_last_spte(u64 pte, int level)
265 {
266 	if (level == PT_PAGE_TABLE_LEVEL)
267 		return 1;
268 	if (is_large_pte(pte))
269 		return 1;
270 	return 0;
271 }
272 
static pfn_t spte_to_pfn(u64 pte)
274 {
275 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
276 }
277 
static gfn_t pse36_gfn_delta(u32 gpte)
279 {
280 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
281 
282 	return (gpte & PT32_DIR_PSE36_MASK) << shift;
283 }
284 
static void __set_spte(u64 *sptep, u64 spte)
286 {
287 	set_64bit(sptep, spte);
288 }
289 
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
291 {
292 #ifdef CONFIG_X86_64
293 	return xchg(sptep, new_spte);
294 #else
295 	u64 old_spte;
296 
297 	do {
298 		old_spte = *sptep;
299 	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
300 
301 	return old_spte;
302 #endif
303 }
304 
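/*
 * A spte has "volatile" bits if hardware may still set its accessed or
 * dirty bit behind our back; such sptes must be updated with an atomic
 * exchange (see update_spte() and set_spte_track_bits()) so the A/D
 * information is not lost.
 */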
static bool spte_has_volatile_bits(u64 spte)
306 {
307 	if (!shadow_accessed_mask)
308 		return false;
309 
310 	if (!is_shadow_present_pte(spte))
311 		return false;
312 
313 	if ((spte & shadow_accessed_mask) &&
314 	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
315 		return false;
316 
317 	return true;
318 }
319 
static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
321 {
322 	return (old_spte & bit_mask) && !(new_spte & bit_mask);
323 }
324 
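/*
 * Write a new value into an existing spte, using an atomic xchg when the
 * old spte has volatile bits, and propagate any accessed/dirty bits that
 * get cleared in the process to the backing pfn's page.
 */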
static void update_spte(u64 *sptep, u64 new_spte)
326 {
327 	u64 mask, old_spte = *sptep;
328 
329 	WARN_ON(!is_rmap_spte(new_spte));
330 
331 	new_spte |= old_spte & shadow_dirty_mask;
332 
333 	mask = shadow_accessed_mask;
334 	if (is_writable_pte(old_spte))
335 		mask |= shadow_dirty_mask;
336 
337 	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 		__set_spte(sptep, new_spte);
339 	else
340 		old_spte = __xchg_spte(sptep, new_spte);
341 
342 	if (!shadow_accessed_mask)
343 		return;
344 
345 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
346 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
347 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
348 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
349 }
350 
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 				  struct kmem_cache *base_cache, int min)
353 {
354 	void *obj;
355 
356 	if (cache->nobjs >= min)
357 		return 0;
358 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
359 		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
360 		if (!obj)
361 			return -ENOMEM;
362 		cache->objects[cache->nobjs++] = obj;
363 	}
364 	return 0;
365 }
366 
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
368 				  struct kmem_cache *cache)
369 {
370 	while (mc->nobjs)
371 		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
372 }
373 
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
375 				       int min)
376 {
377 	void *page;
378 
379 	if (cache->nobjs >= min)
380 		return 0;
381 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
382 		page = (void *)__get_free_page(GFP_KERNEL);
383 		if (!page)
384 			return -ENOMEM;
385 		cache->objects[cache->nobjs++] = page;
386 	}
387 	return 0;
388 }
389 
static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
391 {
392 	while (mc->nobjs)
393 		free_page((unsigned long)mc->objects[--mc->nobjs]);
394 }
395 
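/*
 * Pre-fill the per-vcpu object caches with GFP_KERNEL allocations before
 * mmu_lock is taken, so the page fault path can later grab pte chains,
 * rmap descriptors and shadow pages without sleeping.
 */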
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397 {
398 	int r;
399 
400 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
401 				   pte_chain_cache, 4);
402 	if (r)
403 		goto out;
404 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406 	if (r)
407 		goto out;
408 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
409 	if (r)
410 		goto out;
411 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
412 				   mmu_page_header_cache, 4);
413 out:
414 	return r;
415 }
416 
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418 {
419 	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
420 	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
421 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 				mmu_page_header_cache);
424 }
425 
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
427 				    size_t size)
428 {
429 	void *p;
430 
431 	BUG_ON(!mc->nobjs);
432 	p = mc->objects[--mc->nobjs];
433 	return p;
434 }
435 
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
437 {
438 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
439 				      sizeof(struct kvm_pte_chain));
440 }
441 
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
443 {
444 	kmem_cache_free(pte_chain_cache, pc);
445 }
446 
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
448 {
449 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
450 				      sizeof(struct kvm_rmap_desc));
451 }
452 
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454 {
455 	kmem_cache_free(rmap_desc_cache, rd);
456 }
457 
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
459 {
460 	if (!sp->role.direct)
461 		return sp->gfns[index];
462 
463 	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
464 }
465 
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
467 {
468 	if (sp->role.direct)
469 		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
470 	else
471 		sp->gfns[index] = gfn;
472 }
473 
474 /*
475  * Return the pointer to the large page information for a given gfn,
476  * handling slots that are not large page aligned.
477  */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
479 					      struct kvm_memory_slot *slot,
480 					      int level)
481 {
482 	unsigned long idx;
483 
484 	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
485 	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
486 	return &slot->lpage_info[level - 2][idx];
487 }
488 
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
490 {
491 	struct kvm_memory_slot *slot;
492 	struct kvm_lpage_info *linfo;
493 	int i;
494 
495 	slot = gfn_to_memslot(kvm, gfn);
496 	for (i = PT_DIRECTORY_LEVEL;
497 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
498 		linfo = lpage_info_slot(gfn, slot, i);
499 		linfo->write_count += 1;
500 	}
501 }
502 
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
504 {
505 	struct kvm_memory_slot *slot;
506 	struct kvm_lpage_info *linfo;
507 	int i;
508 
509 	slot = gfn_to_memslot(kvm, gfn);
510 	for (i = PT_DIRECTORY_LEVEL;
511 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
512 		linfo = lpage_info_slot(gfn, slot, i);
513 		linfo->write_count -= 1;
514 		WARN_ON(linfo->write_count < 0);
515 	}
516 }
517 
static int has_wrprotected_page(struct kvm *kvm,
519 				gfn_t gfn,
520 				int level)
521 {
522 	struct kvm_memory_slot *slot;
523 	struct kvm_lpage_info *linfo;
524 
525 	slot = gfn_to_memslot(kvm, gfn);
526 	if (slot) {
527 		linfo = lpage_info_slot(gfn, slot, level);
528 		return linfo->write_count;
529 	}
530 
531 	return 1;
532 }
533 
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
535 {
536 	unsigned long page_size;
537 	int i, ret = 0;
538 
539 	page_size = kvm_host_page_size(kvm, gfn);
540 
541 	for (i = PT_PAGE_TABLE_LEVEL;
542 	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
543 		if (page_size >= KVM_HPAGE_SIZE(i))
544 			ret = i;
545 		else
546 			break;
547 	}
548 
549 	return ret;
550 }
551 
552 static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 			    bool no_dirty_log)
555 {
556 	struct kvm_memory_slot *slot;
557 
558 	slot = gfn_to_memslot(vcpu->kvm, gfn);
559 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
560 	      (no_dirty_log && slot->dirty_bitmap))
561 		slot = NULL;
562 
563 	return slot;
564 }
565 
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567 {
568 	return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
569 }
570 
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
572 {
573 	int host_level, level, max_level;
574 
575 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
576 
577 	if (host_level == PT_PAGE_TABLE_LEVEL)
578 		return host_level;
579 
580 	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
581 		kvm_x86_ops->get_lpage_level() : host_level;
582 
583 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
584 		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
585 			break;
586 
587 	return level - 1;
588 }
589 
590 /*
591  * Take gfn and return the reverse mapping to it.
592  */
593 
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
595 {
596 	struct kvm_memory_slot *slot;
597 	struct kvm_lpage_info *linfo;
598 
599 	slot = gfn_to_memslot(kvm, gfn);
600 	if (likely(level == PT_PAGE_TABLE_LEVEL))
601 		return &slot->rmap[gfn - slot->base_gfn];
602 
603 	linfo = lpage_info_slot(gfn, slot, level);
604 
605 	return &linfo->rmap_pde;
606 }
607 
/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
 */
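/*
 * Illustration: the first spte for a gfn is stored directly in *rmapp
 * with bit zero clear.  Adding a second spte allocates a kvm_rmap_desc,
 * moves both pointers into desc->sptes[] and stores (desc | 1) in
 * *rmapp, so bit zero distinguishes the single-entry and list cases.
 */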
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
622 {
623 	struct kvm_mmu_page *sp;
624 	struct kvm_rmap_desc *desc;
625 	unsigned long *rmapp;
626 	int i, count = 0;
627 
628 	if (!is_rmap_spte(*spte))
629 		return count;
630 	sp = page_header(__pa(spte));
631 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
632 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
633 	if (!*rmapp) {
634 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
635 		*rmapp = (unsigned long)spte;
636 	} else if (!(*rmapp & 1)) {
637 		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
638 		desc = mmu_alloc_rmap_desc(vcpu);
639 		desc->sptes[0] = (u64 *)*rmapp;
640 		desc->sptes[1] = spte;
641 		*rmapp = (unsigned long)desc | 1;
642 		++count;
643 	} else {
644 		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
645 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
646 		while (desc->sptes[RMAP_EXT-1] && desc->more) {
647 			desc = desc->more;
648 			count += RMAP_EXT;
649 		}
650 		if (desc->sptes[RMAP_EXT-1]) {
651 			desc->more = mmu_alloc_rmap_desc(vcpu);
652 			desc = desc->more;
653 		}
654 		for (i = 0; desc->sptes[i]; ++i)
655 			++count;
656 		desc->sptes[i] = spte;
657 	}
658 	return count;
659 }
660 
static void rmap_desc_remove_entry(unsigned long *rmapp,
662 				   struct kvm_rmap_desc *desc,
663 				   int i,
664 				   struct kvm_rmap_desc *prev_desc)
665 {
666 	int j;
667 
668 	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
669 		;
670 	desc->sptes[i] = desc->sptes[j];
671 	desc->sptes[j] = NULL;
672 	if (j != 0)
673 		return;
674 	if (!prev_desc && !desc->more)
675 		*rmapp = (unsigned long)desc->sptes[0];
676 	else
677 		if (prev_desc)
678 			prev_desc->more = desc->more;
679 		else
680 			*rmapp = (unsigned long)desc->more | 1;
681 	mmu_free_rmap_desc(desc);
682 }
683 
static void rmap_remove(struct kvm *kvm, u64 *spte)
685 {
686 	struct kvm_rmap_desc *desc;
687 	struct kvm_rmap_desc *prev_desc;
688 	struct kvm_mmu_page *sp;
689 	gfn_t gfn;
690 	unsigned long *rmapp;
691 	int i;
692 
693 	sp = page_header(__pa(spte));
694 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
695 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
696 	if (!*rmapp) {
697 		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
698 		BUG();
699 	} else if (!(*rmapp & 1)) {
700 		rmap_printk("rmap_remove:  %p 1->0\n", spte);
701 		if ((u64 *)*rmapp != spte) {
702 			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
703 			BUG();
704 		}
705 		*rmapp = 0;
706 	} else {
707 		rmap_printk("rmap_remove:  %p many->many\n", spte);
708 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
709 		prev_desc = NULL;
710 		while (desc) {
711 			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
712 				if (desc->sptes[i] == spte) {
713 					rmap_desc_remove_entry(rmapp,
714 							       desc, i,
715 							       prev_desc);
716 					return;
717 				}
718 			prev_desc = desc;
719 			desc = desc->more;
720 		}
721 		pr_err("rmap_remove: %p many->many\n", spte);
722 		BUG();
723 	}
724 }
725 
static int set_spte_track_bits(u64 *sptep, u64 new_spte)
727 {
728 	pfn_t pfn;
729 	u64 old_spte = *sptep;
730 
731 	if (!spte_has_volatile_bits(old_spte))
732 		__set_spte(sptep, new_spte);
733 	else
734 		old_spte = __xchg_spte(sptep, new_spte);
735 
736 	if (!is_rmap_spte(old_spte))
737 		return 0;
738 
739 	pfn = spte_to_pfn(old_spte);
740 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
741 		kvm_set_pfn_accessed(pfn);
742 	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
743 		kvm_set_pfn_dirty(pfn);
744 	return 1;
745 }
746 
static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
748 {
749 	if (set_spte_track_bits(sptep, new_spte))
750 		rmap_remove(kvm, sptep);
751 }
752 
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
754 {
755 	struct kvm_rmap_desc *desc;
756 	u64 *prev_spte;
757 	int i;
758 
759 	if (!*rmapp)
760 		return NULL;
761 	else if (!(*rmapp & 1)) {
762 		if (!spte)
763 			return (u64 *)*rmapp;
764 		return NULL;
765 	}
766 	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
767 	prev_spte = NULL;
768 	while (desc) {
769 		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
770 			if (prev_spte == spte)
771 				return desc->sptes[i];
772 			prev_spte = desc->sptes[i];
773 		}
774 		desc = desc->more;
775 	}
776 	return NULL;
777 }
778 
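/*
 * Remove write access from every spte that maps @gfn.  Small sptes are
 * cleared of PT_WRITABLE_MASK; writable huge-page sptes are dropped
 * entirely rather than write-protected.  Returns nonzero if the caller
 * needs to flush remote TLBs.
 */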
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
780 {
781 	unsigned long *rmapp;
782 	u64 *spte;
783 	int i, write_protected = 0;
784 
785 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
786 
787 	spte = rmap_next(kvm, rmapp, NULL);
788 	while (spte) {
789 		BUG_ON(!spte);
790 		BUG_ON(!(*spte & PT_PRESENT_MASK));
791 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
792 		if (is_writable_pte(*spte)) {
793 			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
794 			write_protected = 1;
795 		}
796 		spte = rmap_next(kvm, rmapp, spte);
797 	}
798 
799 	/* check for huge page mappings */
800 	for (i = PT_DIRECTORY_LEVEL;
801 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
802 		rmapp = gfn_to_rmap(kvm, gfn, i);
803 		spte = rmap_next(kvm, rmapp, NULL);
804 		while (spte) {
805 			BUG_ON(!spte);
806 			BUG_ON(!(*spte & PT_PRESENT_MASK));
807 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
808 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
809 			if (is_writable_pte(*spte)) {
810 				drop_spte(kvm, spte,
811 					  shadow_trap_nonpresent_pte);
812 				--kvm->stat.lpages;
813 				spte = NULL;
814 				write_protected = 1;
815 			}
816 			spte = rmap_next(kvm, rmapp, spte);
817 		}
818 	}
819 
820 	return write_protected;
821 }
822 
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
824 			   unsigned long data)
825 {
826 	u64 *spte;
827 	int need_tlb_flush = 0;
828 
829 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
830 		BUG_ON(!(*spte & PT_PRESENT_MASK));
831 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832 		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
833 		need_tlb_flush = 1;
834 	}
835 	return need_tlb_flush;
836 }
837 
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
839 			     unsigned long data)
840 {
841 	int need_flush = 0;
842 	u64 *spte, new_spte;
843 	pte_t *ptep = (pte_t *)data;
844 	pfn_t new_pfn;
845 
846 	WARN_ON(pte_huge(*ptep));
847 	new_pfn = pte_pfn(*ptep);
848 	spte = rmap_next(kvm, rmapp, NULL);
849 	while (spte) {
850 		BUG_ON(!is_shadow_present_pte(*spte));
851 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852 		need_flush = 1;
853 		if (pte_write(*ptep)) {
854 			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
855 			spte = rmap_next(kvm, rmapp, NULL);
856 		} else {
857 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
858 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
859 
860 			new_spte &= ~PT_WRITABLE_MASK;
861 			new_spte &= ~SPTE_HOST_WRITEABLE;
862 			new_spte &= ~shadow_accessed_mask;
863 			set_spte_track_bits(spte, new_spte);
864 			spte = rmap_next(kvm, rmapp, spte);
865 		}
866 	}
867 	if (need_flush)
868 		kvm_flush_remote_tlbs(kvm);
869 
870 	return 0;
871 }
872 
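/*
 * Walk every memslot that contains @hva, translate it back to a gfn and
 * invoke @handler on the corresponding rmap chains at every supported
 * page size.  The handlers' return values are OR'ed together.
 */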
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
874 			  unsigned long data,
875 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
876 					 unsigned long data))
877 {
878 	int i, j;
879 	int ret;
880 	int retval = 0;
881 	struct kvm_memslots *slots;
882 
883 	slots = kvm_memslots(kvm);
884 
885 	for (i = 0; i < slots->nmemslots; i++) {
886 		struct kvm_memory_slot *memslot = &slots->memslots[i];
887 		unsigned long start = memslot->userspace_addr;
888 		unsigned long end;
889 
890 		end = start + (memslot->npages << PAGE_SHIFT);
891 		if (hva >= start && hva < end) {
892 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
893 			gfn_t gfn = memslot->base_gfn + gfn_offset;
894 
895 			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
896 
897 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
898 				struct kvm_lpage_info *linfo;
899 
900 				linfo = lpage_info_slot(gfn, memslot,
901 							PT_DIRECTORY_LEVEL + j);
902 				ret |= handler(kvm, &linfo->rmap_pde, data);
903 			}
904 			trace_kvm_age_page(hva, memslot, ret);
905 			retval |= ret;
906 		}
907 	}
908 
909 	return retval;
910 }
911 
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
913 {
914 	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
915 }
916 
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
918 {
919 	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
920 }
921 
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
923 			 unsigned long data)
924 {
925 	u64 *spte;
926 	int young = 0;
927 
928 	/*
929 	 * Emulate the accessed bit for EPT, by checking if this page has
930 	 * an EPT mapping, and clearing it if it does. On the next access,
931 	 * a new EPT mapping will be established.
932 	 * This has some overhead, but not as much as the cost of swapping
933 	 * out actively used pages or breaking up actively used hugepages.
934 	 */
935 	if (!shadow_accessed_mask)
936 		return kvm_unmap_rmapp(kvm, rmapp, data);
937 
938 	spte = rmap_next(kvm, rmapp, NULL);
939 	while (spte) {
940 		int _young;
941 		u64 _spte = *spte;
942 		BUG_ON(!(_spte & PT_PRESENT_MASK));
943 		_young = _spte & PT_ACCESSED_MASK;
944 		if (_young) {
945 			young = 1;
946 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
947 		}
948 		spte = rmap_next(kvm, rmapp, spte);
949 	}
950 	return young;
951 }
952 
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
954 			      unsigned long data)
955 {
956 	u64 *spte;
957 	int young = 0;
958 
959 	/*
960 	 * If there's no access bit in the secondary pte set by the
961 	 * hardware it's up to gup-fast/gup to set the access bit in
962 	 * the primary pte or in the page structure.
963 	 */
964 	if (!shadow_accessed_mask)
965 		goto out;
966 
967 	spte = rmap_next(kvm, rmapp, NULL);
968 	while (spte) {
969 		u64 _spte = *spte;
970 		BUG_ON(!(_spte & PT_PRESENT_MASK));
971 		young = _spte & PT_ACCESSED_MASK;
972 		if (young) {
973 			young = 1;
974 			break;
975 		}
976 		spte = rmap_next(kvm, rmapp, spte);
977 	}
978 out:
979 	return young;
980 }
981 
982 #define RMAP_RECYCLE_THRESHOLD 1000
983 
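/*
 * Used when an rmap chain grows past RMAP_RECYCLE_THRESHOLD: unmap all
 * sptes for the gfn and flush TLBs so the chain is rebuilt lazily.
 */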
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
985 {
986 	unsigned long *rmapp;
987 	struct kvm_mmu_page *sp;
988 
989 	sp = page_header(__pa(spte));
990 
991 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
992 
993 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
994 	kvm_flush_remote_tlbs(vcpu->kvm);
995 }
996 
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
998 {
999 	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
1000 }
1001 
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1003 {
1004 	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1005 }
1006 
1007 #ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
1009 {
1010 	u64 *pos;
1011 	u64 *end;
1012 
1013 	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1014 		if (is_shadow_present_pte(*pos)) {
1015 			printk(KERN_ERR "%s: %p %llx\n", __func__,
1016 			       pos, *pos);
1017 			return 0;
1018 		}
1019 	return 1;
1020 }
1021 #endif
1022 
/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster.
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1030 {
1031 	kvm->arch.n_used_mmu_pages += nr;
1032 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033 }
1034 
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1036 {
1037 	ASSERT(is_empty_shadow_page(sp->spt));
1038 	hlist_del(&sp->hash_link);
1039 	list_del(&sp->link);
1040 	free_page((unsigned long)sp->spt);
1041 	if (!sp->role.direct)
1042 		free_page((unsigned long)sp->gfns);
1043 	kmem_cache_free(mmu_page_header_cache, sp);
1044 	kvm_mod_used_mmu_pages(kvm, -1);
1045 }
1046 
static unsigned kvm_page_table_hashfn(gfn_t gfn)
1048 {
1049 	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1050 }
1051 
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1053 					       u64 *parent_pte, int direct)
1054 {
1055 	struct kvm_mmu_page *sp;
1056 
1057 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059 	if (!direct)
1060 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061 						  PAGE_SIZE);
1062 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065 	sp->multimapped = 0;
1066 	sp->parent_pte = parent_pte;
1067 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068 	return sp;
1069 }
1070 
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1072 				    struct kvm_mmu_page *sp, u64 *parent_pte)
1073 {
1074 	struct kvm_pte_chain *pte_chain;
1075 	struct hlist_node *node;
1076 	int i;
1077 
1078 	if (!parent_pte)
1079 		return;
1080 	if (!sp->multimapped) {
1081 		u64 *old = sp->parent_pte;
1082 
1083 		if (!old) {
1084 			sp->parent_pte = parent_pte;
1085 			return;
1086 		}
1087 		sp->multimapped = 1;
1088 		pte_chain = mmu_alloc_pte_chain(vcpu);
1089 		INIT_HLIST_HEAD(&sp->parent_ptes);
1090 		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1091 		pte_chain->parent_ptes[0] = old;
1092 	}
1093 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1094 		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1095 			continue;
1096 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1097 			if (!pte_chain->parent_ptes[i]) {
1098 				pte_chain->parent_ptes[i] = parent_pte;
1099 				return;
1100 			}
1101 	}
1102 	pte_chain = mmu_alloc_pte_chain(vcpu);
1103 	BUG_ON(!pte_chain);
1104 	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1105 	pte_chain->parent_ptes[0] = parent_pte;
1106 }
1107 
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109 				       u64 *parent_pte)
1110 {
1111 	struct kvm_pte_chain *pte_chain;
1112 	struct hlist_node *node;
1113 	int i;
1114 
1115 	if (!sp->multimapped) {
1116 		BUG_ON(sp->parent_pte != parent_pte);
1117 		sp->parent_pte = NULL;
1118 		return;
1119 	}
1120 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122 			if (!pte_chain->parent_ptes[i])
1123 				break;
1124 			if (pte_chain->parent_ptes[i] != parent_pte)
1125 				continue;
1126 			while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127 				&& pte_chain->parent_ptes[i + 1]) {
1128 				pte_chain->parent_ptes[i]
1129 					= pte_chain->parent_ptes[i + 1];
1130 				++i;
1131 			}
1132 			pte_chain->parent_ptes[i] = NULL;
1133 			if (i == 0) {
1134 				hlist_del(&pte_chain->link);
1135 				mmu_free_pte_chain(pte_chain);
1136 				if (hlist_empty(&sp->parent_ptes)) {
1137 					sp->multimapped = 0;
1138 					sp->parent_pte = NULL;
1139 				}
1140 			}
1141 			return;
1142 		}
1143 	BUG();
1144 }
1145 
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1147 {
1148 	struct kvm_pte_chain *pte_chain;
1149 	struct hlist_node *node;
1150 	struct kvm_mmu_page *parent_sp;
1151 	int i;
1152 
1153 	if (!sp->multimapped && sp->parent_pte) {
1154 		parent_sp = page_header(__pa(sp->parent_pte));
1155 		fn(parent_sp, sp->parent_pte);
1156 		return;
1157 	}
1158 
1159 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1160 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1161 			u64 *spte = pte_chain->parent_ptes[i];
1162 
1163 			if (!spte)
1164 				break;
1165 			parent_sp = page_header(__pa(spte));
1166 			fn(parent_sp, spte);
1167 		}
1168 }
1169 
1170 static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172 {
1173 	mmu_parent_walk(sp, mark_unsync);
1174 }
1175 
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1177 {
1178 	unsigned int index;
1179 
1180 	index = spte - sp->spt;
1181 	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1182 		return;
1183 	if (sp->unsync_children++)
1184 		return;
1185 	kvm_mmu_mark_parents_unsync(sp);
1186 }
1187 
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189 				    struct kvm_mmu_page *sp)
1190 {
1191 	int i;
1192 
1193 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194 		sp->spt[i] = shadow_trap_nonpresent_pte;
1195 }
1196 
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1198 			       struct kvm_mmu_page *sp)
1199 {
1200 	return 1;
1201 }
1202 
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1204 {
1205 }
1206 
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 				 struct kvm_mmu_page *sp, u64 *spte,
1209 				 const void *pte, unsigned long mmu_seq)
1210 {
1211 	WARN_ON(1);
1212 }
1213 
1214 #define KVM_PAGE_ARRAY_NR 16
1215 
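/*
 * kvm_mmu_pages collects up to KVM_PAGE_ARRAY_NR shadow pages found by an
 * unsync walk, together with the index each page occupies in its parent's
 * unsync_child_bitmap.
 */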
1216 struct kvm_mmu_pages {
1217 	struct mmu_page_and_offset {
1218 		struct kvm_mmu_page *sp;
1219 		unsigned int idx;
1220 	} page[KVM_PAGE_ARRAY_NR];
1221 	unsigned int nr;
1222 };
1223 
1224 #define for_each_unsync_children(bitmap, idx)		\
1225 	for (idx = find_first_bit(bitmap, 512);		\
1226 	     idx < 512;					\
1227 	     idx = find_next_bit(bitmap, 512, idx+1))
1228 
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1230 			 int idx)
1231 {
1232 	int i;
1233 
1234 	if (sp->unsync)
1235 		for (i=0; i < pvec->nr; i++)
1236 			if (pvec->page[i].sp == sp)
1237 				return 0;
1238 
1239 	pvec->page[pvec->nr].sp = sp;
1240 	pvec->page[pvec->nr].idx = idx;
1241 	pvec->nr++;
1242 	return (pvec->nr == KVM_PAGE_ARRAY_NR);
1243 }
1244 
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1246 			   struct kvm_mmu_pages *pvec)
1247 {
1248 	int i, ret, nr_unsync_leaf = 0;
1249 
1250 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1251 		struct kvm_mmu_page *child;
1252 		u64 ent = sp->spt[i];
1253 
1254 		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1255 			goto clear_child_bitmap;
1256 
1257 		child = page_header(ent & PT64_BASE_ADDR_MASK);
1258 
1259 		if (child->unsync_children) {
1260 			if (mmu_pages_add(pvec, child, i))
1261 				return -ENOSPC;
1262 
1263 			ret = __mmu_unsync_walk(child, pvec);
1264 			if (!ret)
1265 				goto clear_child_bitmap;
1266 			else if (ret > 0)
1267 				nr_unsync_leaf += ret;
1268 			else
1269 				return ret;
1270 		} else if (child->unsync) {
1271 			nr_unsync_leaf++;
1272 			if (mmu_pages_add(pvec, child, i))
1273 				return -ENOSPC;
1274 		} else
1275 			 goto clear_child_bitmap;
1276 
1277 		continue;
1278 
1279 clear_child_bitmap:
1280 		__clear_bit(i, sp->unsync_child_bitmap);
1281 		sp->unsync_children--;
1282 		WARN_ON((int)sp->unsync_children < 0);
1283 	}
1284 
1285 
1286 	return nr_unsync_leaf;
1287 }
1288 
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1290 			   struct kvm_mmu_pages *pvec)
1291 {
1292 	if (!sp->unsync_children)
1293 		return 0;
1294 
1295 	mmu_pages_add(pvec, sp, 0);
1296 	return __mmu_unsync_walk(sp, pvec);
1297 }
1298 
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1300 {
1301 	WARN_ON(!sp->unsync);
1302 	trace_kvm_mmu_sync_page(sp);
1303 	sp->unsync = 0;
1304 	--kvm->stat.mmu_unsync;
1305 }
1306 
1307 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1308 				    struct list_head *invalid_list);
1309 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1310 				    struct list_head *invalid_list);
1311 
1312 #define for_each_gfn_sp(kvm, sp, gfn, pos)				\
1313   hlist_for_each_entry(sp, pos,						\
1314    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
1315 	if ((sp)->gfn != (gfn)) {} else
1316 
1317 #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
1318   hlist_for_each_entry(sp, pos,						\
1319    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
1320 		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
1321 			(sp)->role.invalid) {} else
1322 
1323 /* @sp->gfn should be write-protected at the call site */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1325 			   struct list_head *invalid_list, bool clear_unsync)
1326 {
1327 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1328 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1329 		return 1;
1330 	}
1331 
1332 	if (clear_unsync)
1333 		kvm_unlink_unsync_page(vcpu->kvm, sp);
1334 
1335 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1336 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1337 		return 1;
1338 	}
1339 
1340 	kvm_mmu_flush_tlb(vcpu);
1341 	return 0;
1342 }
1343 
static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1345 				   struct kvm_mmu_page *sp)
1346 {
1347 	LIST_HEAD(invalid_list);
1348 	int ret;
1349 
1350 	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1351 	if (ret)
1352 		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1353 
1354 	return ret;
1355 }
1356 
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1358 			 struct list_head *invalid_list)
1359 {
1360 	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1361 }
1362 
1363 /* @gfn should be write-protected at the call site */
static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1365 {
1366 	struct kvm_mmu_page *s;
1367 	struct hlist_node *node;
1368 	LIST_HEAD(invalid_list);
1369 	bool flush = false;
1370 
1371 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1372 		if (!s->unsync)
1373 			continue;
1374 
1375 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1376 		kvm_unlink_unsync_page(vcpu->kvm, s);
1377 		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1378 			(vcpu->arch.mmu.sync_page(vcpu, s))) {
1379 			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1380 			continue;
1381 		}
1382 		flush = true;
1383 	}
1384 
1385 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1386 	if (flush)
1387 		kvm_mmu_flush_tlb(vcpu);
1388 }
1389 
1390 struct mmu_page_path {
1391 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1392 	unsigned int idx[PT64_ROOT_LEVEL-1];
1393 };
1394 
1395 #define for_each_sp(pvec, sp, parents, i)			\
1396 		for (i = mmu_pages_next(&pvec, &parents, -1),	\
1397 			sp = pvec.page[i].sp;			\
1398 			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
1399 			i = mmu_pages_next(&pvec, &parents, i))
1400 
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1402 			  struct mmu_page_path *parents,
1403 			  int i)
1404 {
1405 	int n;
1406 
1407 	for (n = i+1; n < pvec->nr; n++) {
1408 		struct kvm_mmu_page *sp = pvec->page[n].sp;
1409 
1410 		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1411 			parents->idx[0] = pvec->page[n].idx;
1412 			return n;
1413 		}
1414 
1415 		parents->parent[sp->role.level-2] = sp;
1416 		parents->idx[sp->role.level-1] = pvec->page[n].idx;
1417 	}
1418 
1419 	return n;
1420 }
1421 
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1423 {
1424 	struct kvm_mmu_page *sp;
1425 	unsigned int level = 0;
1426 
1427 	do {
1428 		unsigned int idx = parents->idx[level];
1429 
1430 		sp = parents->parent[level];
1431 		if (!sp)
1432 			return;
1433 
1434 		--sp->unsync_children;
1435 		WARN_ON((int)sp->unsync_children < 0);
1436 		__clear_bit(idx, sp->unsync_child_bitmap);
1437 		level++;
1438 	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1439 }
1440 
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1442 			       struct mmu_page_path *parents,
1443 			       struct kvm_mmu_pages *pvec)
1444 {
1445 	parents->parent[parent->role.level-1] = NULL;
1446 	pvec->nr = 0;
1447 }
1448 
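/*
 * Sync every unsync descendant of @parent: write-protect the guest page
 * tables they shadow, flush TLBs if needed, then sync (or zap) each page
 * in batches, releasing mmu_lock between batches via cond_resched_lock().
 */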
static void mmu_sync_children(struct kvm_vcpu *vcpu,
1450 			      struct kvm_mmu_page *parent)
1451 {
1452 	int i;
1453 	struct kvm_mmu_page *sp;
1454 	struct mmu_page_path parents;
1455 	struct kvm_mmu_pages pages;
1456 	LIST_HEAD(invalid_list);
1457 
1458 	kvm_mmu_pages_init(parent, &parents, &pages);
1459 	while (mmu_unsync_walk(parent, &pages)) {
1460 		int protected = 0;
1461 
1462 		for_each_sp(pages, sp, parents, i)
1463 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1464 
1465 		if (protected)
1466 			kvm_flush_remote_tlbs(vcpu->kvm);
1467 
1468 		for_each_sp(pages, sp, parents, i) {
1469 			kvm_sync_page(vcpu, sp, &invalid_list);
1470 			mmu_pages_clear_parents(&parents);
1471 		}
1472 		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1473 		cond_resched_lock(&vcpu->kvm->mmu_lock);
1474 		kvm_mmu_pages_init(parent, &parents, &pages);
1475 	}
1476 }
1477 
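/*
 * Look up a shadow page for (gfn, role) in the hash table, or allocate a
 * new one if none matches.  The quadrant field distinguishes the multiple
 * shadow pages needed to cover a single 32-bit guest page table, since a
 * shadow page holds only 512 entries.
 */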
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479 					     gfn_t gfn,
1480 					     gva_t gaddr,
1481 					     unsigned level,
1482 					     int direct,
1483 					     unsigned access,
1484 					     u64 *parent_pte)
1485 {
1486 	union kvm_mmu_page_role role;
1487 	unsigned quadrant;
1488 	struct kvm_mmu_page *sp;
1489 	struct hlist_node *node;
1490 	bool need_sync = false;
1491 
1492 	role = vcpu->arch.mmu.base_role;
1493 	role.level = level;
1494 	role.direct = direct;
1495 	if (role.direct)
1496 		role.cr4_pae = 0;
1497 	role.access = access;
1498 	if (!vcpu->arch.mmu.direct_map
1499 	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1500 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1501 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1502 		role.quadrant = quadrant;
1503 	}
1504 	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1505 		if (!need_sync && sp->unsync)
1506 			need_sync = true;
1507 
1508 		if (sp->role.word != role.word)
1509 			continue;
1510 
1511 		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1512 			break;
1513 
1514 		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1515 		if (sp->unsync_children) {
1516 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1517 			kvm_mmu_mark_parents_unsync(sp);
1518 		} else if (sp->unsync)
1519 			kvm_mmu_mark_parents_unsync(sp);
1520 
1521 		trace_kvm_mmu_get_page(sp, false);
1522 		return sp;
1523 	}
1524 	++vcpu->kvm->stat.mmu_cache_miss;
1525 	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1526 	if (!sp)
1527 		return sp;
1528 	sp->gfn = gfn;
1529 	sp->role = role;
1530 	hlist_add_head(&sp->hash_link,
1531 		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1532 	if (!direct) {
1533 		if (rmap_write_protect(vcpu->kvm, gfn))
1534 			kvm_flush_remote_tlbs(vcpu->kvm);
1535 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1536 			kvm_sync_pages(vcpu, gfn);
1537 
1538 		account_shadowed(vcpu->kvm, gfn);
1539 	}
1540 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1541 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542 	else
1543 		nonpaging_prefetch_page(vcpu, sp);
1544 	trace_kvm_mmu_get_page(sp, true);
1545 	return sp;
1546 }
1547 
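/*
 * Iterator used by for_each_shadow_entry() to walk the shadow page table
 * from the root down to the leaf spte that maps @addr, one level per
 * iteration.
 */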
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1549 			     struct kvm_vcpu *vcpu, u64 addr)
1550 {
1551 	iterator->addr = addr;
1552 	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1553 	iterator->level = vcpu->arch.mmu.shadow_root_level;
1554 
1555 	if (iterator->level == PT64_ROOT_LEVEL &&
1556 	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1557 	    !vcpu->arch.mmu.direct_map)
1558 		--iterator->level;
1559 
1560 	if (iterator->level == PT32E_ROOT_LEVEL) {
1561 		iterator->shadow_addr
1562 			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1563 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1564 		--iterator->level;
1565 		if (!iterator->shadow_addr)
1566 			iterator->level = 0;
1567 	}
1568 }
1569 
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1571 {
1572 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573 		return false;
1574 
1575 	if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576 		if (is_large_pte(*iterator->sptep))
1577 			return false;
1578 
1579 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580 	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581 	return true;
1582 }
1583 
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1585 {
1586 	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1587 	--iterator->level;
1588 }
1589 
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1591 {
1592 	u64 spte;
1593 
1594 	spte = __pa(sp->spt)
1595 		| PT_PRESENT_MASK | PT_ACCESSED_MASK
1596 		| PT_WRITABLE_MASK | PT_USER_MASK;
1597 	__set_spte(sptep, spte);
1598 }
1599 
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1601 {
1602 	if (is_large_pte(*sptep)) {
1603 		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1604 		kvm_flush_remote_tlbs(vcpu->kvm);
1605 	}
1606 }
1607 
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1609 				   unsigned direct_access)
1610 {
1611 	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1612 		struct kvm_mmu_page *child;
1613 
		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed from clean to dirty, it will corrupt the
		 * sp's access: writes would be allowed through a
		 * read-only sp, so we should update the spte at this
		 * point to get a new sp with the correct access.
		 */
1621 		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1622 		if (child->role.access == direct_access)
1623 			return;
1624 
1625 		mmu_page_remove_parent_pte(child, sptep);
1626 		__set_spte(sptep, shadow_trap_nonpresent_pte);
1627 		kvm_flush_remote_tlbs(vcpu->kvm);
1628 	}
1629 }
1630 
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632 					 struct kvm_mmu_page *sp)
1633 {
1634 	unsigned i;
1635 	u64 *pt;
1636 	u64 ent;
1637 
1638 	pt = sp->spt;
1639 
1640 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641 		ent = pt[i];
1642 
1643 		if (is_shadow_present_pte(ent)) {
1644 			if (!is_last_spte(ent, sp->role.level)) {
1645 				ent &= PT64_BASE_ADDR_MASK;
1646 				mmu_page_remove_parent_pte(page_header(ent),
1647 							   &pt[i]);
1648 			} else {
1649 				if (is_large_pte(ent))
1650 					--kvm->stat.lpages;
1651 				drop_spte(kvm, &pt[i],
1652 					  shadow_trap_nonpresent_pte);
1653 			}
1654 		}
1655 		pt[i] = shadow_trap_nonpresent_pte;
1656 	}
1657 }
1658 
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1660 {
1661 	mmu_page_remove_parent_pte(sp, parent_pte);
1662 }
1663 
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1665 {
1666 	int i;
1667 	struct kvm_vcpu *vcpu;
1668 
1669 	kvm_for_each_vcpu(i, vcpu, kvm)
1670 		vcpu->arch.last_pte_updated = NULL;
1671 }
1672 
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674 {
1675 	u64 *parent_pte;
1676 
1677 	while (sp->multimapped || sp->parent_pte) {
1678 		if (!sp->multimapped)
1679 			parent_pte = sp->parent_pte;
1680 		else {
1681 			struct kvm_pte_chain *chain;
1682 
1683 			chain = container_of(sp->parent_ptes.first,
1684 					     struct kvm_pte_chain, link);
1685 			parent_pte = chain->parent_ptes[0];
1686 		}
1687 		BUG_ON(!parent_pte);
1688 		kvm_mmu_put_page(sp, parent_pte);
1689 		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690 	}
1691 }
1692 
static int mmu_zap_unsync_children(struct kvm *kvm,
1694 				   struct kvm_mmu_page *parent,
1695 				   struct list_head *invalid_list)
1696 {
1697 	int i, zapped = 0;
1698 	struct mmu_page_path parents;
1699 	struct kvm_mmu_pages pages;
1700 
1701 	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1702 		return 0;
1703 
1704 	kvm_mmu_pages_init(parent, &parents, &pages);
1705 	while (mmu_unsync_walk(parent, &pages)) {
1706 		struct kvm_mmu_page *sp;
1707 
1708 		for_each_sp(pages, sp, parents, i) {
1709 			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1710 			mmu_pages_clear_parents(&parents);
1711 			zapped++;
1712 		}
1713 		kvm_mmu_pages_init(parent, &parents, &pages);
1714 	}
1715 
1716 	return zapped;
1717 }
1718 
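/*
 * Zapping is split in two phases so callers can batch work under
 * mmu_lock: kvm_mmu_prepare_zap_page() unlinks the page and moves it to
 * @invalid_list, and kvm_mmu_commit_zap_page() flushes remote TLBs once
 * before actually freeing everything on the list.
 */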
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1720 				    struct list_head *invalid_list)
1721 {
1722 	int ret;
1723 
1724 	trace_kvm_mmu_prepare_zap_page(sp);
1725 	++kvm->stat.mmu_shadow_zapped;
1726 	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1727 	kvm_mmu_page_unlink_children(kvm, sp);
1728 	kvm_mmu_unlink_parents(kvm, sp);
1729 	if (!sp->role.invalid && !sp->role.direct)
1730 		unaccount_shadowed(kvm, sp->gfn);
1731 	if (sp->unsync)
1732 		kvm_unlink_unsync_page(kvm, sp);
1733 	if (!sp->root_count) {
1734 		/* Count self */
1735 		ret++;
1736 		list_move(&sp->link, invalid_list);
1737 	} else {
1738 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739 		kvm_reload_remote_mmus(kvm);
1740 	}
1741 
1742 	sp->role.invalid = 1;
1743 	kvm_mmu_reset_last_pte_updated(kvm);
1744 	return ret;
1745 }
1746 
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748 				    struct list_head *invalid_list)
1749 {
1750 	struct kvm_mmu_page *sp;
1751 
1752 	if (list_empty(invalid_list))
1753 		return;
1754 
1755 	kvm_flush_remote_tlbs(kvm);
1756 
1757 	do {
1758 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759 		WARN_ON(!sp->role.invalid || sp->root_count);
1760 		kvm_mmu_free_page(kvm, sp);
1761 	} while (!list_empty(invalid_list));
1762 
1763 }
1764 
/*
 * Change the number of mmu pages allocated to the vm.
 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1770 {
1771 	LIST_HEAD(invalid_list);
	/*
	 * If we set the number of mmu pages to be smaller than the
	 * number of active pages, we must free some mmu pages before we
	 * change the value.
	 */
1777 
1778 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1779 		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1780 			!list_empty(&kvm->arch.active_mmu_pages)) {
1781 			struct kvm_mmu_page *page;
1782 
1783 			page = container_of(kvm->arch.active_mmu_pages.prev,
1784 					    struct kvm_mmu_page, link);
1785 			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786 			kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 		}
1788 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789 	}
1790 
1791 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1792 }
1793 
1794 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1795 {
1796 	struct kvm_mmu_page *sp;
1797 	struct hlist_node *node;
1798 	LIST_HEAD(invalid_list);
1799 	int r;
1800 
1801 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1802 	r = 0;
1803 
1804 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1805 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1806 			 sp->role.word);
1807 		r = 1;
1808 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1809 	}
1810 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1811 	return r;
1812 }
1813 
1814 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1815 {
1816 	struct kvm_mmu_page *sp;
1817 	struct hlist_node *node;
1818 	LIST_HEAD(invalid_list);
1819 
1820 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1821 		pgprintk("%s: zap %llx %x\n",
1822 			 __func__, gfn, sp->role.word);
1823 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1824 	}
1825 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
1826 }
1827 
1828 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1829 {
1830 	int slot = memslot_id(kvm, gfn);
1831 	struct kvm_mmu_page *sp = page_header(__pa(pte));
1832 
1833 	__set_bit(slot, sp->slot_bitmap);
1834 }
1835 
1836 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837 {
1838 	int i;
1839 	u64 *pt = sp->spt;
1840 
1841 	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842 		return;
1843 
1844 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845 		if (pt[i] == shadow_notrap_nonpresent_pte)
1846 			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847 	}
1848 }
1849 
1850 /*
1851  * The function is based on mtrr_type_lookup() in
1852  * arch/x86/kernel/cpu/mtrr/generic.c
1853  */
1854 static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1855 			 u64 start, u64 end)
1856 {
1857 	int i;
1858 	u64 base, mask;
1859 	u8 prev_match, curr_match;
1860 	int num_var_ranges = KVM_NR_VAR_MTRR;
1861 
1862 	if (!mtrr_state->enabled)
1863 		return 0xFF;
1864 
1865 	/* Make end inclusive instead of exclusive */
1866 	end--;
1867 
1868 	/* Look in fixed ranges. Just return the type as per start */
1869 	if (mtrr_state->have_fixed && (start < 0x100000)) {
1870 		int idx;
1871 
1872 		if (start < 0x80000) {
1873 			idx = 0;
1874 			idx += (start >> 16);
1875 			return mtrr_state->fixed_ranges[idx];
1876 		} else if (start < 0xC0000) {
1877 			idx = 1 * 8;
1878 			idx += ((start - 0x80000) >> 14);
1879 			return mtrr_state->fixed_ranges[idx];
1880 		} else if (start < 0x1000000) {
1881 			idx = 3 * 8;
1882 			idx += ((start - 0xC0000) >> 12);
1883 			return mtrr_state->fixed_ranges[idx];
1884 		}
1885 	}
1886 
1887 	/*
1888 	 * Look in variable ranges.
1889 	 * Look for multiple ranges matching this address and pick the type
1890 	 * as per MTRR precedence.
1891 	 */
1892 	if (!(mtrr_state->enabled & 2))
1893 		return mtrr_state->def_type;
1894 
1895 	prev_match = 0xFF;
1896 	for (i = 0; i < num_var_ranges; ++i) {
1897 		unsigned short start_state, end_state;
1898 
1899 		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1900 			continue;
1901 
1902 		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1903 		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1904 		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1905 		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1906 
1907 		start_state = ((start & mask) == (base & mask));
1908 		end_state = ((end & mask) == (base & mask));
1909 		if (start_state != end_state)
1910 			return 0xFE;
1911 
1912 		if ((start & mask) != (base & mask))
1913 			continue;
1914 
1915 		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1916 		if (prev_match == 0xFF) {
1917 			prev_match = curr_match;
1918 			continue;
1919 		}
1920 
1921 		if (prev_match == MTRR_TYPE_UNCACHABLE ||
1922 		    curr_match == MTRR_TYPE_UNCACHABLE)
1923 			return MTRR_TYPE_UNCACHABLE;
1924 
1925 		if ((prev_match == MTRR_TYPE_WRBACK &&
1926 		     curr_match == MTRR_TYPE_WRTHROUGH) ||
1927 		    (prev_match == MTRR_TYPE_WRTHROUGH &&
1928 		     curr_match == MTRR_TYPE_WRBACK)) {
1929 			prev_match = MTRR_TYPE_WRTHROUGH;
1930 			curr_match = MTRR_TYPE_WRTHROUGH;
1931 		}
1932 
1933 		if (prev_match != curr_match)
1934 			return MTRR_TYPE_UNCACHABLE;
1935 	}
1936 
1937 	if (prev_match != 0xFF)
1938 		return prev_match;
1939 
1940 	return mtrr_state->def_type;
1941 }
1942 
1943 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1944 {
1945 	u8 mtrr;
1946 
1947 	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1948 			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
1949 	if (mtrr == 0xfe || mtrr == 0xff)
1950 		mtrr = MTRR_TYPE_WRBACK;
1951 	return mtrr;
1952 }
1953 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1954 
1955 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1956 {
1957 	trace_kvm_mmu_unsync_page(sp);
1958 	++vcpu->kvm->stat.mmu_unsync;
1959 	sp->unsync = 1;
1960 
1961 	kvm_mmu_mark_parents_unsync(sp);
1962 	mmu_convert_notrap(sp);
1963 }
1964 
1965 static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1966 {
1967 	struct kvm_mmu_page *s;
1968 	struct hlist_node *node;
1969 
1970 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1971 		if (s->unsync)
1972 			continue;
1973 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1974 		__kvm_unsync_page(vcpu, s);
1975 	}
1976 }
1977 
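/*
 * Returns 1 if @gfn is shadowed by a page table that must stay
 * write-protected; otherwise marks the matching last-level shadow
 * pages unsync (when allowed) and returns 0.
 */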
1978 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1979 				  bool can_unsync)
1980 {
1981 	struct kvm_mmu_page *s;
1982 	struct hlist_node *node;
1983 	bool need_unsync = false;
1984 
1985 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1986 		if (!can_unsync)
1987 			return 1;
1988 
1989 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
1990 			return 1;
1991 
1992 		if (!need_unsync && !s->unsync) {
1993 			if (!oos_shadow)
1994 				return 1;
1995 			need_unsync = true;
1996 		}
1997 	}
1998 	if (need_unsync)
1999 		kvm_unsync_pages(vcpu, gfn);
2000 	return 0;
2001 }
2002 
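/*
 * Build a shadow pte for @gfn/@pfn from the guest access rights and
 * install it with update_spte().  Returns nonzero when write access had
 * to be refused (the gfn maps a shadowed page table, or a huge mapping
 * would cover a write-protected gfn), so the fault path can emulate the
 * access instead.
 */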
2003 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 		    unsigned pte_access, int user_fault,
2005 		    int write_fault, int dirty, int level,
2006 		    gfn_t gfn, pfn_t pfn, bool speculative,
2007 		    bool can_unsync, bool host_writable)
2008 {
2009 	u64 spte, entry = *sptep;
2010 	int ret = 0;
2011 
2012 	/*
2013 	 * We don't set the accessed bit, since we sometimes want to see
2014 	 * whether the guest actually used the pte (in order to detect
2015 	 * demand paging).
2016 	 */
2017 	spte = PT_PRESENT_MASK;
2018 	if (!speculative)
2019 		spte |= shadow_accessed_mask;
2020 	if (!dirty)
2021 		pte_access &= ~ACC_WRITE_MASK;
2022 	if (pte_access & ACC_EXEC_MASK)
2023 		spte |= shadow_x_mask;
2024 	else
2025 		spte |= shadow_nx_mask;
2026 	if (pte_access & ACC_USER_MASK)
2027 		spte |= shadow_user_mask;
2028 	if (level > PT_PAGE_TABLE_LEVEL)
2029 		spte |= PT_PAGE_SIZE_MASK;
2030 	if (tdp_enabled)
2031 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2032 			kvm_is_mmio_pfn(pfn));
2033 
2034 	if (host_writable)
2035 		spte |= SPTE_HOST_WRITEABLE;
2036 	else
2037 		pte_access &= ~ACC_WRITE_MASK;
2038 
2039 	spte |= (u64)pfn << PAGE_SHIFT;
2040 
2041 	if ((pte_access & ACC_WRITE_MASK)
2042 	    || (!vcpu->arch.mmu.direct_map && write_fault
2043 		&& !is_write_protection(vcpu) && !user_fault)) {
2044 
2045 		if (level > PT_PAGE_TABLE_LEVEL &&
2046 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047 			ret = 1;
2048 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2049 			goto done;
2050 		}
2051 
2052 		spte |= PT_WRITABLE_MASK;
2053 
2054 		if (!vcpu->arch.mmu.direct_map
2055 		    && !(pte_access & ACC_WRITE_MASK))
2056 			spte &= ~PT_USER_MASK;
2057 
2058 		/*
2059 		 * Optimization: for pte sync, if spte was writable the hash
2060 		 * lookup is unnecessary (and expensive). Write protection
2061 		 * is the responsibility of mmu_get_page / kvm_sync_page.
2062 		 * Same reasoning can be applied to dirty page accounting.
2063 		 */
2064 		if (!can_unsync && is_writable_pte(*sptep))
2065 			goto set_pte;
2066 
2067 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2068 			pgprintk("%s: found shadow page for %llx, marking ro\n",
2069 				 __func__, gfn);
2070 			ret = 1;
2071 			pte_access &= ~ACC_WRITE_MASK;
2072 			if (is_writable_pte(spte))
2073 				spte &= ~PT_WRITABLE_MASK;
2074 		}
2075 	}
2076 
2077 	if (pte_access & ACC_WRITE_MASK)
2078 		mark_page_dirty(vcpu->kvm, gfn);
2079 
2080 set_pte:
2081 	update_spte(sptep, spte);
2082 	/*
2083 	 * If we overwrite a writable spte with a read-only one we
2084 	 * should flush remote TLBs. Otherwise rmap_write_protect
2085 	 * will find a read-only spte, even though the writable spte
2086 	 * might be cached on a CPU's TLB.
2087 	 */
2088 	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2089 		kvm_flush_remote_tlbs(vcpu->kvm);
2090 done:
2091 	return ret;
2092 }
2093 
2094 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095 			 unsigned pt_access, unsigned pte_access,
2096 			 int user_fault, int write_fault, int dirty,
2097 			 int *ptwrite, int level, gfn_t gfn,
2098 			 pfn_t pfn, bool speculative,
2099 			 bool host_writable)
2100 {
2101 	int was_rmapped = 0;
2102 	int rmap_count;
2103 
2104 	pgprintk("%s: spte %llx access %x write_fault %d"
2105 		 " user_fault %d gfn %llx\n",
2106 		 __func__, *sptep, pt_access,
2107 		 write_fault, user_fault, gfn);
2108 
2109 	if (is_rmap_spte(*sptep)) {
2110 		/*
2111 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2112 		 * the parent of the now unreachable PTE.
2113 		 */
2114 		if (level > PT_PAGE_TABLE_LEVEL &&
2115 		    !is_large_pte(*sptep)) {
2116 			struct kvm_mmu_page *child;
2117 			u64 pte = *sptep;
2118 
2119 			child = page_header(pte & PT64_BASE_ADDR_MASK);
2120 			mmu_page_remove_parent_pte(child, sptep);
2121 			__set_spte(sptep, shadow_trap_nonpresent_pte);
2122 			kvm_flush_remote_tlbs(vcpu->kvm);
2123 		} else if (pfn != spte_to_pfn(*sptep)) {
2124 			pgprintk("hfn old %llx new %llx\n",
2125 				 spte_to_pfn(*sptep), pfn);
2126 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2127 			kvm_flush_remote_tlbs(vcpu->kvm);
2128 		} else
2129 			was_rmapped = 1;
2130 	}
2131 
2132 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133 		      dirty, level, gfn, pfn, speculative, true,
2134 		      host_writable)) {
2135 		if (write_fault)
2136 			*ptwrite = 1;
2137 		kvm_mmu_flush_tlb(vcpu);
2138 	}
2139 
2140 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142 		 is_large_pte(*sptep)? "2MB" : "4kB",
2143 		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2144 		 *sptep, sptep);
2145 	if (!was_rmapped && is_large_pte(*sptep))
2146 		++vcpu->kvm->stat.lpages;
2147 
2148 	page_header_update_slot(vcpu->kvm, sptep, gfn);
2149 	if (!was_rmapped) {
2150 		rmap_count = rmap_add(vcpu, sptep, gfn);
2151 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2152 			rmap_recycle(vcpu, sptep, gfn);
2153 	}
2154 	kvm_release_pfn_clean(pfn);
2155 	if (speculative) {
2156 		vcpu->arch.last_pte_updated = sptep;
2157 		vcpu->arch.last_pte_gfn = gfn;
2158 	}
2159 }
2160 
2161 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2162 {
2163 }
2164 
2165 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2166 				     bool no_dirty_log)
2167 {
2168 	struct kvm_memory_slot *slot;
2169 	unsigned long hva;
2170 
2171 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 	if (!slot) {
2173 		get_page(bad_page);
2174 		return page_to_pfn(bad_page);
2175 	}
2176 
2177 	hva = gfn_to_hva_memslot(slot, gfn);
2178 
2179 	return hva_to_pfn_atomic(vcpu->kvm, hva);
2180 }
2181 
2182 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2183 				    struct kvm_mmu_page *sp,
2184 				    u64 *start, u64 *end)
2185 {
2186 	struct page *pages[PTE_PREFETCH_NUM];
2187 	unsigned access = sp->role.access;
2188 	int i, ret;
2189 	gfn_t gfn;
2190 
2191 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2192 	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2193 		return -1;
2194 
2195 	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2196 	if (ret <= 0)
2197 		return -1;
2198 
2199 	for (i = 0; i < ret; i++, gfn++, start++)
2200 		mmu_set_spte(vcpu, start, ACC_ALL,
2201 			     access, 0, 0, 1, NULL,
2202 			     sp->role.level, gfn,
2203 			     page_to_pfn(pages[i]), true, true);
2204 
2205 	return 0;
2206 }
2207 
2208 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2209 				  struct kvm_mmu_page *sp, u64 *sptep)
2210 {
2211 	u64 *spte, *start = NULL;
2212 	int i;
2213 
2214 	WARN_ON(!sp->role.direct);
2215 
2216 	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2217 	spte = sp->spt + i;
2218 
2219 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2221 			if (!start)
2222 				continue;
2223 			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2224 				break;
2225 			start = NULL;
2226 		} else if (!start)
2227 			start = spte;
2228 	}
2229 }
2230 
2231 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2232 {
2233 	struct kvm_mmu_page *sp;
2234 
2235 	/*
2236 	 * Since there is no accessed bit on EPT, there is no way to
2237 	 * distinguish between actually accessed translations
2238 	 * and prefetched ones, so disable pte prefetch if EPT is
2239 	 * enabled.
2240 	 */
2241 	if (!shadow_accessed_mask)
2242 		return;
2243 
2244 	sp = page_header(__pa(sptep));
2245 	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2246 		return;
2247 
2248 	__direct_pte_prefetch(vcpu, sp, sptep);
2249 }
2250 
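/*
 * Walk the shadow page table for a direct mapping of @v, allocating
 * intermediate shadow pages as needed, and install the final spte at
 * @level.  Returns the pt_write value reported by mmu_set_spte(), or
 * -ENOMEM if a shadow page could not be allocated.
 */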
2251 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2252 			int map_writable, int level, gfn_t gfn, pfn_t pfn,
2253 			bool prefault)
2254 {
2255 	struct kvm_shadow_walk_iterator iterator;
2256 	struct kvm_mmu_page *sp;
2257 	int pt_write = 0;
2258 	gfn_t pseudo_gfn;
2259 
2260 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2261 		if (iterator.level == level) {
2262 			unsigned pte_access = ACC_ALL;
2263 
2264 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265 				     0, write, 1, &pt_write,
2266 				     level, gfn, pfn, prefault, map_writable);
2267 			direct_pte_prefetch(vcpu, iterator.sptep);
2268 			++vcpu->stat.pf_fixed;
2269 			break;
2270 		}
2271 
2272 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2273 			u64 base_addr = iterator.addr;
2274 
2275 			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2276 			pseudo_gfn = base_addr >> PAGE_SHIFT;
2277 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2278 					      iterator.level - 1,
2279 					      1, ACC_ALL, iterator.sptep);
2280 			if (!sp) {
2281 				pgprintk("nonpaging_map: ENOMEM\n");
2282 				kvm_release_pfn_clean(pfn);
2283 				return -ENOMEM;
2284 			}
2285 
2286 			__set_spte(iterator.sptep,
2287 				   __pa(sp->spt)
2288 				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289 				   | shadow_user_mask | shadow_x_mask
2290 				   | shadow_accessed_mask);
2291 		}
2292 	}
2293 	return pt_write;
2294 }
2295 
2296 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2297 {
2298 	siginfo_t info;
2299 
2300 	info.si_signo	= SIGBUS;
2301 	info.si_errno	= 0;
2302 	info.si_code	= BUS_MCEERR_AR;
2303 	info.si_addr	= (void __user *)address;
2304 	info.si_addr_lsb = PAGE_SHIFT;
2305 
2306 	send_sig_info(SIGBUS, &info, tsk);
2307 }
2308 
2309 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2310 {
2311 	kvm_release_pfn_clean(pfn);
2312 	if (is_hwpoison_pfn(pfn)) {
2313 		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2314 		return 0;
2315 	} else if (is_fault_pfn(pfn))
2316 		return -EFAULT;
2317 
2318 	return 1;
2319 }
2320 
2321 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2322 					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2323 {
2324 	pfn_t pfn = *pfnp;
2325 	gfn_t gfn = *gfnp;
2326 	int level = *levelp;
2327 
2328 	/*
2329 	 * Check if it's a transparent hugepage. If this were a
2330 	 * hugetlbfs page, level wouldn't be set to
2331 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2332 	 * here.
2333 	 */
2334 	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2335 	    level == PT_PAGE_TABLE_LEVEL &&
2336 	    PageTransCompound(pfn_to_page(pfn)) &&
2337 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2338 		unsigned long mask;
2339 		/*
2340 		 * mmu_notifier_retry was successful and we hold the
2341 		 * mmu_lock here, so the pmd can't be split out
2342 		 * from under us, and in turn
2343 		 * __split_huge_page_refcount() can't run from under
2344 		 * us and we can safely transfer the refcount from
2345 		 * PG_tail to PG_head as we switch the pfn from tail to
2346 		 * head.
2347 		 */
2348 		*levelp = level = PT_DIRECTORY_LEVEL;
2349 		mask = KVM_PAGES_PER_HPAGE(level) - 1;
2350 		VM_BUG_ON((gfn & mask) != (pfn & mask));
2351 		if (pfn & mask) {
2352 			gfn &= ~mask;
2353 			*gfnp = gfn;
2354 			kvm_release_pfn_clean(pfn);
2355 			pfn &= ~mask;
2356 			if (!get_page_unless_zero(pfn_to_page(pfn)))
2357 				BUG();
2358 			*pfnp = pfn;
2359 		}
2360 	}
2361 }
2362 
2363 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365 
2366 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2367 			 bool prefault)
2368 {
2369 	int r;
2370 	int level;
2371 	int force_pt_level;
2372 	pfn_t pfn;
2373 	unsigned long mmu_seq;
2374 	bool map_writable;
2375 
2376 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2377 	if (likely(!force_pt_level)) {
2378 		level = mapping_level(vcpu, gfn);
2379 		/*
2380 		 * This path builds a PAE page table, so we can map
2381 		 * 2MB pages at most. Therefore check if the level
2382 		 * is larger than that.
2383 		 */
2384 		if (level > PT_DIRECTORY_LEVEL)
2385 			level = PT_DIRECTORY_LEVEL;
2386 
2387 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2388 	} else
2389 		level = PT_PAGE_TABLE_LEVEL;
2390 
2391 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2392 	smp_rmb();
2393 
2394 	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 		return 0;
2396 
2397 	/* mmio */
2398 	if (is_error_pfn(pfn))
2399 		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2400 
2401 	spin_lock(&vcpu->kvm->mmu_lock);
2402 	if (mmu_notifier_retry(vcpu, mmu_seq))
2403 		goto out_unlock;
2404 	kvm_mmu_free_some_pages(vcpu);
2405 	if (likely(!force_pt_level))
2406 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2407 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2408 			 prefault);
2409 	spin_unlock(&vcpu->kvm->mmu_lock);
2410 
2411 
2412 	return r;
2413 
2414 out_unlock:
2415 	spin_unlock(&vcpu->kvm->mmu_lock);
2416 	kvm_release_pfn_clean(pfn);
2417 	return 0;
2418 }
2419 
2420 
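/*
 * Drop the reference this vcpu holds on its shadow root pages (either
 * the single long-mode root or the four PAE roots), zap roots that have
 * become invalid, and reset root_hpa to INVALID_PAGE.
 */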
2421 static void mmu_free_roots(struct kvm_vcpu *vcpu)
2422 {
2423 	int i;
2424 	struct kvm_mmu_page *sp;
2425 	LIST_HEAD(invalid_list);
2426 
2427 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2428 		return;
2429 	spin_lock(&vcpu->kvm->mmu_lock);
2430 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2431 	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2432 	     vcpu->arch.mmu.direct_map)) {
2433 		hpa_t root = vcpu->arch.mmu.root_hpa;
2434 
2435 		sp = page_header(root);
2436 		--sp->root_count;
2437 		if (!sp->root_count && sp->role.invalid) {
2438 			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2439 			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2440 		}
2441 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2442 		spin_unlock(&vcpu->kvm->mmu_lock);
2443 		return;
2444 	}
2445 	for (i = 0; i < 4; ++i) {
2446 		hpa_t root = vcpu->arch.mmu.pae_root[i];
2447 
2448 		if (root) {
2449 			root &= PT64_BASE_ADDR_MASK;
2450 			sp = page_header(root);
2451 			--sp->root_count;
2452 			if (!sp->root_count && sp->role.invalid)
2453 				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2454 							 &invalid_list);
2455 		}
2456 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2457 	}
2458 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2459 	spin_unlock(&vcpu->kvm->mmu_lock);
2460 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2461 }
2462 
2463 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2464 {
2465 	int ret = 0;
2466 
2467 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2468 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2469 		ret = 1;
2470 	}
2471 
2472 	return ret;
2473 }
2474 
2475 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2476 {
2477 	struct kvm_mmu_page *sp;
2478 	unsigned i;
2479 
2480 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2481 		spin_lock(&vcpu->kvm->mmu_lock);
2482 		kvm_mmu_free_some_pages(vcpu);
2483 		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2484 				      1, ACC_ALL, NULL);
2485 		++sp->root_count;
2486 		spin_unlock(&vcpu->kvm->mmu_lock);
2487 		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2488 	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2489 		for (i = 0; i < 4; ++i) {
2490 			hpa_t root = vcpu->arch.mmu.pae_root[i];
2491 
2492 			ASSERT(!VALID_PAGE(root));
2493 			spin_lock(&vcpu->kvm->mmu_lock);
2494 			kvm_mmu_free_some_pages(vcpu);
2495 			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2496 					      i << 30,
2497 					      PT32_ROOT_LEVEL, 1, ACC_ALL,
2498 					      NULL);
2499 			root = __pa(sp->spt);
2500 			++sp->root_count;
2501 			spin_unlock(&vcpu->kvm->mmu_lock);
2502 			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2503 		}
2504 		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2505 	} else
2506 		BUG();
2507 
2508 	return 0;
2509 }
2510 
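/*
 * Allocate shadow roots for a guest that uses paging: a single root
 * page when shadowing a long-mode table, or four PAE roots otherwise
 * (plus an on-demand lm_root when a 32-bit guest runs on a long-mode
 * shadow).  Returns 1 on failure, e.g. when a guest root gfn is not
 * backed by a memslot.
 */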
2511 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2512 {
2513 	struct kvm_mmu_page *sp;
2514 	u64 pdptr, pm_mask;
2515 	gfn_t root_gfn;
2516 	int i;
2517 
2518 	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2519 
2520 	if (mmu_check_root(vcpu, root_gfn))
2521 		return 1;
2522 
2523 	/*
2524 	 * Do we shadow a long mode page table? If so we need to
2525 	 * write-protect the guest's page table root.
2526 	 */
2527 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2528 		hpa_t root = vcpu->arch.mmu.root_hpa;
2529 
2530 		ASSERT(!VALID_PAGE(root));
2531 
2532 		spin_lock(&vcpu->kvm->mmu_lock);
2533 		kvm_mmu_free_some_pages(vcpu);
2534 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2535 				      0, ACC_ALL, NULL);
2536 		root = __pa(sp->spt);
2537 		++sp->root_count;
2538 		spin_unlock(&vcpu->kvm->mmu_lock);
2539 		vcpu->arch.mmu.root_hpa = root;
2540 		return 0;
2541 	}
2542 
2543 	/*
2544 	 * We shadow a 32 bit page table. This may be a legacy 2-level
2545 	 * or a PAE 3-level page table. In either case we need to be aware that
2546 	 * the shadow page table may be a PAE or a long mode page table.
2547 	 */
2548 	pm_mask = PT_PRESENT_MASK;
2549 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2550 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2551 
2552 	for (i = 0; i < 4; ++i) {
2553 		hpa_t root = vcpu->arch.mmu.pae_root[i];
2554 
2555 		ASSERT(!VALID_PAGE(root));
2556 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2557 			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2558 			if (!is_present_gpte(pdptr)) {
2559 				vcpu->arch.mmu.pae_root[i] = 0;
2560 				continue;
2561 			}
2562 			root_gfn = pdptr >> PAGE_SHIFT;
2563 			if (mmu_check_root(vcpu, root_gfn))
2564 				return 1;
2565 		}
2566 		spin_lock(&vcpu->kvm->mmu_lock);
2567 		kvm_mmu_free_some_pages(vcpu);
2568 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2569 				      PT32_ROOT_LEVEL, 0,
2570 				      ACC_ALL, NULL);
2571 		root = __pa(sp->spt);
2572 		++sp->root_count;
2573 		spin_unlock(&vcpu->kvm->mmu_lock);
2574 
2575 		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2576 	}
2577 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2578 
2579 	/*
2580 	 * If we shadow a 32 bit page table with a long mode page
2581 	 * table we enter this path.
2582 	 */
2583 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2584 		if (vcpu->arch.mmu.lm_root == NULL) {
2585 			/*
2586 			 * The additional page necessary for this is only
2587 			 * allocated on demand.
2588 			 */
2589 
2590 			u64 *lm_root;
2591 
2592 			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2593 			if (lm_root == NULL)
2594 				return 1;
2595 
2596 			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2597 
2598 			vcpu->arch.mmu.lm_root = lm_root;
2599 		}
2600 
2601 		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2602 	}
2603 
2604 	return 0;
2605 }
2606 
2607 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2608 {
2609 	if (vcpu->arch.mmu.direct_map)
2610 		return mmu_alloc_direct_roots(vcpu);
2611 	else
2612 		return mmu_alloc_shadow_roots(vcpu);
2613 }
2614 
2615 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2616 {
2617 	int i;
2618 	struct kvm_mmu_page *sp;
2619 
2620 	if (vcpu->arch.mmu.direct_map)
2621 		return;
2622 
2623 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624 		return;
2625 
2626 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628 		hpa_t root = vcpu->arch.mmu.root_hpa;
2629 		sp = page_header(root);
2630 		mmu_sync_children(vcpu, sp);
2631 		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2632 		return;
2633 	}
2634 	for (i = 0; i < 4; ++i) {
2635 		hpa_t root = vcpu->arch.mmu.pae_root[i];
2636 
2637 		if (root && VALID_PAGE(root)) {
2638 			root &= PT64_BASE_ADDR_MASK;
2639 			sp = page_header(root);
2640 			mmu_sync_children(vcpu, sp);
2641 		}
2642 	}
2643 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2644 }
2645 
2646 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2647 {
2648 	spin_lock(&vcpu->kvm->mmu_lock);
2649 	mmu_sync_roots(vcpu);
2650 	spin_unlock(&vcpu->kvm->mmu_lock);
2651 }
2652 
2653 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2654 				  u32 access, struct x86_exception *exception)
2655 {
2656 	if (exception)
2657 		exception->error_code = 0;
2658 	return vaddr;
2659 }
2660 
2661 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2662 					 u32 access,
2663 					 struct x86_exception *exception)
2664 {
2665 	if (exception)
2666 		exception->error_code = 0;
2667 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668 }
2669 
2670 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671 				u32 error_code, bool prefault)
2672 {
2673 	gfn_t gfn;
2674 	int r;
2675 
2676 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2677 	r = mmu_topup_memory_caches(vcpu);
2678 	if (r)
2679 		return r;
2680 
2681 	ASSERT(vcpu);
2682 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2683 
2684 	gfn = gva >> PAGE_SHIFT;
2685 
2686 	return nonpaging_map(vcpu, gva & PAGE_MASK,
2687 			     error_code & PFERR_WRITE_MASK, gfn, prefault);
2688 }
2689 
2690 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2691 {
2692 	struct kvm_arch_async_pf arch;
2693 
2694 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2695 	arch.gfn = gfn;
2696 	arch.direct_map = vcpu->arch.mmu.direct_map;
2697 	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2698 
2699 	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2700 }
2701 
2702 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2703 {
2704 	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2705 		     kvm_event_needs_reinjection(vcpu)))
2706 		return false;
2707 
2708 	return kvm_x86_ops->interrupt_allowed(vcpu);
2709 }
2710 
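/*
 * Translate @gfn to a host pfn.  If the page is not immediately
 * available and an async page fault can be injected, queue one and
 * return true so the fault handler bails out early; otherwise fall back
 * to a blocking gfn_to_pfn_prot() and return false.
 */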
2711 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2712 			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2713 {
2714 	bool async;
2715 
2716 	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2717 
2718 	if (!async)
2719 		return false; /* *pfn has correct page already */
2720 
2721 	put_page(pfn_to_page(*pfn));
2722 
2723 	if (!prefault && can_do_async_pf(vcpu)) {
2724 		trace_kvm_try_async_get_page(gva, gfn);
2725 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2726 			trace_kvm_async_pf_doublefault(gva, gfn);
2727 			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2728 			return true;
2729 		} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2730 			return true;
2731 	}
2732 
2733 	*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2734 
2735 	return false;
2736 }
2737 
2738 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2739 			  bool prefault)
2740 {
2741 	pfn_t pfn;
2742 	int r;
2743 	int level;
2744 	int force_pt_level;
2745 	gfn_t gfn = gpa >> PAGE_SHIFT;
2746 	unsigned long mmu_seq;
2747 	int write = error_code & PFERR_WRITE_MASK;
2748 	bool map_writable;
2749 
2750 	ASSERT(vcpu);
2751 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752 
2753 	r = mmu_topup_memory_caches(vcpu);
2754 	if (r)
2755 		return r;
2756 
2757 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2758 	if (likely(!force_pt_level)) {
2759 		level = mapping_level(vcpu, gfn);
2760 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2761 	} else
2762 		level = PT_PAGE_TABLE_LEVEL;
2763 
2764 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2765 	smp_rmb();
2766 
2767 	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 		return 0;
2769 
2770 	/* mmio */
2771 	if (is_error_pfn(pfn))
2772 		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2773 	spin_lock(&vcpu->kvm->mmu_lock);
2774 	if (mmu_notifier_retry(vcpu, mmu_seq))
2775 		goto out_unlock;
2776 	kvm_mmu_free_some_pages(vcpu);
2777 	if (likely(!force_pt_level))
2778 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2779 	r = __direct_map(vcpu, gpa, write, map_writable,
2780 			 level, gfn, pfn, prefault);
2781 	spin_unlock(&vcpu->kvm->mmu_lock);
2782 
2783 	return r;
2784 
2785 out_unlock:
2786 	spin_unlock(&vcpu->kvm->mmu_lock);
2787 	kvm_release_pfn_clean(pfn);
2788 	return 0;
2789 }
2790 
2791 static void nonpaging_free(struct kvm_vcpu *vcpu)
2792 {
2793 	mmu_free_roots(vcpu);
2794 }
2795 
2796 static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2797 				  struct kvm_mmu *context)
2798 {
2799 	context->new_cr3 = nonpaging_new_cr3;
2800 	context->page_fault = nonpaging_page_fault;
2801 	context->gva_to_gpa = nonpaging_gva_to_gpa;
2802 	context->free = nonpaging_free;
2803 	context->prefetch_page = nonpaging_prefetch_page;
2804 	context->sync_page = nonpaging_sync_page;
2805 	context->invlpg = nonpaging_invlpg;
2806 	context->update_pte = nonpaging_update_pte;
2807 	context->root_level = 0;
2808 	context->shadow_root_level = PT32E_ROOT_LEVEL;
2809 	context->root_hpa = INVALID_PAGE;
2810 	context->direct_map = true;
2811 	context->nx = false;
2812 	return 0;
2813 }
2814 
2815 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2816 {
2817 	++vcpu->stat.tlb_flush;
2818 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2819 }
2820 
2821 static void paging_new_cr3(struct kvm_vcpu *vcpu)
2822 {
2823 	pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2824 	mmu_free_roots(vcpu);
2825 }
2826 
2827 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2828 {
2829 	return kvm_read_cr3(vcpu);
2830 }
2831 
2832 static void inject_page_fault(struct kvm_vcpu *vcpu,
2833 			      struct x86_exception *fault)
2834 {
2835 	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2836 }
2837 
2838 static void paging_free(struct kvm_vcpu *vcpu)
2839 {
2840 	nonpaging_free(vcpu);
2841 }
2842 
2843 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2844 {
2845 	int bit7;
2846 
2847 	bit7 = (gpte >> 7) & 1;
2848 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2849 }
2850 
2851 #define PTTYPE 64
2852 #include "paging_tmpl.h"
2853 #undef PTTYPE
2854 
2855 #define PTTYPE 32
2856 #include "paging_tmpl.h"
2857 #undef PTTYPE
2858 
2859 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2860 				  struct kvm_mmu *context,
2861 				  int level)
2862 {
2863 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2864 	u64 exb_bit_rsvd = 0;
2865 
2866 	if (!context->nx)
2867 		exb_bit_rsvd = rsvd_bits(63, 63);
2868 	switch (level) {
2869 	case PT32_ROOT_LEVEL:
2870 		/* no rsvd bits for 2 level 4K page table entries */
2871 		context->rsvd_bits_mask[0][1] = 0;
2872 		context->rsvd_bits_mask[0][0] = 0;
2873 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2874 
2875 		if (!is_pse(vcpu)) {
2876 			context->rsvd_bits_mask[1][1] = 0;
2877 			break;
2878 		}
2879 
2880 		if (is_cpuid_PSE36())
2881 			/* 36bits PSE 4MB page */
2882 			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2883 		else
2884 			/* 32 bits PSE 4MB page */
2885 			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2886 		break;
2887 	case PT32E_ROOT_LEVEL:
2888 		context->rsvd_bits_mask[0][2] =
2889 			rsvd_bits(maxphyaddr, 63) |
2890 			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
2891 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2892 			rsvd_bits(maxphyaddr, 62);	/* PDE */
2893 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2894 			rsvd_bits(maxphyaddr, 62); 	/* PTE */
2895 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2896 			rsvd_bits(maxphyaddr, 62) |
2897 			rsvd_bits(13, 20);		/* large page */
2898 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2899 		break;
2900 	case PT64_ROOT_LEVEL:
2901 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2902 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2903 		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2904 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2905 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2906 			rsvd_bits(maxphyaddr, 51);
2907 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2908 			rsvd_bits(maxphyaddr, 51);
2909 		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2910 		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2911 			rsvd_bits(maxphyaddr, 51) |
2912 			rsvd_bits(13, 29);
2913 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2914 			rsvd_bits(maxphyaddr, 51) |
2915 			rsvd_bits(13, 20);		/* large page */
2916 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2917 		break;
2918 	}
2919 }
2920 
2921 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2922 					struct kvm_mmu *context,
2923 					int level)
2924 {
2925 	context->nx = is_nx(vcpu);
2926 
2927 	reset_rsvds_bits_mask(vcpu, context, level);
2928 
2929 	ASSERT(is_pae(vcpu));
2930 	context->new_cr3 = paging_new_cr3;
2931 	context->page_fault = paging64_page_fault;
2932 	context->gva_to_gpa = paging64_gva_to_gpa;
2933 	context->prefetch_page = paging64_prefetch_page;
2934 	context->sync_page = paging64_sync_page;
2935 	context->invlpg = paging64_invlpg;
2936 	context->update_pte = paging64_update_pte;
2937 	context->free = paging_free;
2938 	context->root_level = level;
2939 	context->shadow_root_level = level;
2940 	context->root_hpa = INVALID_PAGE;
2941 	context->direct_map = false;
2942 	return 0;
2943 }
2944 
2945 static int paging64_init_context(struct kvm_vcpu *vcpu,
2946 				 struct kvm_mmu *context)
2947 {
2948 	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2949 }
2950 
2951 static int paging32_init_context(struct kvm_vcpu *vcpu,
2952 				 struct kvm_mmu *context)
2953 {
2954 	context->nx = false;
2955 
2956 	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2957 
2958 	context->new_cr3 = paging_new_cr3;
2959 	context->page_fault = paging32_page_fault;
2960 	context->gva_to_gpa = paging32_gva_to_gpa;
2961 	context->free = paging_free;
2962 	context->prefetch_page = paging32_prefetch_page;
2963 	context->sync_page = paging32_sync_page;
2964 	context->invlpg = paging32_invlpg;
2965 	context->update_pte = paging32_update_pte;
2966 	context->root_level = PT32_ROOT_LEVEL;
2967 	context->shadow_root_level = PT32E_ROOT_LEVEL;
2968 	context->root_hpa = INVALID_PAGE;
2969 	context->direct_map = false;
2970 	return 0;
2971 }
2972 
2973 static int paging32E_init_context(struct kvm_vcpu *vcpu,
2974 				  struct kvm_mmu *context)
2975 {
2976 	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2977 }
2978 
2979 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2980 {
2981 	struct kvm_mmu *context = vcpu->arch.walk_mmu;
2982 
2983 	context->base_role.word = 0;
2984 	context->new_cr3 = nonpaging_new_cr3;
2985 	context->page_fault = tdp_page_fault;
2986 	context->free = nonpaging_free;
2987 	context->prefetch_page = nonpaging_prefetch_page;
2988 	context->sync_page = nonpaging_sync_page;
2989 	context->invlpg = nonpaging_invlpg;
2990 	context->update_pte = nonpaging_update_pte;
2991 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2992 	context->root_hpa = INVALID_PAGE;
2993 	context->direct_map = true;
2994 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2995 	context->get_cr3 = get_cr3;
2996 	context->inject_page_fault = kvm_inject_page_fault;
2997 	context->nx = is_nx(vcpu);
2998 
2999 	if (!is_paging(vcpu)) {
3000 		context->nx = false;
3001 		context->gva_to_gpa = nonpaging_gva_to_gpa;
3002 		context->root_level = 0;
3003 	} else if (is_long_mode(vcpu)) {
3004 		context->nx = is_nx(vcpu);
3005 		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
3006 		context->gva_to_gpa = paging64_gva_to_gpa;
3007 		context->root_level = PT64_ROOT_LEVEL;
3008 	} else if (is_pae(vcpu)) {
3009 		context->nx = is_nx(vcpu);
3010 		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
3011 		context->gva_to_gpa = paging64_gva_to_gpa;
3012 		context->root_level = PT32E_ROOT_LEVEL;
3013 	} else {
3014 		context->nx = false;
3015 		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
3016 		context->gva_to_gpa = paging32_gva_to_gpa;
3017 		context->root_level = PT32_ROOT_LEVEL;
3018 	}
3019 
3020 	return 0;
3021 }
3022 
3023 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024 {
3025 	int r;
3026 	ASSERT(vcpu);
3027 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028 
3029 	if (!is_paging(vcpu))
3030 		r = nonpaging_init_context(vcpu, context);
3031 	else if (is_long_mode(vcpu))
3032 		r = paging64_init_context(vcpu, context);
3033 	else if (is_pae(vcpu))
3034 		r = paging32E_init_context(vcpu, context);
3035 	else
3036 		r = paging32_init_context(vcpu, context);
3037 
3038 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
3040 
3041 	return r;
3042 }
3043 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3044 
3045 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3046 {
3047 	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
3048 
3049 	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
3050 	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
3051 	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3052 
3053 	return r;
3054 }
3055 
3056 static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3057 {
3058 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3059 
3060 	g_context->get_cr3           = get_cr3;
3061 	g_context->inject_page_fault = kvm_inject_page_fault;
3062 
3063 	/*
3064 	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3065 	 * translation of l2_gpa to l1_gpa addresses is done using the
3066 	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3067 	 * functions between mmu and nested_mmu are swapped.
3068 	 */
3069 	if (!is_paging(vcpu)) {
3070 		g_context->nx = false;
3071 		g_context->root_level = 0;
3072 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3073 	} else if (is_long_mode(vcpu)) {
3074 		g_context->nx = is_nx(vcpu);
3075 		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3076 		g_context->root_level = PT64_ROOT_LEVEL;
3077 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3078 	} else if (is_pae(vcpu)) {
3079 		g_context->nx = is_nx(vcpu);
3080 		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3081 		g_context->root_level = PT32E_ROOT_LEVEL;
3082 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3083 	} else {
3084 		g_context->nx = false;
3085 		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3086 		g_context->root_level = PT32_ROOT_LEVEL;
3087 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3088 	}
3089 
3090 	return 0;
3091 }
3092 
3093 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3094 {
3095 	if (mmu_is_nested(vcpu))
3096 		return init_kvm_nested_mmu(vcpu);
3097 	else if (tdp_enabled)
3098 		return init_kvm_tdp_mmu(vcpu);
3099 	else
3100 		return init_kvm_softmmu(vcpu);
3101 }
3102 
3103 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
3104 {
3105 	ASSERT(vcpu);
3106 	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
3107 		/* mmu.free() should set root_hpa = INVALID_PAGE */
3108 		vcpu->arch.mmu.free(vcpu);
3109 }
3110 
3111 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3112 {
3113 	destroy_kvm_mmu(vcpu);
3114 	return init_kvm_mmu(vcpu);
3115 }
3116 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3117 
3118 int kvm_mmu_load(struct kvm_vcpu *vcpu)
3119 {
3120 	int r;
3121 
3122 	r = mmu_topup_memory_caches(vcpu);
3123 	if (r)
3124 		goto out;
3125 	r = mmu_alloc_roots(vcpu);
3126 	spin_lock(&vcpu->kvm->mmu_lock);
3127 	mmu_sync_roots(vcpu);
3128 	spin_unlock(&vcpu->kvm->mmu_lock);
3129 	if (r)
3130 		goto out;
3131 	/* set_cr3() should ensure TLB has been flushed */
3132 	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
3133 out:
3134 	return r;
3135 }
3136 EXPORT_SYMBOL_GPL(kvm_mmu_load);
3137 
3138 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3139 {
3140 	mmu_free_roots(vcpu);
3141 }
3142 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143 
3144 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145 				  struct kvm_mmu_page *sp,
3146 				  u64 *spte)
3147 {
3148 	u64 pte;
3149 	struct kvm_mmu_page *child;
3150 
3151 	pte = *spte;
3152 	if (is_shadow_present_pte(pte)) {
3153 		if (is_last_spte(pte, sp->role.level))
3154 			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155 		else {
3156 			child = page_header(pte & PT64_BASE_ADDR_MASK);
3157 			mmu_page_remove_parent_pte(child, spte);
3158 		}
3159 	}
3160 	__set_spte(spte, shadow_trap_nonpresent_pte);
3161 	if (is_large_pte(pte))
3162 		--vcpu->kvm->stat.lpages;
3163 }
3164 
3165 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 				  struct kvm_mmu_page *sp,
3167 				  u64 *spte,
3168 				  const void *new, unsigned long mmu_seq)
3169 {
3170 	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3171 		++vcpu->kvm->stat.mmu_pde_zapped;
3172 		return;
3173 	}
3174 
3175 	++vcpu->kvm->stat.mmu_pte_updated;
3176 	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
3177 }
3178 
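/*
 * Return true if replacing spte @old with @new requires a remote TLB
 * flush: the old spte was present and the new one changes the frame or
 * removes permissions.
 */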
3179 static bool need_remote_flush(u64 old, u64 new)
3180 {
3181 	if (!is_shadow_present_pte(old))
3182 		return false;
3183 	if (!is_shadow_present_pte(new))
3184 		return true;
3185 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
3186 		return true;
3187 	old ^= PT64_NX_MASK;
3188 	new ^= PT64_NX_MASK;
3189 	return (old & ~new & PT64_PERM_MASK) != 0;
3190 }
3191 
3192 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3193 				    bool remote_flush, bool local_flush)
3194 {
3195 	if (zap_page)
3196 		return;
3197 
3198 	if (remote_flush)
3199 		kvm_flush_remote_tlbs(vcpu->kvm);
3200 	else if (local_flush)
3201 		kvm_mmu_flush_tlb(vcpu);
3202 }
3203 
3204 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3205 {
3206 	u64 *spte = vcpu->arch.last_pte_updated;
3207 
3208 	return !!(spte && (*spte & shadow_accessed_mask));
3209 }
3210 
3211 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
3212 {
3213 	u64 *spte = vcpu->arch.last_pte_updated;
3214 
3215 	if (spte
3216 	    && vcpu->arch.last_pte_gfn == gfn
3217 	    && shadow_accessed_mask
3218 	    && !(*spte & shadow_accessed_mask)
3219 	    && is_shadow_present_pte(*spte))
3220 		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
3221 }
3222 
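/*
 * Called when an emulated guest write hits a gfn that is shadowed as a
 * page table.  Updates or zaps the affected sptes, zaps pages that see
 * misaligned or flooded writes (they are probably no longer used as
 * page tables), and flushes TLBs as required.
 */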
3223 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3224 		       const u8 *new, int bytes,
3225 		       bool guest_initiated)
3226 {
3227 	gfn_t gfn = gpa >> PAGE_SHIFT;
3228 	union kvm_mmu_page_role mask = { .word = 0 };
3229 	struct kvm_mmu_page *sp;
3230 	struct hlist_node *node;
3231 	LIST_HEAD(invalid_list);
3232 	unsigned long mmu_seq;
3233 	u64 entry, gentry, *spte;
3234 	unsigned pte_size, page_offset, misaligned, quadrant, offset;
3235 	int level, npte, invlpg_counter, r, flooded = 0;
3236 	bool remote_flush, local_flush, zap_page;
3237 
3238 	zap_page = remote_flush = local_flush = false;
3239 	offset = offset_in_page(gpa);
3240 
3241 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3242 
3243 	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
3244 
3245 	/*
3246 	 * Assume that the pte write is on a page table of the same type
3247 	 * as the current vcpu paging mode, since we update the sptes only
3248 	 * when they have the same mode.
3249 	 */
3250 	if ((is_pae(vcpu) && bytes == 4) || !new) {
3251 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3252 		if (is_pae(vcpu)) {
3253 			gpa &= ~(gpa_t)7;
3254 			bytes = 8;
3255 		}
3256 		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3257 		if (r)
3258 			gentry = 0;
3259 		new = (const u8 *)&gentry;
3260 	}
3261 
3262 	switch (bytes) {
3263 	case 4:
3264 		gentry = *(const u32 *)new;
3265 		break;
3266 	case 8:
3267 		gentry = *(const u64 *)new;
3268 		break;
3269 	default:
3270 		gentry = 0;
3271 		break;
3272 	}
3273 
3274 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 	smp_rmb();
3276 
3277 	spin_lock(&vcpu->kvm->mmu_lock);
3278 	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3279 		gentry = 0;
3280 	kvm_mmu_free_some_pages(vcpu);
3281 	++vcpu->kvm->stat.mmu_pte_write;
3282 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3283 	if (guest_initiated) {
3284 		kvm_mmu_access_page(vcpu, gfn);
3285 		if (gfn == vcpu->arch.last_pt_write_gfn
3286 		    && !last_updated_pte_accessed(vcpu)) {
3287 			++vcpu->arch.last_pt_write_count;
3288 			if (vcpu->arch.last_pt_write_count >= 3)
3289 				flooded = 1;
3290 		} else {
3291 			vcpu->arch.last_pt_write_gfn = gfn;
3292 			vcpu->arch.last_pt_write_count = 1;
3293 			vcpu->arch.last_pte_updated = NULL;
3294 		}
3295 	}
3296 
3297 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3298 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3299 		pte_size = sp->role.cr4_pae ? 8 : 4;
3300 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3301 		misaligned |= bytes < 4;
3302 		if (misaligned || flooded) {
3303 			/*
3304 			 * Misaligned accesses are too much trouble to fix
3305 			 * up; also, they usually indicate a page is not used
3306 			 * as a page table.
3307 			 *
3308 			 * If we're seeing too many writes to a page,
3309 			 * it may no longer be a page table, or we may be
3310 			 * forking, in which case it is better to unmap the
3311 			 * page.
3312 			 */
3313 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3314 				 gpa, bytes, sp->role.word);
3315 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3316 						     &invalid_list);
3317 			++vcpu->kvm->stat.mmu_flooded;
3318 			continue;
3319 		}
3320 		page_offset = offset;
3321 		level = sp->role.level;
3322 		npte = 1;
3323 		if (!sp->role.cr4_pae) {
3324 			page_offset <<= 1;	/* 32->64 */
3325 			/*
3326 			 * A 32-bit pde maps 4MB while the shadow pdes map
3327 			 * only 2MB.  So we need to double the offset again
3328 			 * and zap two pdes instead of one.
3329 			 */
3330 			if (level == PT32_ROOT_LEVEL) {
3331 				page_offset &= ~7; /* kill rounding error */
3332 				page_offset <<= 1;
3333 				npte = 2;
3334 			}
3335 			quadrant = page_offset >> PAGE_SHIFT;
3336 			page_offset &= ~PAGE_MASK;
3337 			if (quadrant != sp->role.quadrant)
3338 				continue;
3339 		}
3340 		local_flush = true;
3341 		spte = &sp->spt[page_offset / sizeof(*spte)];
3342 		while (npte--) {
3343 			entry = *spte;
3344 			mmu_pte_write_zap_pte(vcpu, sp, spte);
3345 			if (gentry &&
3346 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3347 			      & mask.word))
3348 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
3349 						      mmu_seq);
3350 			if (!remote_flush && need_remote_flush(entry, *spte))
3351 				remote_flush = true;
3352 			++spte;
3353 		}
3354 	}
3355 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3356 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3357 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3358 	spin_unlock(&vcpu->kvm->mmu_lock);
3359 }
3360 
3361 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
3362 {
3363 	gpa_t gpa;
3364 	int r;
3365 
3366 	if (vcpu->arch.mmu.direct_map)
3367 		return 0;
3368 
3369 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
3370 
3371 	spin_lock(&vcpu->kvm->mmu_lock);
3372 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3373 	spin_unlock(&vcpu->kvm->mmu_lock);
3374 	return r;
3375 }
3376 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
3377 
3378 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3379 {
3380 	LIST_HEAD(invalid_list);
3381 
3382 	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
3383 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
3384 		struct kvm_mmu_page *sp;
3385 
3386 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3387 				  struct kvm_mmu_page, link);
3388 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3389 		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3390 		++vcpu->kvm->stat.mmu_recycled;
3391 	}
3392 }
3393 
3394 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3395 		       void *insn, int insn_len)
3396 {
3397 	int r;
3398 	enum emulation_result er;
3399 
3400 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3401 	if (r < 0)
3402 		goto out;
3403 
3404 	if (!r) {
3405 		r = 1;
3406 		goto out;
3407 	}
3408 
3409 	r = mmu_topup_memory_caches(vcpu);
3410 	if (r)
3411 		goto out;
3412 
3413 	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3414 
3415 	switch (er) {
3416 	case EMULATE_DONE:
3417 		return 1;
3418 	case EMULATE_DO_MMIO:
3419 		++vcpu->stat.mmio_exits;
3420 		/* fall through */
3421 	case EMULATE_FAIL:
3422 		return 0;
3423 	default:
3424 		BUG();
3425 	}
3426 out:
3427 	return r;
3428 }
3429 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
3430 
3431 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
3432 {
3433 	vcpu->arch.mmu.invlpg(vcpu, gva);
3434 	kvm_mmu_flush_tlb(vcpu);
3435 	++vcpu->stat.invlpg;
3436 }
3437 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3438 
3439 void kvm_enable_tdp(void)
3440 {
3441 	tdp_enabled = true;
3442 }
3443 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3444 
3445 void kvm_disable_tdp(void)
3446 {
3447 	tdp_enabled = false;
3448 }
3449 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3450 
3451 static void free_mmu_pages(struct kvm_vcpu *vcpu)
3452 {
3453 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
3454 	if (vcpu->arch.mmu.lm_root != NULL)
3455 		free_page((unsigned long)vcpu->arch.mmu.lm_root);
3456 }
3457 
3458 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3459 {
3460 	struct page *page;
3461 	int i;
3462 
3463 	ASSERT(vcpu);
3464 
3465 	/*
3466 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
3467 	 * Therefore we need to allocate shadow page tables in the first
3468 	 * 4GB of memory, which happens to fit the DMA32 zone.
3469 	 */
3470 	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
3471 	if (!page)
3472 		return -ENOMEM;
3473 
3474 	vcpu->arch.mmu.pae_root = page_address(page);
3475 	for (i = 0; i < 4; ++i)
3476 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
3477 
3478 	return 0;
3479 }
3480 
3481 int kvm_mmu_create(struct kvm_vcpu *vcpu)
3482 {
3483 	ASSERT(vcpu);
3484 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3485 
3486 	return alloc_mmu_pages(vcpu);
3487 }
3488 
3489 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3490 {
3491 	ASSERT(vcpu);
3492 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3493 
3494 	return init_kvm_mmu(vcpu);
3495 }
3496 
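/*
 * Write-protect every last-level spte that shadows memory in @slot:
 * large sptes are dropped entirely, normal sptes just lose their
 * writable bit, and remote TLBs are flushed afterwards.
 */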
3497 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3498 {
3499 	struct kvm_mmu_page *sp;
3500 
3501 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3502 		int i;
3503 		u64 *pt;
3504 
3505 		if (!test_bit(slot, sp->slot_bitmap))
3506 			continue;
3507 
3508 		pt = sp->spt;
3509 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3510 			if (!is_shadow_present_pte(pt[i]) ||
3511 			      !is_last_spte(pt[i], sp->role.level))
3512 				continue;
3513 
3514 			if (is_large_pte(pt[i])) {
3515 				drop_spte(kvm, &pt[i],
3516 					  shadow_trap_nonpresent_pte);
3517 				--kvm->stat.lpages;
3518 				continue;
3519 			}
3520 
3521 			/* avoid RMW */
3522 			if (is_writable_pte(pt[i]))
3523 				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3524 		}
3525 	}
3526 	kvm_flush_remote_tlbs(kvm);
3527 }
3528 
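/*
 * Zap every active shadow page.  The list walk restarts whenever a zap
 * removes additional pages, since that can invalidate the iterator.
 */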
3529 void kvm_mmu_zap_all(struct kvm *kvm)
3530 {
3531 	struct kvm_mmu_page *sp, *node;
3532 	LIST_HEAD(invalid_list);
3533 
3534 	spin_lock(&kvm->mmu_lock);
3535 restart:
3536 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3537 		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3538 			goto restart;
3539 
3540 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3541 	spin_unlock(&kvm->mmu_lock);
3542 }
3543 
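/*
 * Pick the shadow page at the tail of the active list and prepare it
 * for zapping; the caller commits the zap and flushes TLBs.
 */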
3544 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3545 					       struct list_head *invalid_list)
3546 {
3547 	struct kvm_mmu_page *page;
3548 
3549 	page = container_of(kvm->arch.active_mmu_pages.prev,
3550 			    struct kvm_mmu_page, link);
3551 	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3552 }
3553 
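/*
 * Shrinker callback invoked under memory pressure.  At most one VM has
 * shadow pages reclaimed per call; that VM is then rotated to the tail
 * of vm_list so the reclaim cost is spread across VMs over time.  The
 * return value is the global count of used mmu pages.
 */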
3554 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3555 {
3556 	struct kvm *kvm;
3557 	struct kvm *kvm_freed = NULL;
3558 
3559 	if (nr_to_scan == 0)
3560 		goto out;
3561 
3562 	raw_spin_lock(&kvm_lock);
3563 
3564 	list_for_each_entry(kvm, &vm_list, vm_list) {
3565 		int idx, freed_pages;
3566 		LIST_HEAD(invalid_list);
3567 
3568 		idx = srcu_read_lock(&kvm->srcu);
3569 		spin_lock(&kvm->mmu_lock);
3570 		if (!kvm_freed && nr_to_scan > 0 &&
3571 		    kvm->arch.n_used_mmu_pages > 0) {
3572 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3573 							  &invalid_list);
3574 			kvm_freed = kvm;
3575 		}
3576 		nr_to_scan--;
3577 
3578 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
3579 		spin_unlock(&kvm->mmu_lock);
3580 		srcu_read_unlock(&kvm->srcu, idx);
3581 	}
3582 	if (kvm_freed)
3583 		list_move_tail(&kvm_freed->vm_list, &vm_list);
3584 
3585 	raw_spin_unlock(&kvm_lock);
3586 
3587 out:
3588 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3589 }
3590 
3591 static struct shrinker mmu_shrinker = {
3592 	.shrink = mmu_shrink,
3593 	.seeks = DEFAULT_SEEKS * 10,
3594 };
3595 
3596 static void mmu_destroy_caches(void)
3597 {
3598 	if (pte_chain_cache)
3599 		kmem_cache_destroy(pte_chain_cache);
3600 	if (rmap_desc_cache)
3601 		kmem_cache_destroy(rmap_desc_cache);
3602 	if (mmu_page_header_cache)
3603 		kmem_cache_destroy(mmu_page_header_cache);
3604 }
3605 
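/*
 * Module init: create the slab caches used by the MMU, set up the
 * global used-pages counter and register the shrinker.  Any failure
 * unwinds through mmu_destroy_caches().
 */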
3606 int kvm_mmu_module_init(void)
3607 {
3608 	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3609 					    sizeof(struct kvm_pte_chain),
3610 					    0, 0, NULL);
3611 	if (!pte_chain_cache)
3612 		goto nomem;
3613 	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3614 					    sizeof(struct kvm_rmap_desc),
3615 					    0, 0, NULL);
3616 	if (!rmap_desc_cache)
3617 		goto nomem;
3618 
3619 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3620 						  sizeof(struct kvm_mmu_page),
3621 						  0, 0, NULL);
3622 	if (!mmu_page_header_cache)
3623 		goto nomem;
3624 
3625 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3626 		goto nomem;
3627 
3628 	register_shrinker(&mmu_shrinker);
3629 
3630 	return 0;
3631 
3632 nomem:
3633 	mmu_destroy_caches();
3634 	return -ENOMEM;
3635 }
3636 
3637 /*
3638  * Calculate mmu pages needed for kvm.
3639  */
3640 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3641 {
3642 	int i;
3643 	unsigned int nr_mmu_pages;
3644 	unsigned int  nr_pages = 0;
3645 	struct kvm_memslots *slots;
3646 
3647 	slots = kvm_memslots(kvm);
3648 
3649 	for (i = 0; i < slots->nmemslots; i++)
3650 		nr_pages += slots->memslots[i].npages;
3651 
3652 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3653 	nr_mmu_pages = max(nr_mmu_pages,
3654 			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3655 
3656 	return nr_mmu_pages;
3657 }
3658 
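/*
 * Helpers for the paravirtual MMU op buffer: "peek" only checks that
 * @len bytes are available, "read" additionally consumes them and
 * accounts them in buffer->processed.
 */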
3659 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3660 				unsigned len)
3661 {
3662 	if (len > buffer->len)
3663 		return NULL;
3664 	return buffer->ptr;
3665 }
3666 
3667 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3668 				unsigned len)
3669 {
3670 	void *ret;
3671 
3672 	ret = pv_mmu_peek_buffer(buffer, len);
3673 	if (!ret)
3674 		return ret;
3675 	buffer->ptr += len;
3676 	buffer->len -= len;
3677 	buffer->processed += len;
3678 	return ret;
3679 }
3680 
3681 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3682 			     gpa_t addr, gpa_t value)
3683 {
3684 	int bytes = 8;
3685 	int r;
3686 
3687 	if (!is_long_mode(vcpu) && !is_pae(vcpu))
3688 		bytes = 4;
3689 
3690 	r = mmu_topup_memory_caches(vcpu);
3691 	if (r)
3692 		return r;
3693 
3694 	if (!emulator_write_phys(vcpu, addr, &value, bytes))
3695 		return -EFAULT;
3696 
3697 	return 1;
3698 }
3699 
3700 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3701 {
3702 	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3703 	return 1;
3704 }
3705 
3706 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
3707 {
3708 	spin_lock(&vcpu->kvm->mmu_lock);
3709 	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
3710 	spin_unlock(&vcpu->kvm->mmu_lock);
3711 	return 1;
3712 }
3713 
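/*
 * Decode and execute a single paravirtual MMU operation from the
 * buffer.  Returns 0 when the buffer is exhausted or the opcode is
 * unknown, otherwise the result of the individual handler.
 */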
3714 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3715 			     struct kvm_pv_mmu_op_buffer *buffer)
3716 {
3717 	struct kvm_mmu_op_header *header;
3718 
3719 	header = pv_mmu_peek_buffer(buffer, sizeof *header);
3720 	if (!header)
3721 		return 0;
3722 	switch (header->op) {
3723 	case KVM_MMU_OP_WRITE_PTE: {
3724 		struct kvm_mmu_op_write_pte *wpte;
3725 
3726 		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3727 		if (!wpte)
3728 			return 0;
3729 		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3730 					wpte->pte_val);
3731 	}
3732 	case KVM_MMU_OP_FLUSH_TLB: {
3733 		struct kvm_mmu_op_flush_tlb *ftlb;
3734 
3735 		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3736 		if (!ftlb)
3737 			return 0;
3738 		return kvm_pv_mmu_flush_tlb(vcpu);
3739 	}
3740 	case KVM_MMU_OP_RELEASE_PT: {
3741 		struct kvm_mmu_op_release_pt *rpt;
3742 
3743 		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3744 		if (!rpt)
3745 			return 0;
3746 		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3747 	}
3748 	default: return 0;
3749 	}
3750 }
3751 
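/*
 * Top-level handler for guest paravirtual MMU operations: copy up to
 * one buffer's worth of operations from guest memory and process them
 * until the buffer runs out, an operation fails, or an opcode is not
 * recognised.  The number of bytes consumed is reported through @ret.
 */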
3752 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3753 		  gpa_t addr, unsigned long *ret)
3754 {
3755 	int r;
3756 	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3757 
3758 	buffer->ptr = buffer->buf;
3759 	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3760 	buffer->processed = 0;
3761 
3762 	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3763 	if (r)
3764 		goto out;
3765 
3766 	while (buffer->len) {
3767 		r = kvm_pv_mmu_op_one(vcpu, buffer);
3768 		if (r < 0)
3769 			goto out;
3770 		if (r == 0)
3771 			break;
3772 	}
3773 
3774 	r = 1;
3775 out:
3776 	*ret = buffer->processed;
3777 	return r;
3778 }
3779 
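/*
 * Walk the shadow page tables for @addr and copy the spte found at
 * each level into @sptes (indexed by level - 1), stopping at the first
 * non-present entry.  Returns the number of levels recorded.
 */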
3780 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3781 {
3782 	struct kvm_shadow_walk_iterator iterator;
3783 	int nr_sptes = 0;
3784 
3785 	spin_lock(&vcpu->kvm->mmu_lock);
3786 	for_each_shadow_entry(vcpu, addr, iterator) {
3787 		sptes[iterator.level-1] = *iterator.sptep;
3788 		nr_sptes++;
3789 		if (!is_shadow_present_pte(*iterator.sptep))
3790 			break;
3791 	}
3792 	spin_unlock(&vcpu->kvm->mmu_lock);
3793 
3794 	return nr_sptes;
3795 }
3796 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3797 
3798 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3799 {
3800 	ASSERT(vcpu);
3801 
3802 	destroy_kvm_mmu(vcpu);
3803 	free_mmu_pages(vcpu);
3804 	mmu_free_memory_caches(vcpu);
3805 }
3806 
3807 #ifdef CONFIG_KVM_MMU_AUDIT
3808 #include "mmu_audit.c"
3809 #else
3810 static void mmu_audit_disable(void) { }
3811 #endif
3812 
3813 void kvm_mmu_module_exit(void)
3814 {
3815 	mmu_destroy_caches();
3816 	percpu_counter_destroy(&kvm_total_used_mmu_pages);
3817 	unregister_shrinker(&mmu_shrinker);
3818 	mmu_audit_disable();
3819 }
3820