1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
/*
 * Module parameter "tdp_mmu" (mode 0644).  Read by kvm_mmu_init_tdp_mmu()
 * when a VM is initialized to decide whether that VM uses the TDP MMU.
 */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
kvm_mmu_init_tdp_mmu(struct kvm * kvm)17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 struct workqueue_struct *wq;
20
21 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 return 0;
23
24 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 if (!wq)
26 return -ENOMEM;
27
28 /* This should not be changed for the lifetime of the VM. */
29 kvm->arch.tdp_mmu_enabled = true;
30 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 kvm->arch.tdp_mmu_zap_wq = wq;
34 return 1;
35 }
36
37 /* Arbitrarily returns true so that this may be used in if statements. */
kvm_lockdep_assert_mmu_lock_held(struct kvm * kvm,bool shared)38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 bool shared)
40 {
41 if (shared)
42 lockdep_assert_held_read(&kvm->mmu_lock);
43 else
44 lockdep_assert_held_write(&kvm->mmu_lock);
45
46 return true;
47 }
48
kvm_mmu_uninit_tdp_mmu(struct kvm * kvm)49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50 {
51 if (!kvm->arch.tdp_mmu_enabled)
52 return;
53
54 /* Also waits for any queued work items. */
55 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56
57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
59
60 /*
61 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
63 * can call kvm_tdp_mmu_put_root and create new callbacks.
64 */
65 rcu_barrier();
66 }
67
tdp_mmu_free_sp(struct kvm_mmu_page * sp)68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 {
70 free_page((unsigned long)sp->spt);
71 kmem_cache_free(mmu_page_header_cache, sp);
72 }
73
74 /*
75 * This is called through call_rcu in order to free TDP page table memory
76 * safely with respect to other kernel threads that may be operating on
77 * the memory.
78 * By only accessing TDP MMU page table memory in an RCU read critical
79 * section, and freeing it after a grace period, lockless access to that
80 * memory won't use it after it is freed.
81 */
tdp_mmu_free_sp_rcu_callback(struct rcu_head * head)82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
83 {
84 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 rcu_head);
86
87 tdp_mmu_free_sp(sp);
88 }
89
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 bool shared);
92
tdp_mmu_zap_root_work(struct work_struct * work)93 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 {
95 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 tdp_mmu_async_work);
97 struct kvm *kvm = root->tdp_mmu_async_data;
98
99 read_lock(&kvm->mmu_lock);
100
101 /*
102 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 * to a different pCPU. Note, the local TLB flush on reuse also
105 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 * intermediate paging structures, that may be zapped, as such entries
107 * are associated with the ASID on both VMX and SVM.
108 */
109 tdp_mmu_zap_root(kvm, root, true);
110
111 /*
112 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 * avoiding an infinite loop. By design, the root is reachable while
114 * it's being asynchronously zapped, thus a different task can put its
115 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 * asynchronously zapped root is unavoidable.
117 */
118 kvm_tdp_mmu_put_root(kvm, root, true);
119
120 read_unlock(&kvm->mmu_lock);
121 }
122
tdp_mmu_schedule_zap_root(struct kvm * kvm,struct kvm_mmu_page * root)123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 {
125 root->tdp_mmu_async_data = kvm;
126 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128 }
129
kvm_tdp_root_mark_invalid(struct kvm_mmu_page * page)130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131 {
132 union kvm_mmu_page_role role = page->role;
133 role.invalid = true;
134
135 /* No need to use cmpxchg, only the invalid bit can change. */
136 role.word = xchg(&page->role.word, role.word);
137 return role.invalid;
138 }
139
/*
 * Drop a reference to @root.  @shared says whether mmu_lock is held for read
 * (true) or for write (false).
 *
 * When the last reference goes away, a root that was still valid is marked
 * invalid and handed to the zap workqueue with a fresh reference; a root that
 * was already invalid is unlinked from its list and freed via call_rcu().
 */
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * At this point the root has refcount=0.  It is still valid, but
	 * readers can no longer acquire a reference to it because
	 * kvm_tdp_mmu_get_root() rejects it.  That holds for the rest of
	 * this function, because readers visit valid roots only (except for
	 * tdp_mmu_zap_root_work(), which however does not acquire any
	 * reference itself).
	 *
	 * There are flows that need to visit all roots for correctness, but
	 * they all take mmu_lock for write, so they cannot yet run
	 * concurrently.  The same is true after kvm_tdp_root_mark_invalid(),
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root() can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * The root therefore temporarily gets an extra reference, going to
	 * refcount=1 while staying invalid.  Readers still cannot acquire any
	 * reference, but writers are now allowed to run if tdp_mmu_zap_root()
	 * yields, and they might take an extra reference if they themselves
	 * yield.  Consequently, when the worker gives its reference back,
	 * there is no guarantee that the refcount is still 1.  If it is not,
	 * whoever puts the last reference frees the page, but does not have
	 * to zap the root because a root cannot go from invalid to valid.
	 */
	if (kvm_tdp_root_mark_invalid(root)) {
		/* Already invalid: unlink the root and free it via RCU. */
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		list_del_rcu(&root->link);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

		call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
		return;
	}

	refcount_set(&root->tdp_mmu_root_count, 1);

	/*
	 * Zapping the root in a worker is not just "nice to have"; it is
	 * required because kvm_tdp_mmu_invalidate_all_roots() skips
	 * already-invalid roots.  If kvm_tdp_mmu_put_root() did not add the
	 * root to the workqueue, kvm_tdp_mmu_zap_all_fast() might return with
	 * some roots not zapped yet.
	 */
	tdp_mmu_schedule_zap_root(kvm, root);
}
193
194 /*
195 * Returns the next root after @prev_root (or the first root if @prev_root is
196 * NULL). A reference to the returned root is acquired, and the reference to
197 * @prev_root is released (the caller obviously must hold a reference to
198 * @prev_root if it's non-NULL).
199 *
200 * If @only_valid is true, invalid roots are skipped.
201 *
202 * Returns NULL if the end of tdp_mmu_roots was reached.
203 */
tdp_mmu_next_root(struct kvm * kvm,struct kvm_mmu_page * prev_root,bool shared,bool only_valid)204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 struct kvm_mmu_page *prev_root,
206 bool shared, bool only_valid)
207 {
208 struct kvm_mmu_page *next_root;
209
210 rcu_read_lock();
211
212 if (prev_root)
213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 &prev_root->link,
215 typeof(*prev_root), link);
216 else
217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 typeof(*next_root), link);
219
220 while (next_root) {
221 if ((!only_valid || !next_root->role.invalid) &&
222 kvm_tdp_mmu_get_root(next_root))
223 break;
224
225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 &next_root->link, typeof(*next_root), link);
227 }
228
229 rcu_read_unlock();
230
231 if (prev_root)
232 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
233
234 return next_root;
235 }
236
/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.  The reference to the previous root is dropped via
 * kvm_tdp_mmu_put_root() with the same @_shared value; if that was the last
 * reference, the root is either handed to the zap workqueue (if it was still
 * valid) or freed via RCU (if it was already invalid).
 *
 * The lock mode is asserted on every iteration, and roots whose address space
 * ID differs from @_as_id are skipped.  The trailing "if (...) { } else"
 * makes the statement that follows the macro the loop body.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

/* Yield-safe walk over valid roots only; @_shared selects the lock mode. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/*
 * Yield-safe walk over all roots, valid or not; mmu_lock must be held for
 * write (the lock assertion is made with shared == false).
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
260
/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 *
 * No references are taken on the roots.  Roots whose address space ID differs
 * from @_as_id are skipped, and the write-lock assertion is evaluated on every
 * iteration.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
273
tdp_mmu_alloc_sp(struct kvm_vcpu * vcpu)274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
275 {
276 struct kvm_mmu_page *sp;
277
278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
280
281 return sp;
282 }
283
/*
 * Initialize @sp as a TDP MMU shadow page with the given @role and base @gfn.
 * @sptep is recorded as the page's ptep; callers pass NULL for a root.
 */
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	sp->tdp_mmu_page = true;
	sp->ptep = sptep;
	sp->gfn = gfn;
	sp->role = role;

	/* Store @sp in the private field of the page backing sp->spt. */
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	trace_kvm_mmu_get_page(sp, true);
}
296
tdp_mmu_init_child_sp(struct kvm_mmu_page * child_sp,struct tdp_iter * iter)297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 struct tdp_iter *iter)
299 {
300 struct kvm_mmu_page *parent_sp;
301 union kvm_mmu_page_role role;
302
303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304
305 role = parent_sp->role;
306 role.level--;
307
308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
309 }
310
kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu * vcpu)311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312 {
313 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
314 struct kvm *kvm = vcpu->kvm;
315 struct kvm_mmu_page *root;
316
317 lockdep_assert_held_write(&kvm->mmu_lock);
318
319 /*
320 * Check for an existing root before allocating a new one. Note, the
321 * role check prevents consuming an invalid root.
322 */
323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 if (root->role.word == role.word &&
325 kvm_tdp_mmu_get_root(root))
326 goto out;
327 }
328
329 root = tdp_mmu_alloc_sp(vcpu);
330 tdp_mmu_init_sp(root, NULL, 0, role);
331
332 refcount_set(&root->tdp_mmu_root_count, 1);
333
334 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
337
338 out:
339 return __pa(root->spt);
340 }
341
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 u64 old_spte, u64 new_spte, int level,
344 bool shared);
345
/*
 * Report the old PFN as accessed if @old_spte was a present, accessed leaf
 * SPTE and the change drops that state: @new_spte is not present, is not
 * accessed, or maps a different PFN.
 */
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (!is_accessed_spte(old_spte))
		return;

	/* Nothing is lost if the same PFN stays mapped and accessed. */
	if (is_shadow_present_pte(new_spte) && is_accessed_spte(new_spte) &&
	    spte_to_pfn(old_spte) == spte_to_pfn(new_spte))
		return;

	kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
356
/*
 * Mark @gfn dirty in its memslot if a 4K SPTE became writable, i.e. if
 * @new_spte is writable and @old_spte either was not writable or mapped a
 * different PFN.  SPTEs above PG_LEVEL_4K are ignored.
 */
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	if (!is_writable_pte(new_spte))
		return;

	/* Already writable for the same PFN: nothing new to log. */
	if (is_writable_pte(old_spte) &&
	    spte_to_pfn(old_spte) == spte_to_pfn(new_spte))
		return;

	slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
	mark_page_dirty_in_slot(kvm, slot, gfn);
}
374
375 /**
376 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
377 *
378 * @kvm: kvm instance
379 * @sp: the page to be removed
380 * @shared: This operation may not be running under the exclusive use of
381 * the MMU lock and the operation must synchronize with other
382 * threads that might be adding or removing pages.
383 */
tdp_mmu_unlink_sp(struct kvm * kvm,struct kvm_mmu_page * sp,bool shared)384 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
385 bool shared)
386 {
387 if (shared)
388 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
389 else
390 lockdep_assert_held_write(&kvm->mmu_lock);
391
392 list_del(&sp->link);
393 if (sp->lpage_disallowed)
394 unaccount_huge_nx_page(kvm, sp);
395
396 if (shared)
397 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
398 }
399
400 /**
401 * handle_removed_pt() - handle a page table removed from the TDP structure
402 *
403 * @kvm: kvm instance
404 * @pt: the page removed from the paging structure
405 * @shared: This operation may not be running under the exclusive use
406 * of the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
408 *
409 * Given a page table that has been removed from the TDP paging structure,
410 * iterates through the page table to clear SPTEs and free child page tables.
411 *
412 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
413 * protection. Since this thread removed it from the paging structure,
414 * this thread will be responsible for ensuring the page is freed. Hence the
415 * early rcu_dereferences in the function.
416 */
handle_removed_pt(struct kvm * kvm,tdp_ptep_t pt,bool shared)417 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
418 {
419 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
420 int level = sp->role.level;
421 gfn_t base_gfn = sp->gfn;
422 int i;
423
424 trace_kvm_mmu_prepare_zap_page(sp);
425
426 tdp_mmu_unlink_sp(kvm, sp, shared);
427
428 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
429 tdp_ptep_t sptep = pt + i;
430 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
431 u64 old_spte;
432
433 if (shared) {
434 /*
435 * Set the SPTE to a nonpresent value that other
436 * threads will not overwrite. If the SPTE was
437 * already marked as removed then another thread
438 * handling a page fault could overwrite it, so
439 * set the SPTE until it is set from some other
440 * value to the removed SPTE value.
441 */
442 for (;;) {
443 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
444 if (!is_removed_spte(old_spte))
445 break;
446 cpu_relax();
447 }
448 } else {
449 /*
450 * If the SPTE is not MMU-present, there is no backing
451 * page associated with the SPTE and so no side effects
452 * that need to be recorded, and exclusive ownership of
453 * mmu_lock ensures the SPTE can't be made present.
454 * Note, zapping MMIO SPTEs is also unnecessary as they
455 * are guarded by the memslots generation, not by being
456 * unreachable.
457 */
458 old_spte = kvm_tdp_mmu_read_spte(sptep);
459 if (!is_shadow_present_pte(old_spte))
460 continue;
461
462 /*
463 * Use the common helper instead of a raw WRITE_ONCE as
464 * the SPTE needs to be updated atomically if it can be
465 * modified by a different vCPU outside of mmu_lock.
466 * Even though the parent SPTE is !PRESENT, the TLB
467 * hasn't yet been flushed, and both Intel and AMD
468 * document that A/D assists can use upper-level PxE
469 * entries that are cached in the TLB, i.e. the CPU can
470 * still access the page and mark it dirty.
471 *
472 * No retry is needed in the atomic update path as the
473 * sole concern is dropping a Dirty bit, i.e. no other
474 * task can zap/remove the SPTE as mmu_lock is held for
475 * write. Marking the SPTE as a removed SPTE is not
476 * strictly necessary for the same reason, but using
477 * the remove SPTE value keeps the shared/exclusive
478 * paths consistent and allows the handle_changed_spte()
479 * call below to hardcode the new value to REMOVED_SPTE.
480 *
481 * Note, even though dropping a Dirty bit is the only
482 * scenario where a non-atomic update could result in a
483 * functional bug, simply checking the Dirty bit isn't
484 * sufficient as a fast page fault could read the upper
485 * level SPTE before it is zapped, and then make this
486 * target SPTE writable, resume the guest, and set the
487 * Dirty bit between reading the SPTE above and writing
488 * it here.
489 */
490 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
491 REMOVED_SPTE, level);
492 }
493 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
494 old_spte, REMOVED_SPTE, level, shared);
495 }
496
497 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
498 }
499
500 /**
501 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
502 * @kvm: kvm instance
503 * @as_id: the address space of the paging structure the SPTE was a part of
504 * @gfn: the base GFN that was mapped by the SPTE
505 * @old_spte: The value of the SPTE before the change
506 * @new_spte: The value of the SPTE after the change
507 * @level: the level of the PT the SPTE is part of in the paging structure
508 * @shared: This operation may not be running under the exclusive use of
509 * the MMU lock and the operation must synchronize with other
510 * threads that might be modifying SPTEs.
511 *
512 * Handle bookkeeping that might result from the modification of a SPTE.
513 * This function must be called for all TDP SPTE modifications.
514 */
__handle_changed_spte(struct kvm * kvm,int as_id,gfn_t gfn,u64 old_spte,u64 new_spte,int level,bool shared)515 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
516 u64 old_spte, u64 new_spte, int level,
517 bool shared)
518 {
519 bool was_present = is_shadow_present_pte(old_spte);
520 bool is_present = is_shadow_present_pte(new_spte);
521 bool was_leaf = was_present && is_last_spte(old_spte, level);
522 bool is_leaf = is_present && is_last_spte(new_spte, level);
523 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
524
525 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
526 WARN_ON(level < PG_LEVEL_4K);
527 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
528
529 /*
530 * If this warning were to trigger it would indicate that there was a
531 * missing MMU notifier or a race with some notifier handler.
532 * A present, leaf SPTE should never be directly replaced with another
533 * present leaf SPTE pointing to a different PFN. A notifier handler
534 * should be zapping the SPTE before the main MM's page table is
535 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
536 * thread before replacement.
537 */
538 if (was_leaf && is_leaf && pfn_changed) {
539 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
540 "SPTE with another present leaf SPTE mapping a\n"
541 "different PFN!\n"
542 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
543 as_id, gfn, old_spte, new_spte, level);
544
545 /*
546 * Crash the host to prevent error propagation and guest data
547 * corruption.
548 */
549 BUG();
550 }
551
552 if (old_spte == new_spte)
553 return;
554
555 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
556
557 if (is_leaf)
558 check_spte_writable_invariants(new_spte);
559
560 /*
561 * The only times a SPTE should be changed from a non-present to
562 * non-present state is when an MMIO entry is installed/modified/
563 * removed. In that case, there is nothing to do here.
564 */
565 if (!was_present && !is_present) {
566 /*
567 * If this change does not involve a MMIO SPTE or removed SPTE,
568 * it is unexpected. Log the change, though it should not
569 * impact the guest since both the former and current SPTEs
570 * are nonpresent.
571 */
572 if (WARN_ON(!is_mmio_spte(old_spte) &&
573 !is_mmio_spte(new_spte) &&
574 !is_removed_spte(new_spte)))
575 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
576 "should not be replaced with another,\n"
577 "different nonpresent SPTE, unless one or both\n"
578 "are MMIO SPTEs, or the new SPTE is\n"
579 "a temporary removed SPTE.\n"
580 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
581 as_id, gfn, old_spte, new_spte, level);
582 return;
583 }
584
585 if (is_leaf != was_leaf)
586 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
587
588 if (was_leaf && is_dirty_spte(old_spte) &&
589 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
590 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
591
592 /*
593 * Recursively handle child PTs if the change removed a subtree from
594 * the paging structure. Note the WARN on the PFN changing without the
595 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
596 * pages are kernel allocations and should never be migrated.
597 */
598 if (was_present && !was_leaf &&
599 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
600 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
601 }
602
/*
 * Run all bookkeeping for a SPTE change: the common handling in
 * __handle_changed_spte(), followed by accessed-state tracking and dirty
 * logging.  The parameters have the same meaning as for
 * __handle_changed_spte().
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}
613
614 /*
615 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
616 * and handle the associated bookkeeping. Do not mark the page dirty
617 * in KVM's dirty bitmaps.
618 *
619 * If setting the SPTE fails because it has changed, iter->old_spte will be
620 * refreshed to the current value of the spte.
621 *
622 * @kvm: kvm instance
623 * @iter: a tdp_iter instance currently on the SPTE that should be set
624 * @new_spte: The value the SPTE should be set to
625 * Return:
626 * * 0 - If the SPTE was set.
627 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
628 * no side-effects other than setting iter->old_spte to the last
629 * known value of the spte.
630 */
tdp_mmu_set_spte_atomic(struct kvm * kvm,struct tdp_iter * iter,u64 new_spte)631 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
632 struct tdp_iter *iter,
633 u64 new_spte)
634 {
635 u64 *sptep = rcu_dereference(iter->sptep);
636 u64 old_spte;
637
638 /*
639 * The caller is responsible for ensuring the old SPTE is not a REMOVED
640 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
641 * and pre-checking before inserting a new SPTE is advantageous as it
642 * avoids unnecessary work.
643 */
644 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
645
646 lockdep_assert_held_read(&kvm->mmu_lock);
647
648 /*
649 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
650 * does not hold the mmu_lock.
651 */
652 old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
653 if (old_spte != iter->old_spte) {
654 /*
655 * The page table entry was modified by a different logical
656 * CPU. Refresh iter->old_spte with the current value so the
657 * caller operates on fresh data, e.g. if it retries
658 * tdp_mmu_set_spte_atomic().
659 */
660 iter->old_spte = old_spte;
661 return -EBUSY;
662 }
663
664 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
665 new_spte, iter->level, true);
666 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
667
668 return 0;
669 }
670
tdp_mmu_zap_spte_atomic(struct kvm * kvm,struct tdp_iter * iter)671 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
672 struct tdp_iter *iter)
673 {
674 int ret;
675
676 /*
677 * Freeze the SPTE by setting it to a special,
678 * non-present value. This will stop other threads from
679 * immediately installing a present entry in its place
680 * before the TLBs are flushed.
681 */
682 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
683 if (ret)
684 return ret;
685
686 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
687 KVM_PAGES_PER_HPAGE(iter->level));
688
689 /*
690 * No other thread can overwrite the removed SPTE as they must either
691 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
692 * overwrite the special removed SPTE value. No bookkeeping is needed
693 * here since the SPTE is going from non-present to non-present. Use
694 * the raw write helper to avoid an unnecessary check on volatile bits.
695 */
696 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
697
698 return 0;
699 }
700
701
702 /*
703 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
704 * @kvm: KVM instance
705 * @as_id: Address space ID, i.e. regular vs. SMM
706 * @sptep: Pointer to the SPTE
707 * @old_spte: The current value of the SPTE
708 * @new_spte: The new value that will be set for the SPTE
709 * @gfn: The base GFN that was (or will be) mapped by the SPTE
710 * @level: The level _containing_ the SPTE (its parent PT's level)
711 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
712 * of the page. Should be set unless handling an MMU
713 * notifier for access tracking. Leaving record_acc_track
714 * unset in that case prevents page accesses from being
715 * double counted.
716 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
717 * appropriate for the change being made. Should be set
718 * unless performing certain dirty logging operations.
719 * Leaving record_dirty_log unset in that case prevents page
720 * writes from being double counted.
721 *
722 * Returns the old SPTE value, which _may_ be different than @old_spte if the
723 * SPTE had voldatile bits.
724 */
__tdp_mmu_set_spte(struct kvm * kvm,int as_id,tdp_ptep_t sptep,u64 old_spte,u64 new_spte,gfn_t gfn,int level,bool record_acc_track,bool record_dirty_log)725 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
726 u64 old_spte, u64 new_spte, gfn_t gfn, int level,
727 bool record_acc_track, bool record_dirty_log)
728 {
729 lockdep_assert_held_write(&kvm->mmu_lock);
730
731 /*
732 * No thread should be using this function to set SPTEs to or from the
733 * temporary removed SPTE value.
734 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
735 * should be used. If operating under the MMU lock in write mode, the
736 * use of the removed SPTE should not be necessary.
737 */
738 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
739
740 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
741
742 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
743
744 if (record_acc_track)
745 handle_changed_spte_acc_track(old_spte, new_spte, level);
746 if (record_dirty_log)
747 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
748 new_spte, level);
749 return old_spte;
750 }
751
/*
 * Iterator-based wrapper around __tdp_mmu_set_spte(): writes @new_spte to the
 * SPTE that @iter is positioned on and stores the returned old value back in
 * iter->old_spte.  Warns (once) if the iterator has yielded.
 */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}
763
/* Set the SPTE at @iter, recording both accessed state and dirty logging. */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
769
/*
 * Set the SPTE at @iter without recording accessed-state changes (dirty
 * logging is still recorded).
 */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
776
/*
 * Set the SPTE at @iter without recording the change in the dirty log
 * (accessed-state changes are still recorded).
 */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
783
/* Iterate over the SPTEs of @_root via for_each_tdp_pte(). */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

/*
 * Like tdp_root_for_each_pte(), but skip every SPTE that is not
 * shadow-present or is not a last-level SPTE for its level, so the loop body
 * only sees present leaf SPTEs.
 */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/* Iterate starting from the shadow page that backs @_mmu's root.hpa. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
796
797 /*
798 * Yield if the MMU lock is contended or this thread needs to return control
799 * to the scheduler.
800 *
801 * If this function should yield and flush is set, it will perform a remote
802 * TLB flush before yielding.
803 *
804 * If this function yields, iter->yielded is set and the caller must skip to
805 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
806 * over the paging structures to allow the iterator to continue its traversal
807 * from the paging structure root.
808 *
809 * Returns true if this function yielded.
810 */
tdp_mmu_iter_cond_resched(struct kvm * kvm,struct tdp_iter * iter,bool flush,bool shared)811 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
812 struct tdp_iter *iter,
813 bool flush, bool shared)
814 {
815 WARN_ON(iter->yielded);
816
817 /* Ensure forward progress has been made before yielding. */
818 if (iter->next_last_level_gfn == iter->yielded_gfn)
819 return false;
820
821 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
822 if (flush)
823 kvm_flush_remote_tlbs(kvm);
824
825 rcu_read_unlock();
826
827 if (shared)
828 cond_resched_rwlock_read(&kvm->mmu_lock);
829 else
830 cond_resched_rwlock_write(&kvm->mmu_lock);
831
832 rcu_read_lock();
833
834 WARN_ON(iter->gfn > iter->next_last_level_gfn);
835
836 iter->yielded = true;
837 }
838
839 return iter->yielded;
840 }
841
tdp_mmu_max_gfn_exclusive(void)842 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
843 {
844 /*
845 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
846 * a gpa range that would exceed the max gfn, and KVM does not create
847 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
848 * the slow emulation path every time.
849 */
850 return kvm_mmu_max_gfn() + 1;
851 }
852
/*
 * Zap the present SPTEs at @zap_level in @root across the entire gfn range
 * that the TDP MMU may map.  The walk never descends below @zap_level, and
 * entries above it are skipped.
 *
 * @shared selects the SPTE update primitive: the atomic variant when true,
 * the non-atomic variant when false.
 */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	gfn_t first_gfn = 0;
	gfn_t end_gfn = tdp_mmu_max_gfn_exclusive();
	struct tdp_iter iter;

	for_each_tdp_pte_min_level(iter, root, zap_level, first_gfn, end_gfn) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		/* Only present SPTEs at the requested level are zapped. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    iter.level > zap_level)
			continue;

		if (shared) {
			/* The atomic update failed, try this SPTE again. */
			if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
				goto retry;
		} else {
			tdp_mmu_set_spte(kvm, &iter, 0);
		}
	}
}
878
/*
 * Zap all SPTEs of @root, in two passes, with mmu_lock held for read
 * (@shared == true) or for write (@shared == false).
 */
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{
	/*
	 * The caller must hold a reference to the root.  An elevated refcount
	 * keeps the root reachable from mmu_notifier callbacks, which is what
	 * makes it legal for this path to yield and drop mmu_lock.  For an
	 * unmap/release mmu_notifier event KVM has to drop every reference to
	 * the affected pages before the callback completes; if mmu_lock were
	 * dropped while the root was unreachable, SPTEs could be zapped after
	 * the callback finished, and since zapping a SPTE "writes back" dirty
	 * and accessed bits to the associated struct page, that would be a
	 * use-after-free.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * Recursively tearing down huge numbers of SPs in one go can cause
	 * RCU stalls, so zap in two passes: first everything at the 1gb
	 * level, then the top-level SPs.  The choice of "1gb" is deliberate:
	 * KVM already has to be able to zap a 1gb shadow page without
	 * stalling in order to replace it in-place with a 1gb hugepage.
	 *
	 * Zapping a SP recurses on its children, hence the iterator itself
	 * never needs to step down to PG_LEVEL_4K.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}
914
kvm_tdp_mmu_zap_sp(struct kvm * kvm,struct kvm_mmu_page * sp)915 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
916 {
917 u64 old_spte;
918
919 /*
920 * This helper intentionally doesn't allow zapping a root shadow page,
921 * which doesn't have a parent page table and thus no associated entry.
922 */
923 if (WARN_ON_ONCE(!sp->ptep))
924 return false;
925
926 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
927 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
928 return false;
929
930 __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
931 sp->gfn, sp->role.level + 1, true, true);
932
933 return true;
934 }
935
936 /*
937 * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
938 * have been cleared and a TLB flush is needed before releasing the MMU lock.
939 *
940 * If can_yield is true, will release the MMU lock and reschedule if the
941 * scheduler needs the CPU or there is contention on the MMU lock. If this
942 * function cannot yield, it will not release the MMU lock or reschedule and
943 * the caller must ensure it does not supply too large a GFN range, or the
944 * operation can cause a soft lockup.
945 */
tdp_mmu_zap_leafs(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end,bool can_yield,bool flush)946 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
947 gfn_t start, gfn_t end, bool can_yield, bool flush)
948 {
949 struct tdp_iter iter;
950
951 end = min(end, tdp_mmu_max_gfn_exclusive());
952
953 lockdep_assert_held_write(&kvm->mmu_lock);
954
955 rcu_read_lock();
956
957 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
958 if (can_yield &&
959 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
960 flush = false;
961 continue;
962 }
963
964 if (!is_shadow_present_pte(iter.old_spte) ||
965 !is_last_spte(iter.old_spte, iter.level))
966 continue;
967
968 tdp_mmu_set_spte(kvm, &iter, 0);
969 flush = true;
970 }
971
972 rcu_read_unlock();
973
974 /*
975 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
976 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
977 */
978 return flush;
979 }
980
/*
 * Zap leaf SPTEs for the range of gfns, [start, end), in every root of the
 * given address space.  Returns true if SPTEs have been cleared and a TLB
 * flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	/* The pending-flush state is threaded through every root's zap. */
	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield,
					  flush);
	}

	return flush;
}
997
kvm_tdp_mmu_zap_all(struct kvm * kvm)998 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
999 {
1000 struct kvm_mmu_page *root;
1001 int i;
1002
1003 /*
1004 * Zap all roots, including invalid roots, as all SPTEs must be dropped
1005 * before returning to the caller. Zap directly even if the root is
1006 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
1007 * all that expensive and mmu_lock is already held, which means the
1008 * worker has yielded, i.e. flushing the work instead of zapping here
1009 * isn't guaranteed to be any faster.
1010 *
1011 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
1012 * is being destroyed or the userspace VMM has exited. In both cases,
1013 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1014 */
1015 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1016 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1017 tdp_mmu_zap_root(kvm, root, false);
1018 }
1019 }
1020
1021 /*
1022 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1023 * zap" completes.
1024 */
kvm_tdp_mmu_zap_invalidated_roots(struct kvm * kvm)1025 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1026 {
1027 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1028 }
1029
1030 /*
1031 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1032 * is about to be zapped, e.g. in response to a memslots update. The actual
1033 * zapping is performed asynchronously, so a reference is taken on all roots.
1034 * Using a separate workqueue makes it easy to ensure that the destruction is
1035 * performed before the "fast zap" completes, without keeping a separate list
1036 * of invalidated roots; the list is effectively the list of work items in
1037 * the workqueue.
1038 *
1039 * Get a reference even if the root is already invalid, the asynchronous worker
1040 * assumes it was gifted a reference to the root it processes. Because mmu_lock
1041 * is held for write, it should be impossible to observe a root with zero refcount,
1042 * i.e. the list of roots cannot be stale.
1043 *
1044 * This has essentially the same effect for the TDP MMU
1045 * as updating mmu_valid_gen does for the shadow MMU.
1046 */
kvm_tdp_mmu_invalidate_all_roots(struct kvm * kvm)1047 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1048 {
1049 struct kvm_mmu_page *root;
1050
1051 lockdep_assert_held_write(&kvm->mmu_lock);
1052 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1053 if (!root->role.invalid &&
1054 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1055 root->role.invalid = true;
1056 tdp_mmu_schedule_zap_root(kvm, root);
1057 }
1058 }
1059 }
1060
1061 /*
1062 * Installs a last-level SPTE to handle a TDP page fault.
1063 * (NPT/EPT violation/misconfiguration)
1064 */
tdp_mmu_map_handle_target_level(struct kvm_vcpu * vcpu,struct kvm_page_fault * fault,struct tdp_iter * iter)1065 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1066 struct kvm_page_fault *fault,
1067 struct tdp_iter *iter)
1068 {
1069 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1070 u64 new_spte;
1071 int ret = RET_PF_FIXED;
1072 bool wrprot = false;
1073
1074 WARN_ON(sp->role.level != fault->goal_level);
1075 if (unlikely(!fault->slot))
1076 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1077 else
1078 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1079 fault->pfn, iter->old_spte, fault->prefetch, true,
1080 fault->map_writable, &new_spte);
1081
1082 if (new_spte == iter->old_spte)
1083 ret = RET_PF_SPURIOUS;
1084 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1085 return RET_PF_RETRY;
1086 else if (is_shadow_present_pte(iter->old_spte) &&
1087 !is_last_spte(iter->old_spte, iter->level))
1088 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1089 KVM_PAGES_PER_HPAGE(iter->level + 1));
1090
1091 /*
1092 * If the page fault was caused by a write but the page is write
1093 * protected, emulation is needed. If the emulation was skipped,
1094 * the vCPU would have the same fault again.
1095 */
1096 if (wrprot) {
1097 if (fault->write)
1098 ret = RET_PF_EMULATE;
1099 }
1100
1101 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1102 if (unlikely(is_mmio_spte(new_spte))) {
1103 vcpu->stat.pf_mmio_spte_created++;
1104 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1105 new_spte);
1106 ret = RET_PF_EMULATE;
1107 } else {
1108 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1109 rcu_dereference(iter->sptep));
1110 }
1111
1112 return ret;
1113 }
1114
1115 /*
1116 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1117 * provided page table.
1118 *
1119 * @kvm: kvm instance
1120 * @iter: a tdp_iter instance currently on the SPTE that should be set
1121 * @sp: The new TDP page table to install.
1122 * @account_nx: True if this page table is being installed to split a
1123 * non-executable huge page.
1124 * @shared: This operation is running under the MMU lock in read mode.
1125 *
1126 * Returns: 0 if the new page table was installed. Non-0 if the page table
1127 * could not be installed (e.g. the atomic compare-exchange failed).
1128 */
tdp_mmu_link_sp(struct kvm * kvm,struct tdp_iter * iter,struct kvm_mmu_page * sp,bool account_nx,bool shared)1129 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1130 struct kvm_mmu_page *sp, bool account_nx,
1131 bool shared)
1132 {
1133 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1134 int ret = 0;
1135
1136 if (shared) {
1137 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1138 if (ret)
1139 return ret;
1140 } else {
1141 tdp_mmu_set_spte(kvm, iter, spte);
1142 }
1143
1144 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1145 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1146 if (account_nx)
1147 account_huge_nx_page(kvm, sp);
1148 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1149
1150 return 0;
1151 }
1152
1153 /*
1154 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1155 * page tables and SPTEs to translate the faulting guest physical address.
1156 */
kvm_tdp_mmu_map(struct kvm_vcpu * vcpu,struct kvm_page_fault * fault)1157 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1158 {
1159 struct kvm_mmu *mmu = vcpu->arch.mmu;
1160 struct tdp_iter iter;
1161 struct kvm_mmu_page *sp;
1162 int ret;
1163
1164 kvm_mmu_hugepage_adjust(vcpu, fault);
1165
1166 trace_kvm_mmu_spte_requested(fault);
1167
1168 rcu_read_lock();
1169
1170 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1171 if (fault->nx_huge_page_workaround_enabled)
1172 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1173
1174 if (iter.level == fault->goal_level)
1175 break;
1176
1177 /*
1178 * If there is an SPTE mapping a large page at a higher level
1179 * than the target, that SPTE must be cleared and replaced
1180 * with a non-leaf SPTE.
1181 */
1182 if (is_shadow_present_pte(iter.old_spte) &&
1183 is_large_pte(iter.old_spte)) {
1184 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1185 break;
1186
1187 /*
1188 * The iter must explicitly re-read the spte here
1189 * because the new value informs the !present
1190 * path below.
1191 */
1192 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1193 }
1194
1195 if (!is_shadow_present_pte(iter.old_spte)) {
1196 bool account_nx = fault->huge_page_disallowed &&
1197 fault->req_level >= iter.level;
1198
1199 /*
1200 * If SPTE has been frozen by another thread, just
1201 * give up and retry, avoiding unnecessary page table
1202 * allocation and free.
1203 */
1204 if (is_removed_spte(iter.old_spte))
1205 break;
1206
1207 sp = tdp_mmu_alloc_sp(vcpu);
1208 tdp_mmu_init_child_sp(sp, &iter);
1209
1210 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1211 tdp_mmu_free_sp(sp);
1212 break;
1213 }
1214 }
1215 }
1216
1217 /*
1218 * Force the guest to retry the access if the upper level SPTEs aren't
1219 * in place, or if the target leaf SPTE is frozen by another CPU.
1220 */
1221 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1222 rcu_read_unlock();
1223 return RET_PF_RETRY;
1224 }
1225
1226 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1227 rcu_read_unlock();
1228
1229 return ret;
1230 }
1231
/*
 * Zap the leaf SPTEs covering @range in the address space of the range's
 * memslot.  Returns the updated "TLB flush needed" state.
 */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	const int as_id = range->slot->as_id;

	/* Yielding is permitted only when the range says blocking is allowed. */
	return kvm_tdp_mmu_zap_leafs(kvm, as_id, range->start, range->end,
				     range->may_block, flush);
}
1238
1239 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1240 struct kvm_gfn_range *range);
1241
kvm_tdp_mmu_handle_gfn(struct kvm * kvm,struct kvm_gfn_range * range,tdp_handler_t handler)1242 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1243 struct kvm_gfn_range *range,
1244 tdp_handler_t handler)
1245 {
1246 struct kvm_mmu_page *root;
1247 struct tdp_iter iter;
1248 bool ret = false;
1249
1250 /*
1251 * Don't support rescheduling, none of the MMU notifiers that funnel
1252 * into this helper allow blocking; it'd be dead, wasteful code.
1253 */
1254 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1255 rcu_read_lock();
1256
1257 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1258 ret |= handler(kvm, &iter, range);
1259
1260 rcu_read_unlock();
1261 }
1262
1263 return ret;
1264 }
1265
1266 /*
1267 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1268 * if any of the GFNs in the range have been accessed.
1269 */
age_gfn_range(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)1270 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1271 struct kvm_gfn_range *range)
1272 {
1273 u64 new_spte = 0;
1274
1275 /* If we have a non-accessed entry we don't need to change the pte. */
1276 if (!is_accessed_spte(iter->old_spte))
1277 return false;
1278
1279 new_spte = iter->old_spte;
1280
1281 if (spte_ad_enabled(new_spte)) {
1282 new_spte &= ~shadow_accessed_mask;
1283 } else {
1284 /*
1285 * Capture the dirty status of the page, so that it doesn't get
1286 * lost when the SPTE is marked for access tracking.
1287 */
1288 if (is_writable_pte(new_spte))
1289 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1290
1291 new_spte = mark_spte_for_access_track(new_spte);
1292 }
1293
1294 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1295
1296 return true;
1297 }
1298
kvm_tdp_mmu_age_gfn_range(struct kvm * kvm,struct kvm_gfn_range * range)1299 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1300 {
1301 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1302 }
1303
test_age_gfn(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)1304 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1305 struct kvm_gfn_range *range)
1306 {
1307 return is_accessed_spte(iter->old_spte);
1308 }
1309
kvm_tdp_mmu_test_age_gfn(struct kvm * kvm,struct kvm_gfn_range * range)1310 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1311 {
1312 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1313 }
1314
set_spte_gfn(struct kvm * kvm,struct tdp_iter * iter,struct kvm_gfn_range * range)1315 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1316 struct kvm_gfn_range *range)
1317 {
1318 u64 new_spte;
1319
1320 /* Huge pages aren't expected to be modified without first being zapped. */
1321 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1322
1323 if (iter->level != PG_LEVEL_4K ||
1324 !is_shadow_present_pte(iter->old_spte))
1325 return false;
1326
1327 /*
1328 * Note, when changing a read-only SPTE, it's not strictly necessary to
1329 * zero the SPTE before setting the new PFN, but doing so preserves the
1330 * invariant that the PFN of a present * leaf SPTE can never change.
1331 * See __handle_changed_spte().
1332 */
1333 tdp_mmu_set_spte(kvm, iter, 0);
1334
1335 if (!pte_write(range->pte)) {
1336 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1337 pte_pfn(range->pte));
1338
1339 tdp_mmu_set_spte(kvm, iter, new_spte);
1340 }
1341
1342 return true;
1343 }
1344
1345 /*
1346 * Handle the changed_pte MMU notifier for the TDP MMU.
1347 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1348 * notifier.
1349 * Returns non-zero if a flush is needed before releasing the MMU lock.
1350 */
kvm_tdp_mmu_set_spte_gfn(struct kvm * kvm,struct kvm_gfn_range * range)1351 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1352 {
1353 /*
1354 * No need to handle the remote TLB flush under RCU protection, the
1355 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1356 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
1357 */
1358 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1359 }
1360
1361 /*
1362 * Remove write access from all SPTEs at or above min_level that map GFNs
1363 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1364 * be flushed.
1365 */
wrprot_gfn_range(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end,int min_level)1366 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1367 gfn_t start, gfn_t end, int min_level)
1368 {
1369 struct tdp_iter iter;
1370 u64 new_spte;
1371 bool spte_set = false;
1372
1373 rcu_read_lock();
1374
1375 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1376
1377 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1378 retry:
1379 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1380 continue;
1381
1382 if (!is_shadow_present_pte(iter.old_spte) ||
1383 !is_last_spte(iter.old_spte, iter.level) ||
1384 !(iter.old_spte & PT_WRITABLE_MASK))
1385 continue;
1386
1387 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1388
1389 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1390 goto retry;
1391
1392 spte_set = true;
1393 }
1394
1395 rcu_read_unlock();
1396 return spte_set;
1397 }
1398
1399 /*
1400 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1401 * only affect leaf SPTEs down to min_level.
1402 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1403 */
kvm_tdp_mmu_wrprot_slot(struct kvm * kvm,const struct kvm_memory_slot * slot,int min_level)1404 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1405 const struct kvm_memory_slot *slot, int min_level)
1406 {
1407 struct kvm_mmu_page *root;
1408 bool spte_set = false;
1409
1410 lockdep_assert_held_read(&kvm->mmu_lock);
1411
1412 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1413 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1414 slot->base_gfn + slot->npages, min_level);
1415
1416 return spte_set;
1417 }
1418
/*
 * Allocate a zeroed shadow page header together with a zeroed page for its
 * page table, using @gfp (plus __GFP_ZERO) for both allocations.  Returns
 * NULL if either allocation fails; nothing is leaked in that case.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (sp->spt)
		return sp;

	/* No page for the page table, release the header again. */
	kmem_cache_free(mmu_page_header_cache, sp);

	return NULL;
}
1437
/*
 * Allocate a shadow page for splitting a huge page.  A non-blocking
 * allocation is attempted first; if it fails, RCU and mmu_lock are dropped,
 * iter->yielded is set, and the allocation is retried with reclaim allowed
 * before the locks are retaken.  Returns NULL if both attempts fail.
 */
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * This runs with the MMU lock held, so the GFP flags matter.
	 * GFP_NOWAIT avoids blocking on direct reclaim and avoids filesystem
	 * callbacks, which could invoke KVM MMU notifiers and deadlock.
	 *
	 * If this attempt fails, the lock is dropped and the allocation is
	 * retried with reclaim allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (!shared)
		write_unlock(&kvm->mmu_lock);
	else
		read_unlock(&kvm->mmu_lock);

	/* mmu_lock was dropped, tell the caller that the walk yielded. */
	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (!shared)
		write_lock(&kvm->mmu_lock);
	else
		read_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}
1476
/*
 * Split the huge SPTE under @iter by populating @sp with SPTEs for the next
 * lower level and linking @sp in place of the huge SPTE.  Returns 0 on
 * success, or the non-zero result of tdp_mmu_link_sp() if linking failed.
 */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int idx;
	int ret;

	tdp_mmu_init_child_sp(sp, iter);

	/*
	 * Plain stores to sp->spt are fine: the page table hasn't been linked
	 * in yet and so cannot be reached from any other CPU.
	 */
	for (idx = 0; idx < PT64_ENT_PER_PAGE; idx++)
		sp->spt[idx] = make_huge_page_split_spte(huge_spte, level, idx);

	/*
	 * Swap the huge SPTE for a pointer to the populated lower level page
	 * table.  No TLB flush is done, so vCPUs may see a mix of the split
	 * mappings and the original huge mapping depending on what their TLB
	 * holds.  That is fine for correctness because the translation is
	 * the same either way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);

	/*
	 * tdp_mmu_link_sp() takes care of subtracting the overwritten huge
	 * page from the page stats, but the new present child pages have to
	 * be added manually.
	 */
	if (!ret)
		kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);

	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);

	return ret;
}
1516
/*
 * Split the huge pages mapped above @target_level under @root for gfns in
 * [start, end).  Returns 0 on success or -ENOMEM if a shadow page could not
 * be allocated.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *new_sp = NULL;
	struct tdp_iter iter;
	int err = 0;

	rcu_read_lock();

	/*
	 * Walk the page table and split every huge page above the target
	 * level into the next lower level, e.g. a 1GB page becomes 512 2MB
	 * pages.
	 *
	 * The TDP iterator does a pre-order traversal, so an SPTE is always
	 * visited before its children.  Huge pages that are more than one
	 * level above the target level are therefore split recursively, e.g.
	 * 1GB into 512 2MB pages, and then each of those into 512 4KB pages.
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (!is_large_pte(iter.old_spte))
			continue;

		if (!new_sp) {
			new_sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!new_sp) {
				err = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, err);
				break;
			}

			/* The allocation yielded, move on to the next iteration. */
			if (iter.yielded)
				continue;
		}

		if (tdp_mmu_split_huge_page(kvm, &iter, new_sp, shared))
			goto retry;

		/* The page table is linked in and no longer ours to reuse. */
		new_sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * The loop can end with an allocated but unused page table, e.g. if a
	 * vCPU doing HugePage NX splitting won the race and installed its own
	 * page table where the last split was attempted.
	 */
	if (new_sp)
		tdp_mmu_free_sp(new_sp);

	return err;
}
1579
1580
1581 /*
1582 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1583 */
kvm_tdp_mmu_try_split_huge_pages(struct kvm * kvm,const struct kvm_memory_slot * slot,gfn_t start,gfn_t end,int target_level,bool shared)1584 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1585 const struct kvm_memory_slot *slot,
1586 gfn_t start, gfn_t end,
1587 int target_level, bool shared)
1588 {
1589 struct kvm_mmu_page *root;
1590 int r = 0;
1591
1592 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1593
1594 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1595 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1596 if (r) {
1597 kvm_tdp_mmu_put_root(kvm, root, shared);
1598 break;
1599 }
1600 }
1601 }
1602
1603 /*
1604 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1605 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1606 * If AD bits are not enabled, this will require clearing the writable bit on
1607 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1608 * be flushed.
1609 */
clear_dirty_gfn_range(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end)1610 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1611 gfn_t start, gfn_t end)
1612 {
1613 struct tdp_iter iter;
1614 u64 new_spte;
1615 bool spte_set = false;
1616
1617 rcu_read_lock();
1618
1619 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1620 retry:
1621 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1622 continue;
1623
1624 if (!is_shadow_present_pte(iter.old_spte))
1625 continue;
1626
1627 if (spte_ad_need_write_protect(iter.old_spte)) {
1628 if (is_writable_pte(iter.old_spte))
1629 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1630 else
1631 continue;
1632 } else {
1633 if (iter.old_spte & shadow_dirty_mask)
1634 new_spte = iter.old_spte & ~shadow_dirty_mask;
1635 else
1636 continue;
1637 }
1638
1639 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1640 goto retry;
1641
1642 spte_set = true;
1643 }
1644
1645 rcu_read_unlock();
1646 return spte_set;
1647 }
1648
1649 /*
1650 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1651 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1652 * If AD bits are not enabled, this will require clearing the writable bit on
1653 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1654 * be flushed.
1655 */
kvm_tdp_mmu_clear_dirty_slot(struct kvm * kvm,const struct kvm_memory_slot * slot)1656 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1657 const struct kvm_memory_slot *slot)
1658 {
1659 struct kvm_mmu_page *root;
1660 bool spte_set = false;
1661
1662 lockdep_assert_held_read(&kvm->mmu_lock);
1663
1664 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1665 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1666 slot->base_gfn + slot->npages);
1667
1668 return spte_set;
1669 }
1670
1671 /*
1672 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1673 * set in mask, starting at gfn. The given memslot is expected to contain all
1674 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1675 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1676 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1677 */
clear_dirty_pt_masked(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,unsigned long mask,bool wrprot)1678 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1679 gfn_t gfn, unsigned long mask, bool wrprot)
1680 {
1681 struct tdp_iter iter;
1682 u64 new_spte;
1683
1684 rcu_read_lock();
1685
1686 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1687 gfn + BITS_PER_LONG) {
1688 if (!mask)
1689 break;
1690
1691 if (iter.level > PG_LEVEL_4K ||
1692 !(mask & (1UL << (iter.gfn - gfn))))
1693 continue;
1694
1695 mask &= ~(1UL << (iter.gfn - gfn));
1696
1697 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1698 if (is_writable_pte(iter.old_spte))
1699 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1700 else
1701 continue;
1702 } else {
1703 if (iter.old_spte & shadow_dirty_mask)
1704 new_spte = iter.old_spte & ~shadow_dirty_mask;
1705 else
1706 continue;
1707 }
1708
1709 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1710 }
1711
1712 rcu_read_unlock();
1713 }
1714
1715 /*
1716 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1717 * set in mask, starting at gfn. The given memslot is expected to contain all
1718 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1719 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1720 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1721 */
kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,unsigned long mask,bool wrprot)1722 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1723 struct kvm_memory_slot *slot,
1724 gfn_t gfn, unsigned long mask,
1725 bool wrprot)
1726 {
1727 struct kvm_mmu_page *root;
1728
1729 lockdep_assert_held_write(&kvm->mmu_lock);
1730 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1731 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1732 }
1733
1734 /*
1735 * Clear leaf entries which could be replaced by large mappings, for
1736 * GFNs within the slot.
1737 */
zap_collapsible_spte_range(struct kvm * kvm,struct kvm_mmu_page * root,const struct kvm_memory_slot * slot)1738 static void zap_collapsible_spte_range(struct kvm *kvm,
1739 struct kvm_mmu_page *root,
1740 const struct kvm_memory_slot *slot)
1741 {
1742 gfn_t start = slot->base_gfn;
1743 gfn_t end = start + slot->npages;
1744 struct tdp_iter iter;
1745 int max_mapping_level;
1746 kvm_pfn_t pfn;
1747
1748 rcu_read_lock();
1749
1750 tdp_root_for_each_pte(iter, root, start, end) {
1751 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1752 continue;
1753
1754 if (!is_shadow_present_pte(iter.old_spte) ||
1755 !is_last_spte(iter.old_spte, iter.level))
1756 continue;
1757
1758 /*
1759 * This is a leaf SPTE. Check if the PFN it maps can
1760 * be mapped at a higher level.
1761 */
1762 pfn = spte_to_pfn(iter.old_spte);
1763
1764 if (kvm_is_reserved_pfn(pfn))
1765 continue;
1766
1767 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1768 iter.gfn, pfn, PG_LEVEL_NUM);
1769
1770 WARN_ON(max_mapping_level < iter.level);
1771
1772 /*
1773 * If this page is already mapped at the highest
1774 * viable level, there's nothing more to do.
1775 */
1776 if (max_mapping_level == iter.level)
1777 continue;
1778
1779 /*
1780 * The page can be remapped at a higher level, so step
1781 * up to zap the parent SPTE.
1782 */
1783 while (max_mapping_level > iter.level)
1784 tdp_iter_step_up(&iter);
1785
1786 /* Note, a successful atomic zap also does a remote TLB flush. */
1787 tdp_mmu_zap_spte_atomic(kvm, &iter);
1788
1789 /*
1790 * If the atomic zap fails, the iter will recurse back into
1791 * the same subtree to retry.
1792 */
1793 }
1794
1795 rcu_read_unlock();
1796 }
1797
1798 /*
1799 * Clear non-leaf entries (and free associated page tables) which could
1800 * be replaced by large mappings, for GFNs within the slot.
1801 */
kvm_tdp_mmu_zap_collapsible_sptes(struct kvm * kvm,const struct kvm_memory_slot * slot)1802 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1803 const struct kvm_memory_slot *slot)
1804 {
1805 struct kvm_mmu_page *root;
1806
1807 lockdep_assert_held_read(&kvm->mmu_lock);
1808
1809 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1810 zap_collapsible_spte_range(kvm, root, slot);
1811 }
1812
1813 /*
1814 * Removes write access on the last level SPTE mapping this GFN and unsets the
1815 * MMU-writable bit to ensure future writes continue to be intercepted.
1816 * Returns true if an SPTE was set and a TLB flush is needed.
1817 */
write_protect_gfn(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,int min_level)1818 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1819 gfn_t gfn, int min_level)
1820 {
1821 struct tdp_iter iter;
1822 u64 new_spte;
1823 bool spte_set = false;
1824
1825 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1826
1827 rcu_read_lock();
1828
1829 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1830 if (!is_shadow_present_pte(iter.old_spte) ||
1831 !is_last_spte(iter.old_spte, iter.level))
1832 continue;
1833
1834 new_spte = iter.old_spte &
1835 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1836
1837 if (new_spte == iter.old_spte)
1838 break;
1839
1840 tdp_mmu_set_spte(kvm, &iter, new_spte);
1841 spte_set = true;
1842 }
1843
1844 rcu_read_unlock();
1845
1846 return spte_set;
1847 }
1848
1849 /*
1850 * Removes write access on the last level SPTE mapping this GFN and unsets the
1851 * MMU-writable bit to ensure future writes continue to be intercepted.
1852 * Returns true if an SPTE was set and a TLB flush is needed.
1853 */
kvm_tdp_mmu_write_protect_gfn(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,int min_level)1854 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1855 struct kvm_memory_slot *slot, gfn_t gfn,
1856 int min_level)
1857 {
1858 struct kvm_mmu_page *root;
1859 bool spte_set = false;
1860
1861 lockdep_assert_held_write(&kvm->mmu_lock);
1862 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1863 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1864
1865 return spte_set;
1866 }
1867
1868 /*
1869 * Return the level of the lowest level SPTE added to sptes.
1870 * That SPTE may be non-present.
1871 *
1872 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1873 */
kvm_tdp_mmu_get_walk(struct kvm_vcpu * vcpu,u64 addr,u64 * sptes,int * root_level)1874 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1875 int *root_level)
1876 {
1877 struct tdp_iter iter;
1878 struct kvm_mmu *mmu = vcpu->arch.mmu;
1879 gfn_t gfn = addr >> PAGE_SHIFT;
1880 int leaf = -1;
1881
1882 *root_level = vcpu->arch.mmu->root_role.level;
1883
1884 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1885 leaf = iter.level;
1886 sptes[leaf] = iter.old_spte;
1887 }
1888
1889 return leaf;
1890 }
1891
1892 /*
1893 * Returns the last level spte pointer of the shadow page walk for the given
1894 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1895 * walk could be performed, returns NULL and *spte does not contain valid data.
1896 *
1897 * Contract:
1898 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1899 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1900 *
1901 * WARNING: This function is only intended to be called during fast_page_fault.
1902 */
kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu * vcpu,u64 addr,u64 * spte)1903 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1904 u64 *spte)
1905 {
1906 struct tdp_iter iter;
1907 struct kvm_mmu *mmu = vcpu->arch.mmu;
1908 gfn_t gfn = addr >> PAGE_SHIFT;
1909 tdp_ptep_t sptep = NULL;
1910
1911 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1912 *spte = iter.old_spte;
1913 sptep = iter.sptep;
1914 }
1915
1916 /*
1917 * Perform the rcu_dereference to get the raw spte pointer value since
1918 * we are passing it up to fast_page_fault, which is shared with the
1919 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1920 * annotation.
1921 *
1922 * This is safe since fast_page_fault obeys the contracts of this
1923 * function as well as all TDP MMU contracts around modifying SPTEs
1924 * outside of mmu_lock.
1925 */
1926 return rcu_dereference(sptep);
1927 }
1928