// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

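/*
 * Controls whether the TDP MMU is used for new VMs; each VM latches the
 * value when it is created (see kvm_mmu_init_tdp_mmu()).
 */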
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

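/* Free the page table page and the struct kvm_mmu_page header backing it. */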
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * Because TDP MMU page table memory is only accessed within an RCU read
 * critical section, and is only freed after a grace period, lockless
 * walkers will never use the memory after it has been freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

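/*
 * Drop a reference to a TDP MMU root. When the last reference is put, the
 * root is unlinked from the list of roots, its paging structure is zapped,
 * and its pages are freed after an RCU grace period.
 */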
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after root (or the first valid root if root
 * is NULL), takes a reference on it, and returns that next root. If root
 * is not NULL, this thread should have already taken a reference on it, and
 * that reference will be dropped. If no valid root is found, this
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&_kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock)) \
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

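/* Compute the page role for a TDP MMU page table at the given level. */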
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;
	role.ad_disabled = !shadow_accessed_mask;

	return role;
}

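/*
 * Allocate a TDP MMU shadow page and its backing page table from the vCPU's
 * memory caches and initialize its role, GFN, and back-pointer.
 */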
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

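/*
 * Get the TDP MMU root for the vCPU's current MMU role, allocating a new
 * root if no matching root exists, and return its physical address.
 */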
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

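/*
 * Propagate accessed information to the primary MMU: mark the old PFN
 * accessed if a previously accessed leaf SPTE no longer records the access.
 */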
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

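/*
 * Update the memslot dirty bitmap when a 4K SPTE becomes writable (or is
 * remapped to a new PFN with write access).
 */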
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool account_nx)
{
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * keep setting the SPTE until the value read back
			 * is something other than the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

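/*
 * Wrapper around __handle_changed_spte() that also performs the accessed and
 * dirty logging bookkeeping for the change.
 */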
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}

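/*
 * Zap a SPTE while holding the MMU lock for read: freeze it by atomically
 * replacing it with REMOVED_SPTE, flush remote TLBs, then clear it. Returns
 * false if the SPTE could not be frozen.
 */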
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

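/*
 * Convenience wrappers around __tdp_mmu_set_spte(); the _no_acc_track and
 * _no_dirty_log variants suppress accessed or dirty propagation respectively
 * (see the record_acc_track/record_dirty_log comments above).
 */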
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

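/*
 * Iterator helpers: walk all SPTEs (or only present leaf SPTEs) of a given
 * root, or of the vCPU's current root, over the GFN range [_start, _end).
 */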
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      false);

	return flush;
}

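/*
 * Zap all SPTEs in every TDP MMU root, across all address spaces, and flush
 * the TLBs if anything was zapped.
 */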
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

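/*
 * Find the next invalidated root after prev_root (or the first one if
 * prev_root is NULL), i.e. the next invalid root that still holds the
 * reference taken in kvm_tdp_mmu_invalidate_all_roots().
 */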
static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_roots
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
 * done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp,
						  fault->huge_page_disallowed &&
						  fault->req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != fault->goal_level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
	rcu_read_unlock();

	return ret;
}

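/*
 * Zap the GFN range covered by an MMU notifier event in all roots for the
 * range's address space. Returns true if a TLB flush is needed.
 */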
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
		flush = zap_gfn_range(kvm, root, range->start, range->end,
				      range->may_block, flush, false);

	return flush;
}

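/*
 * Callback type and dispatch helper for MMU notifier events that operate on
 * individual leaf SPTEs within a GFN range.
 */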
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and
 * return true if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

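/* Check whether the SPTE has been accessed, without clearing that state. */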
static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

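/*
 * Handle the change_pte notification for a single 4K SPTE: zap the old SPTE
 * and, if the new host PTE is read-only, install a new SPTE for the new PFN.
 */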
static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
						  slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}