[PATCH 10/24] KVM: x86/mmu: split XS/XU bits for EPT

From: Paolo Bonzini

Date: Thu Mar 26 2026 - 14:23:33 EST


When EPT is in use, replace ACC_USER_MASK with ACC_USER_EXEC_MASK,
so that supervisor-mode and user-mode execution can be controlled
independently (ACC_USER_MASK cannot express a setting such as
XU=0 XS=1 W=1 R=1).
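
For illustration only (not kernel code), a minimal standalone sketch
of the EPT leaf permission layout; the bit positions follow the SDM,
the macro names are made up for the example:

  /* Standalone sketch of EPT leaf permission bits (Intel SDM);
   * macro names are illustrative, not kernel symbols. */
  #include <stdint.h>
  #include <stdio.h>

  #define EPT_R   (1ull << 0)   /* readable */
  #define EPT_W   (1ull << 1)   /* writable */
  #define EPT_XS  (1ull << 2)   /* execute; supervisor-only if MBEC is on */
  #define EPT_XU  (1ull << 10)  /* user execute, valid only with MBEC */

  int main(void)
  {
          /* Kernel-executable but not user-executable: a combination
           * a single "user" access bit cannot describe. */
          uint64_t pte = EPT_R | EPT_W | EPT_XS;

          printf("XU=%d XS=%d W=%d R=%d\n",
                 !!(pte & EPT_XU), !!(pte & EPT_XS),
                 !!(pte & EPT_W), !!(pte & EPT_R));
          return 0;
  }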

Replace shadow_x_mask with shadow_xs_mask/shadow_xu_mask, to allow
setting the XS and XU bits separately in EPT entries.
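
A standalone sketch of the resulting execute-bit selection (the helper
name and local ACC_* defines are for the example only; the real logic
is in the make_spte() hunk below):

  /* Sketch of the execute-bit selection on EPT; ACC_* values match
   * spte.h, the helper is made up for the example. */
  #include <stdint.h>

  #define ACC_EXEC_MASK       8   /* supervisor execute (XS) */
  #define ACC_USER_EXEC_MASK  4   /* user execute (XU), aliases ACC_USER_MASK */

  uint64_t ept_exec_bits(unsigned int pte_access,
                         uint64_t xs_mask, uint64_t xu_mask)
  {
          uint64_t spte = 0;

          if (pte_access & ACC_EXEC_MASK)
                  spte |= xs_mask;
          if (pte_access & ACC_USER_EXEC_MASK)
                  spte |= xu_mask;
          /* As long as xs_mask == xu_mask (as in this patch), both
           * branches set the same bit and behavior is unchanged. */
          return spte;
  }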

Note that ACC_USER_EXEC_MASK is already set through ACC_ALL in
the kvm_mmu_page roles, but for now it has no separate effect on
the XU bit because shadow_xs_mask == shadow_xu_mask. Access
tracking for eptad=0, on the other hand, does take the XU bit
into account when saving and restoring page permissions.
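
For eptad=0, the idea is that the permission bits in
shadow_acc_track_mask are stashed in spare SPTE bits when access
tracking revokes them and put back on the next access; a simplified
standalone sketch of that scheme (shift value and names are
illustrative, not KVM's actual encoding):

  /* Simplified sketch of eptad=0 access tracking save/restore;
   * the shift and function names are illustrative only. */
  #include <stdint.h>

  #define SAVED_SHIFT 40  /* illustrative; KVM uses its own encoding */

  uint64_t mark_for_access_track(uint64_t spte, uint64_t track_mask)
  {
          spte |= (spte & track_mask) << SAVED_SHIFT;  /* save RWX (and XU) */
          return spte & ~track_mask;                   /* revoke them */
  }

  uint64_t restore_access_track(uint64_t spte, uint64_t track_mask)
  {
          spte |= (spte >> SAVED_SHIFT) & track_mask;  /* put perms back */
          return spte & ~(track_mask << SAVED_SHIFT);  /* drop saved copy */
  }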

Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/mmu/mmutrace.h | 6 ++---
arch/x86/kvm/mmu/spte.c | 49 +++++++++++++++++++++++--------------
arch/x86/kvm/mmu/spte.h | 8 +++---
4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dd5419a1f891..a6ee467ad838 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5472,7 +5472,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
static inline bool boot_cpu_is_amd(void)
{
WARN_ON_ONCE(!tdp_enabled);
- return shadow_x_mask == 0;
+ return shadow_xs_mask == 0;
}

/*
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index dcfdfedfc4e9..3429c1413f42 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -357,8 +357,8 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
- __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ __entry->x = (__entry->spte & (shadow_xs_mask | shadow_nx_mask)) == shadow_xs_mask;
+ __entry->u = !!(__entry->spte & (shadow_xu_mask | shadow_user_mask));
),

TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
@@ -366,7 +366,7 @@ TRACE_EVENT(
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
__entry->x ? "x" : "-",
- __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->u ? "u" : "-",
__entry->level, __entry->sptep
)
);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 7b5f118ae211..fc7eb73476f6 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -29,8 +29,9 @@ bool __read_mostly kvm_ad_enabled;
u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
-u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_xs_mask; /* mutually exclusive with nx_mask and user_mask */
+u64 __read_mostly shadow_xu_mask; /* mutually exclusive with nx_mask and user_mask */
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
@@ -216,22 +217,30 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* when CR0.PG is toggled, but leveraging that to ignore the mitigation
* would tie make_spte() further to vCPU/MMU state, and add complexity
* just to optimize a mode that is anything but performance critical.
+ *
+ * Check ACC_USER_EXEC_MASK here as well, assuming that only Intel
+ * processors (EPT) are affected by the NX huge page erratum.
*/
- if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+ if (level > PG_LEVEL_4K &&
+ (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) &&
is_nx_huge_page_enabled(vcpu->kvm)) {
- pte_access &= ~ACC_EXEC_MASK;
+ pte_access &= ~(ACC_EXEC_MASK | ACC_USER_EXEC_MASK);
}

if (pte_access & ACC_READ_MASK)
spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */

- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
-
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
+ if (shadow_nx_mask) {
+ if (!(pte_access & ACC_EXEC_MASK))
+ spte |= shadow_nx_mask;
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+ } else {
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_xs_mask;
+ if (pte_access & ACC_USER_EXEC_MASK)
+ spte |= shadow_xu_mask;
+ }

if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
@@ -318,11 +327,13 @@ static u64 make_spte_executable(u64 spte, u8 access)
{
u64 set, clear;

- if (access & ACC_EXEC_MASK)
- set = shadow_x_mask;
+ if (shadow_nx_mask)
+ set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask;
else
- set = shadow_nx_mask;
- clear = set ^ (shadow_nx_mask | shadow_x_mask);
+ set =
+ (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) |
+ (access & ACC_USER_EXEC_MASK ? shadow_xu_mask : 0);
+ clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask);
return modify_spte_protections(spte, set, clear);
}

@@ -389,7 +400,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)

spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ |
- shadow_user_mask | shadow_x_mask | shadow_me_value;
+ shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value;

if (ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
@@ -497,10 +508,11 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits)
shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
- shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_xu_mask = VMX_EPT_EXECUTABLE_MASK;
shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;

- shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;

@@ -548,7 +560,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_accessed_mask = PT_ACCESSED_MASK;
shadow_dirty_mask = PT_DIRTY_MASK;
shadow_nx_mask = PT64_NX_MASK;
- shadow_x_mask = 0;
+ shadow_xs_mask = 0;
+ shadow_xu_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;

shadow_acc_track_mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 121bfb2217e8..204f16aaf4e5 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -54,7 +54,8 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);

#define ACC_READ_MASK PT_PRESENT_MASK
#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
+#define ACC_USER_MASK PT_USER_MASK /* non-EPT */
+#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */
#define ACC_EXEC_MASK 8
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK)

@@ -184,8 +185,9 @@ extern bool __read_mostly kvm_ad_enabled;
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
-extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_xs_mask; /* mutually exclusive with nx_mask and user_mask */
+extern u64 __read_mostly shadow_xu_mask; /* mutually exclusive with nx_mask and user_mask */
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
@@ -363,7 +365,7 @@ static inline bool is_last_spte(u64 pte, int level)

static inline bool is_executable_pte(u64 spte)
{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+ return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
}

static inline kvm_pfn_t spte_to_pfn(u64 pte)
--
2.53.0