Re: [PATCH 13/28] KVM: x86/mmu: split XS/XU bits for EPT

From: mlevitsk

Date: Tue Jun 02 2026 - 10:40:19 EST


On Tue, 2026-05-05 at 21:52 +0200, Paolo Bonzini wrote:
> When EPT is in use, replace ACC_USER_MASK with ACC_USER_EXEC_MASK,
> so that supervisor and user-mode execution can be controlled
> independently (ACC_USER_MASK would not allow a setting similar to
> XU=0 XS=1 W=1 R=1).
>
> Replace shadow_x_mask with shadow_xs_mask/shadow_xu_mask, to allow setting
> XS and XU bits separately in EPT entries.
>
> In fact, ACC_USER_EXEC_MASK is already set through ACC_ALL in the
> kvm_mmu_page roles and propagates to the XU bit of sPTEs even if
> MBEC is not (yet) enabled in the execution controls.  This is fine,
> because the XU bit is ignored by the processor, and even once KVM
> supports MBEC this mode will remain for processors that lack the
> feature.
>
> Tested-by: David Riley <d.riley@xxxxxxxxxxx>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.h          |  3 +-
>  arch/x86/kvm/mmu/mmu.c      |  2 +-
>  arch/x86/kvm/mmu/mmutrace.h |  6 ++--
>  arch/x86/kvm/mmu/spte.c     | 62 ++++++++++++++++++++++++++-----------
>  arch/x86/kvm/mmu/spte.h     | 16 +++++++---
>  5 files changed, 62 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 63be5c5efed9..d8c13e43c2d7 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -39,7 +39,8 @@ extern bool __read_mostly enable_mmio_caching;
>  
>  #define ACC_READ_MASK    PT_PRESENT_MASK
>  #define ACC_WRITE_MASK   PT_WRITABLE_MASK
> -#define ACC_USER_MASK    PT_USER_MASK
> +#define ACC_USER_MASK    PT_USER_MASK   /* non EPT */
> +#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */
>  #define ACC_EXEC_MASK    8
>  #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK)
>  
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3dbac7ad044f..16eaf413b299 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5491,7 +5491,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
>  static inline bool boot_cpu_is_amd(void)
>  {
>   WARN_ON_ONCE(!tdp_enabled);
> - return shadow_x_mask == 0;
> + return shadow_xs_mask == 0;
>  }
>  
>  /*
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index dcfdfedfc4e9..3429c1413f42 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -357,8 +357,8 @@ TRACE_EVENT(
>   __entry->sptep = virt_to_phys(sptep);
>   __entry->level = level;
>   __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
> - __entry->x = is_executable_pte(__entry->spte);
> - __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
> + __entry->x = (__entry->spte & (shadow_xs_mask | shadow_nx_mask)) == shadow_xs_mask;
> + __entry->u = !!(__entry->spte & (shadow_xu_mask | shadow_user_mask));
>   ),
>  
>   TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
> @@ -366,7 +366,7 @@ TRACE_EVENT(
>     __entry->r ? "r" : "-",
>     __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
>     __entry->x ? "x" : "-",
> -   __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
> +   __entry->u ? "u" : "-",
>     __entry->level, __entry->sptep
>   )
>  );
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index 1b7fb508098b..f41573b0ccfa 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -29,8 +29,9 @@ bool __read_mostly kvm_ad_enabled;
>  u64 __read_mostly shadow_host_writable_mask;
>  u64 __read_mostly shadow_mmu_writable_mask;
>  u64 __read_mostly shadow_nx_mask;
> -u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
>  u64 __read_mostly shadow_user_mask;
> +u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */
> +u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */
>  u64 __read_mostly shadow_accessed_mask;
>  u64 __read_mostly shadow_dirty_mask;
>  u64 __read_mostly shadow_mmio_value;
> @@ -217,21 +218,26 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>   * would tie make_spte() further to vCPU/MMU state, and add complexity
>   * just to optimize a mode that is anything but performance critical.
>   */
> - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
> -     is_nx_huge_page_enabled(vcpu->kvm)) {
> + if (level > PG_LEVEL_4K && is_nx_huge_page_enabled(vcpu->kvm)) {
>   pte_access &= ~ACC_EXEC_MASK;
> + if (shadow_xu_mask)
> + pte_access &= ~ACC_USER_EXEC_MASK;
>   }
>  
>   if (pte_access & ACC_READ_MASK)
>   spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */
>  
> - if (pte_access & ACC_EXEC_MASK)
> - spte |= shadow_x_mask;
> - else
> - spte |= shadow_nx_mask;
> -
> - if (pte_access & ACC_USER_MASK)
> - spte |= shadow_user_mask;
> + if (shadow_nx_mask) {
> + if (!(pte_access & ACC_EXEC_MASK))
> + spte |= shadow_nx_mask;
> + if (pte_access & ACC_USER_MASK)
> + spte |= shadow_user_mask;
> + } else {
> + if (pte_access & ACC_EXEC_MASK)
> + spte |= shadow_xs_mask;
> + if (pte_access & ACC_USER_EXEC_MASK)
> + spte |= shadow_xu_mask;
> + }

Hi!

Looks correct but at some point a question starts to ask itself: 

Do you think that the time come to give up and add an is_ept/is_npt flags, and use that instead of checking for various properties that happen to be specific to EPT/x86/NPT paging, 
such as non-zero shadow_nx_mask?

The intention behind this is very good, but in practice IMHO, the code still assumes
hard a EPT or NPT/x86 and there is roughly 0% chance that it will work if some
vendor ever ships a CPU with TDP paging that uses weird half-EPT, half-NPT entries.

I do understand that this is not black and white because x86 paging and NPT also do
differ, but only slightly.

I think we should have:

is_ept() for cases which are explicitly EPT-only (e.g emulation of A/D)
is_npt() for cases which are explicitly NPT (e.g NPT specific checks on U bit with and without GMET)
is_x86() for cases which are common for NPT or EPT (or use !is_ept())

What do you think?

Sorry for ramblings, this is just an idea for a possible refactoring, no need to do it now :)
BTW, actually we even already have boot_cpu_is_amd, which checks 'shadow_xs_mask == 0'...

>  
>   if (level > PG_LEVEL_4K)
>   spte |= PT_PAGE_SIZE_MASK;
> @@ -318,11 +324,13 @@ static u64 change_spte_executable(u64 spte, u8 access)
>  {
>   u64 set, clear;
>  
> - if (access & ACC_EXEC_MASK)
> - set = shadow_x_mask;
> + if (shadow_nx_mask)
> + set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask;
>   else
> - set = shadow_nx_mask;
> - clear = set ^ (shadow_nx_mask | shadow_x_mask);
> + set =
> + (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) |
> + (access & ACC_USER_EXEC_MASK ? shadow_xu_mask : 0);
> + clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask);
>   return modify_spte_protections(spte, set, clear);
>  }
>  
> @@ -389,7 +397,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
>  
>   spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
>   PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ |
> - shadow_user_mask | shadow_x_mask | shadow_me_value;
> + shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value;
>  
>   if (ad_disabled)
>   spte |= SPTE_TDP_AD_DISABLED;
> @@ -497,10 +505,27 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits)
>   shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
>   shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
>   shadow_nx_mask = 0ull;
> - shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
> + shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK;
> +
> + /*
> + * The MMU always maps ACC_EXEC_MASK and ACC_USER_EXEC_MASK to the
> + * XS and XU bits of shadow EPT entries, regardless of whether MBEC
> + * is available on the host or enabled in the VMCS.
> + *
> + * For the non-nested case, pages are mapped with ACC_EXEC_MASK
> + * and ACC_USER_EXEC_MASK set in tandem, so XS == XU and the
> + * host's MBEC setting does not matter.  On hardware without MBEC
> + * the XU bit is reserved-as-ignored, and setting it does no harm.
> + *
> + * For nested EPT MBEC is not supported, but bit 10 of the gPTE has
> + * no effect because (a) is_present_gpte() does not treat it as a
> + * present bit, and (b) permission_fault() uses an mmu->permissions[]
> + * array that effectively ignores ACC_USER_EXEC_MASK.
> + */




> + shadow_xu_mask = VMX_EPT_USER_EXECUTABLE_MASK;
>   shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;
>  
> - shadow_acc_track_mask = VMX_EPT_RWX_MASK;
> + shadow_acc_track_mask = VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK;
>   shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
>   shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
>  
> @@ -548,7 +573,8 @@ void kvm_mmu_reset_all_pte_masks(void)
>   shadow_accessed_mask = PT_ACCESSED_MASK;
>   shadow_dirty_mask = PT_DIRTY_MASK;
>   shadow_nx_mask = PT64_NX_MASK;
> - shadow_x_mask = 0;
> + shadow_xs_mask = 0;
> + shadow_xu_mask = 0;
>   shadow_present_mask = PT_PRESENT_MASK;
>  
>   shadow_acc_track_mask = 0;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 8a4c09c5cdbf..f5261d993eac 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -24,7 +24,7 @@
>   * - bits 55 (EPT only): MMU-writable
>   * - bits 56-59: unused
>   * - bits 60-61: type of A/D tracking
> - * - bits 62: unused
> + * - bits 62 (EPT only): saved XU bit for disabled AD
>   */
>  
>  /*
> @@ -65,7 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
>   * must not overlap the A/D type mask.
>   */
>  #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (VMX_EPT_READABLE_MASK | \
> -   VMX_EPT_EXECUTABLE_MASK)
> +   VMX_EPT_EXECUTABLE_MASK | \
> +   VMX_EPT_USER_EXECUTABLE_MASK)
>  #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 52
>  #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
>   SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
> @@ -178,8 +179,9 @@ extern bool __read_mostly kvm_ad_enabled;
>  extern u64 __read_mostly shadow_host_writable_mask;
>  extern u64 __read_mostly shadow_mmu_writable_mask;
>  extern u64 __read_mostly shadow_nx_mask;
> -extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
>  extern u64 __read_mostly shadow_user_mask;
> +extern u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */
> +extern u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */
>  extern u64 __read_mostly shadow_accessed_mask;
>  extern u64 __read_mostly shadow_dirty_mask;
>  extern u64 __read_mostly shadow_mmio_value;
> @@ -357,7 +359,13 @@ static inline bool is_last_spte(u64 pte, int level)
>  
>  static inline bool is_executable_pte(u64 spte)
>  {
> - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> + /*
> + * For now, return true if either the XS or XU bit is set
> + * This function is only used for fast_page_fault,
> + * which never processes shadow EPT, and regular page
> + * tables always have XS==XU.
> + */
> + return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
>  }
>  
>  static inline kvm_pfn_t spte_to_pfn(u64 pte)



Looks all correct to me.

Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>

Best regards,
Maxim Levitsky