[PATCH v3 2/3] RISC-V: KVM: Allow splitting huge pages to arbitrary level

From: wang.yechao255

Date: Mon Mar 16 2026 - 02:20:33 EST


From: Wang Yechao <wang.yechao255@xxxxxxxxxx>

Introduce kvm_riscv_gstage_split_huge(), which splits the huge page
covering a given guest physical address down to a specified target
level (e.g. from 1G to 2M, or from 2M to 4K). The caller provides a
memory cache for allocating any intermediate page tables and may
request a TLB flush after the split.

This will be used by subsequent patches to split huge pages before
handling write-protection faults, and by other operations that
require page-level granularity.

Signed-off-by: Wang Yechao <wang.yechao255@xxxxxxxxxx>
---
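For reviewers, a minimal sketch of the intended call pattern (this
call site is hypothetical; the real users are added by the later
patches in this series):

	/*
	 * Split the huge mapping covering 'gpa' down to 4K (level 0)
	 * before write-protecting a single page. The cache must be
	 * topped up with enough pages for the intermediate tables.
	 */
	ret = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache,
					 kvm_riscv_gstage_pgd_levels);
	if (!ret)
		ret = kvm_riscv_gstage_split_huge(gstage,
						  &vcpu->arch.mmu_page_cache,
						  gpa, 0, true);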
 arch/riscv/include/asm/kvm_gstage.h |  4 ++
 arch/riscv/kvm/gstage.c             | 69 +++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
index 595e2183173e..373748c6745e 100644
--- a/arch/riscv/include/asm/kvm_gstage.h
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 			      bool page_rdonly, bool page_exec,
 			      struct kvm_gstage_mapping *out_map);

+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+				struct kvm_mmu_memory_cache *pcache,
+				gpa_t addr, u32 target_level, bool flush);
+
enum kvm_riscv_gstage_op {
GSTAGE_OP_NOP = 0, /* Nothing */
GSTAGE_OP_CLEAR, /* Clear/Unmap */
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index d2001d508046..5356abb18932 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -209,6 +209,75 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
 }

+static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
+					   unsigned long child_page_size)
+{
+	unsigned long child_pte = huge_pte;
+	unsigned long child_pfn_offset;
+
+	/*
+	 * The child PTE inherits the base PFN of the huge page being
+	 * split, so we only need to OR in the PFN offset of the child
+	 * page at the next lower level for the given index.
+	 */
+	child_pfn_offset = index * (child_page_size / PAGE_SIZE);
+	child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
+
+	return child_pte;
+}
+
+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+				struct kvm_mmu_memory_cache *pcache,
+				gpa_t addr, u32 target_level, bool flush)
+{
+	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	pte_t *next_ptep = (pte_t *)gstage->pgd;
+	pte_t *ptep;
+	unsigned long huge_pte, child_pte;
+	unsigned long child_page_size;
+	int i, ret;
+
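+	/*
+	 * Walk from the root, descending through non-leaf entries and
+	 * splitting any leaf found above target_level.
+	 */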
+	while (current_level > target_level) {
+		ptep = &next_ptep[gstage_pte_index(addr, current_level)];
+
+		if (!pte_val(ptep_get(ptep)))
+			break;
+
+		if (!gstage_pte_leaf(ptep)) {
+			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+			current_level--;
+			continue;
+		}
+
+		huge_pte = pte_val(ptep_get(ptep));
+
+		ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
+		if (ret)
+			return ret;
+
+		if (!pcache)
+			return -ENOMEM;
+		next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+		if (!next_ptep)
+			return -ENOMEM;
+
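+		/* Each child PTE maps one slice of the original huge mapping. */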
+		for (i = 0; i < PTRS_PER_PTE; i++) {
+			child_pte = make_child_pte(huge_pte, i, child_page_size);
+			set_pte(&next_ptep[i], __pte(child_pte));
+		}
+
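+		/* Replace the huge leaf with a pointer to the new table. */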
+		set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+				      __pgprot(_PAGE_TABLE)));
+
+		if (flush)
+			gstage_tlb_flush(gstage, current_level, addr);
+
+		current_level--;
+	}
+
+	return 0;
+}
+
 void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
{
--
2.27.0