Re: [RFC, PATCH 02/12] x86/virt/tdx: Allocate reference counters for PAMT memory
From: Huang, Kai
Date: Mon May 05 2025 - 07:05:34 EST
> +static atomic_t *pamt_refcounts;
> +
> static enum tdx_module_status_t tdx_module_status;
> static DEFINE_MUTEX(tdx_module_lock);
>
> @@ -1035,9 +1038,108 @@ static int config_global_keyid(void)
> return ret;
> }
>
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa)
> +{
> + return &pamt_refcounts[hpa / PMD_SIZE];
> +}
> +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount);
It's not quite clear why this function needs to be exported in this patch. IMO
it's better to move the export to the patch which actually needs it.
Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code. But
I think we should just put them here in this file. tdx_alloc_page() and
tdx_free_page() should be in this file too.
And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can
export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to
allow the TDX users (e.g., KVM) to allocate/free TDX private pages. How PAMT
pages are allocated is then hidden in the core TDX code.
> +
> +static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
> +{
> + unsigned long vaddr;
> + pte_t entry;
> +
> + if (!pte_none(ptep_get(pte)))
> + return 0;
> +
> + vaddr = __get_free_page(GFP_KERNEL | __GFP_ZERO);
> + if (!vaddr)
> + return -ENOMEM;
> +
> + entry = pfn_pte(PFN_DOWN(__pa(vaddr)), PAGE_KERNEL);
> +
> + spin_lock(&init_mm.page_table_lock);
> + if (pte_none(ptep_get(pte)))
> + set_pte_at(&init_mm, addr, pte, entry);
> + else
> + free_page(vaddr);
> + spin_unlock(&init_mm.page_table_lock);
> +
> + return 0;
> +}
> +
> +static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr,
> + void *data)
> +{
> + unsigned long vaddr;
> +
> + vaddr = (unsigned long)__va(PFN_PHYS(pte_pfn(ptep_get(pte))));
> +
> + spin_lock(&init_mm.page_table_lock);
> + if (!pte_none(ptep_get(pte))) {
> + pte_clear(&init_mm, addr, pte);
> + free_page(vaddr);
> + }
> + spin_unlock(&init_mm.page_table_lock);
> +
> + return 0;
> +}
> +
> +static int alloc_tdmr_pamt_refcount(struct tdmr_info *tdmr)
> +{
> + unsigned long start, end;
> +
> + start = (unsigned long)tdx_get_pamt_refcount(tdmr->base);
> + end = (unsigned long)tdx_get_pamt_refcount(tdmr->base + tdmr->size);
> + start = round_down(start, PAGE_SIZE);
> + end = round_up(end, PAGE_SIZE);
> +
> + return apply_to_page_range(&init_mm, start, end - start,
> + pamt_refcount_populate, NULL);
> +}
IIUC, populating refcount based on TDMR will slightly waste memory. The reason
is IIUC we don't need to populate the refcount for a 2M range if the range is
completely marked as reserved in TDMR, because it's not possible for the kernel
to use such range for TDX.
Populating based on the list of TDX memory blocks should be better. In
practice, the difference should be unnoticeable, but conceptually, using TDX
memory blocks is better.
> +
> +static int init_pamt_metadata(void)
> +{
> + size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> + struct vm_struct *area;
> +
> + if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
> + return 0;
> +
> + /*
> + * Reserve vmalloc range for PAMT reference counters. It covers all
> + * physical address space up to max_pfn. It is going to be populated
> + * from init_tdmr() only for present memory that available for TDX use.
> + */
> + area = get_vm_area(size, VM_IOREMAP);
> + if (!area)
> + return -ENOMEM;
> +
> + pamt_refcounts = area->addr;
> + return 0;
> +}
> +
> +static void free_pamt_metadata(void)
> +{
> + size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
> +
> + size = round_up(size, PAGE_SIZE);
> + apply_to_existing_page_range(&init_mm,
> + (unsigned long)pamt_refcounts,
> + size, pamt_refcount_depopulate,
> + NULL);
> + vfree(pamt_refcounts);
> + pamt_refcounts = NULL;
> +}
> +
> static int init_tdmr(struct tdmr_info *tdmr)
> {
> u64 next;
> + int ret;
> +
> + ret = alloc_tdmr_pamt_refcount(tdmr);
> + if (ret)
> + return ret;
>
> /*
> * Initializing a TDMR can be time consuming. To avoid long
> @@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr)
> struct tdx_module_args args = {
> .rcx = tdmr->base,
> };
> - int ret;
>
> ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
> if (ret)
> @@ -1134,10 +1235,15 @@ static int init_tdx_module(void)
> if (ret)
> goto err_reset_pamts;
>
> + /* Reserve vmalloc range for PAMT reference counters */
> + ret = init_pamt_metadata();
> + if (ret)
> + goto err_reset_pamts;
> +
> /* Initialize TDMRs to complete the TDX module initialization */
> ret = init_tdmrs(&tdx_tdmr_list);
> if (ret)
> - goto err_reset_pamts;
> + goto err_free_pamt_metadata;
>
> pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
>
> @@ -1149,6 +1255,9 @@ static int init_tdx_module(void)
> put_online_mems();
> return ret;
>
> +err_free_pamt_metadata:
> + free_pamt_metadata();
> +
> err_reset_pamts:
> /*
> * Part of PAMTs may already have been initialized by the