[PATCH] convert from stack to forward loop
From: Nico Pache
Date: Tue Jun 02 2026 - 12:26:18 EST
Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
---
mm/khugepaged.c | 96 ++++++++-----------------------------------------
1 file changed, 15 insertions(+), 81 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 498eba009751..6de935e76ceb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -100,28 +100,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __ro_after_init;
#define KHUGEPAGED_MIN_MTHP_ORDER 2
-/*
- * mthp_collapse() does an iterative DFS over a binary tree, from
- * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
- * size needed for a DFS on a binary tree is height + 1, where
- * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
- *
- * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
- * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
- */
-#define MTHP_STACK_SIZE (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
-
-/*
- * Defines a range of PTE entries in a PTE page table which are being
- * considered for mTHP collapse.
- *
- * @offset: the offset of the first PTE entry in a PMD range.
- * @order: the order of the PTE entries being considered for collapse.
- */
-struct mthp_range {
- u16 offset;
- u8 order;
-};
struct collapse_control {
bool is_khugepaged;
@@ -137,7 +115,6 @@ struct collapse_control {
/* Each bit represents a single occupied (!none/zero) page. */
DECLARE_BITMAP(mthp_present_ptes, MAX_PTRS_PER_PTE);
- struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
};
/**
@@ -1458,50 +1435,14 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
return result;
}
-static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
- u16 offset, u8 order)
-{
- const int size = *stack_size;
- struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
-
- VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
- stack->order = order;
- stack->offset = offset;
- (*stack_size)++;
-}
-
-static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
- int *stack_size)
-{
- const int size = *stack_size;
-
- VM_WARN_ON_ONCE(size <= 0);
- (*stack_size)--;
- return cc->mthp_bitmap_stack[size - 1];
-}
-
/*
* mthp_collapse() consumes the bitmap that is generated during
* collapse_scan_pmd() to determine what regions and mTHP orders fit best.
*
* Each bit in cc->mthp_present_ptes represents a single occupied (!none/zero)
- * page. A stack structure cc->mthp_bitmap_stack is used to check different
- * regions of the bitmap for collapse eligibility. The stack maintains a pair
- * of variables (offset, order), indicating the number of PTEs from the start
- * of the PMD, and the order of the potential collapse candidate respectively.
- * We start at the PMD order and check if it is eligible for collapse; if not,
- * we add two entries to the stack at a lower order to represent the left and
- * right halves of the PTE page table we are examining.
- *
- * offset mid_offset
- * | |
- * | |
- * v v
- * --------------------------------------
- * | cc->mthp_present_ptes |
- * --------------------------------------
- * <-------><------->
- * order-1 order-1
+ * page. We start at the PMD order and check if it is eligible for collapse;
+ * if not, we check the left and right halves of the PTE page table we are
+ * examining at a lower order.
*
* For each of these, we determine how many PTE entries are occupied in the
* range of PTE entries we propose to collapse, then we compare this to a
@@ -1517,26 +1458,20 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
{
unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
enum scan_result last_result = SCAN_FAIL;
- int collapsed = 0, stack_size = 0;
+ int collapsed = 0;
bool alloc_failed = false;
unsigned long collapse_address;
- struct mthp_range range;
- u16 offset;
- u8 order;
+ unsigned int offset = 0;
+ unsigned int order = HPAGE_PMD_ORDER;
- collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
- while (stack_size) {
- range = collapse_mthp_stack_pop(cc, &stack_size);
- order = range.order;
- offset = range.offset;
+ while (offset < HPAGE_PMD_NR) {
nr_ptes = 1UL << order;
if (!test_bit(order, &enabled_orders))
goto next_order;
max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
-
nr_occupied_ptes = bitmap_weight_from(cc->mthp_present_ptes, offset,
offset + nr_ptes);
@@ -1553,7 +1488,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
collapsed += nr_ptes;
fallthrough;
case SCAN_PTE_MAPPED_HUGEPAGE:
- continue;
+ goto next_offset;
/* Cases where lower orders might still succeed */
case SCAN_ALLOC_HUGE_PAGE_FAIL:
alloc_failed = true;
@@ -1581,15 +1516,14 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
}
next_order:
- if ((BIT(order) - 1) & enabled_orders) {
- const u8 next_order = order - 1;
- const u16 mid_offset = offset + (nr_ptes / 2);
-
- collapse_mthp_stack_push(cc, &stack_size, mid_offset,
- next_order);
- collapse_mthp_stack_push(cc, &stack_size, offset,
- next_order);
+ if (order > KHUGEPAGED_MIN_MTHP_ORDER &&
+ (BIT(order) - 1) & enabled_orders) {
+ order = order - 1;
+ continue;
}
+next_offset:
+ offset += nr_ptes;
+ order = min_t(int, __ffs(offset), HPAGE_PMD_ORDER);
}
done:
if (collapsed)
--
2.54.0
>
> [...]
>
>>>> + bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
>>>> memset(cc->node_load, 0, sizeof(cc->node_load));
>>>> nodes_clear(cc->alloc_nmask);
>>>> +
>>>> + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
>>>> +
>>>> + /*
>>>> + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
>>>> + * scan all pages to populate the bitmap for mTHP collapse.
>>>> + */
>>>
>>> You should note here, that we re-verify in mthp_collapse().
>>>
>>> But the question is, whether we should relocate the check completely into
>>> mthp_collapse(), instead of conditionally duplicating it.
>>>
>>> What speaks against always populating the bitmap and making the decision in
>>> mthp_collapse()?
>>>
>>> Sure, we might scan a page table a bit longer, but the code gets clearer ... and
>>> I am not sure if scanning some more page table entries is really that critical here.
>>
>> Someone asked me to preserve the legacy behavior (PMD only). Although
>> rather trivial, if you set max_ptes_none=0 for example, we'd still
>> have to do 511 iterations for no reason if PMD collapse is the only
>> enabled order rather than bailing immediately.
>>
>> I'm ok with dropping it, but I think its the correct approach (despite
>> the extra complexity). @Usama Arif brought up this point here
>> https://lore.kernel.org/all/f8f7bb71-ca31-46ee-a62d-7ddfd83e0ead@xxxxxxxxx/
>
> We talk about regressions, but I am not sure if we care about scanning speed
> within a page table that much?
>
> After all, we locked it and already read some entries.
>
> Having the same check at two places to optimize for PMD order might right now
> feel like a good optimization, but likely an irrelevant one in a near future?
>
> Anyhow, won't push back, as long as we document why we are special casing things
> here.
>