Re: [PATCH v3 26/30] maple_tree: Use maple copy node for mas_wr_split()
From: D, Suneeth
Date: Fri May 08 2026 - 04:42:47 EST
Hi Liam Howlett,
On 1/31/2026 2:29 AM, Liam R. Howlett wrote:
Instead of using the maple big node, use the maple copy node for reduced
stack usage and aligning with mas_wr_rebalance() and
mas_wr_spanning_store().
Splitting a node is similar to rebalancing, but a new evaluation of when
to ascend is needed. The only other difference is that the data is
pushed and never rebalanced at each level.
The testing must also align with the changes to this commit to ensure
the test suite continues to pass.
We run will-it-scale micro-benchmark as part of our weekly CI for Kernel
Performance Regression testing between a stable vs rc kernel. We
observed will-it-scale-thread-brk1 variant was regressing with
~9% on an AMD's Turin machine between the kernels v7.0 and
v7.1-rc1. Bisecting further landed me onto this commit
280b792cac62ddadca2935766ca870b438c86323 (maple_tree: Use maple copy node for mas_wr_split()) as the first bad
commit. The following were the machine's configuration and test
parameters used:-
Model name: AMD EPYC 64-Core Processor [Turin]
Thread(s) per core: 2
Core(s) per socket: 64
Socket(s): 2
Total online memory: 258G
Test params:
------------
nr_task: [1 8 64 128 192 256]
mode: thread
test: brk1
kpi: per_thread_ops
cpufreq_governor: performance
The following are the stats after bisection:-
(the KPI used here is per_thread_ops)
v7.0 (baseline) %diff per_process_ops kernel_rc_ver
--------------- ----- --------------- -------------
353091 -9 321987 v7.1-rc1
353091 -7 328897 v7.0-rc5-280b792cac62(culprit)
353091 -1 347884 v7.0-rc5-11e7f22f5e85(culpritm1)
jFYI a very high level call trace from running will-it-scale-thread-brk1 which ends up in mas_wr_split goes like this,
do_brk_flags() {
may_expand_vm();
vma_merge_new_range() {
vma_expand() {
commit_merge() {
vma_iter_store_overwrite(){
mas_store_prealloc(){
mas_wr_store_entry(){
mas_wr_split(); <--- Function of interest from this patch
} /* mas_wr_store_entry */
} /* mas_store_prealloc */
} /* vma_iter_store_overwrite */
} /* commit_merge */
} /* vma_expand */
} /* do_brk_flags */
Recreation steps:
-----------------
1) git clone https://github.com/antonblanchard/will-it-scale.git
2) git clone https://github.com/intel/lkp-tests.git
3) cd will-it-scale && git apply
lkp-tests/programs/will-it-scale/pkg/will-it-scale.patch
4) make
5) python3 ./runtest.py brk1 25 thread 0 0 1 8 64 128 192 256
NOTE: [5] is specific to machine's architecture. starting from 1 is the
array of no.of tasks that you'd wish to run the testcase which here is
no.cores per CCX, per NUMA node/ per Socket, nr_threads.
Would be happy to help with further testing and providing additional
data if required.
Thanks & Regards,
Suneeth D
Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
---
lib/maple_tree.c | 99 ++++++++++++++++++++++++++++++--
lib/test_maple_tree.c | 55 ++++++++++++++----
tools/testing/radix-tree/maple.c | 11 ++++
3 files changed, 149 insertions(+), 16 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index f04989f8a115e..5813ad17ea6fe 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4542,19 +4542,106 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas,
trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry);
}
+/*
+ * split_ascend() - See if a split operation has to keep walking up the tree
+ * @cp: The maple_copy node
+ * @wr_mas: The maple write state
+ * @sib: the maple state of the sibling
+ *
+ * Return: true if another split operation on the next level is needed, false
+ * otherwise
+ */
+static inline bool split_ascend(struct maple_copy *cp,
+ struct ma_wr_state *wr_mas, struct ma_state *sib,
+ struct ma_state *parent)
+{
+ struct ma_state *mas;
+ unsigned long min, max;
+
+ mas = wr_mas->mas;
+ min = mas->min; /* push right, or normal split */
+ max = mas->max;
+ wr_mas->offset_end = parent->offset;
+ if (sib->end) {
+ if (sib->max < mas->min) {
+ min = sib->min; /* push left */
+ parent->offset--;
+ } else {
+ max = sib->max; /* push right */
+ wr_mas->offset_end++;
+ }
+ }
+
+ cp_dst_to_slots(cp, min, max, mas);
+ if (cp_is_new_root(cp, mas))
+ return false;
+
+ if (cp_converged(cp, mas, sib))
+ return false;
+
+ cp->height++;
+ copy_tree_location(parent, mas);
+ wr_mas_setup(wr_mas, mas);
+ return true;
+}
+
+/*
+ * split_data() - Calculate the @cp data, populate @sib if the data can be
+ * pushed into a sibling.
+ * @cp: The maple copy node
+ * @wr_mas: The left write maple state
+ * @sib: The maple state of the sibling.
+ *
+ * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to
+ * indicate it will not be used.
+ *
+ */
+static inline void split_data(struct maple_copy *cp,
+ struct ma_wr_state *wr_mas, struct ma_state *sib,
+ struct ma_state *parent)
+{
+ cp_data_calc(cp, wr_mas, wr_mas);
+ if (cp->data <= mt_slots[wr_mas->type]) {
+ sib->end = 0;
+ return;
+ }
+
+ push_data_sib(cp, wr_mas->mas, sib, parent);
+ if (sib->end)
+ cp->data += sib->end + 1;
+}
+
/*
* mas_wr_split() - Expand one node into two
* @wr_mas: The write maple state
*/
-static noinline_for_kasan void mas_wr_split(struct ma_wr_state *wr_mas)
+static void mas_wr_split(struct ma_wr_state *wr_mas)
{
- struct maple_big_node b_node;
+ struct maple_enode *old_enode;
+ struct ma_state parent;
+ struct ma_state *mas;
+ struct maple_copy cp;
+ struct ma_state sib;
+ mas = wr_mas->mas;
trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry);
- memset(&b_node, 0, sizeof(struct maple_big_node));
- mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
- WARN_ON_ONCE(wr_mas->mas->store_type != wr_split_store);
- return mas_split(wr_mas->mas, &b_node);
+ parent = *mas;
+ cp_leaf_init(&cp, mas, wr_mas, wr_mas);
+ do {
+ if (!mte_is_root(parent.node)) {
+ mas_ascend(&parent);
+ parent.end = mas_data_end(&parent);
+ }
+ split_data(&cp, wr_mas, &sib, &parent);
+ multi_src_setup(&cp, wr_mas, wr_mas, &sib);
+ dst_setup(&cp, mas, wr_mas->type);
+ cp_data_write(&cp, mas);
+ } while (split_ascend(&cp, wr_mas, &sib, &parent));
+
+ old_enode = mas->node;
+ mas->node = mt_slot_locked(mas->tree, cp.slot, 0);
+ mas_wmb_replace(mas, old_enode, cp.height);
+ mtree_range_walk(mas);
}
/*
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index a182e48b5f5e6..434d8a2fdd99c 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -1024,6 +1024,7 @@ static noinline void __init check_ranges(struct maple_tree *mt)
mt_set_non_kernel(10);
check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0);
MT_BUG_ON(mt, !mt_height(mt));
+ mt_validate(mt);
mtree_destroy(mt);
/* Create tree of 1-200 */
@@ -1031,11 +1032,13 @@ static noinline void __init check_ranges(struct maple_tree *mt)
/* Store 45-168 */
check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0);
MT_BUG_ON(mt, !mt_height(mt));
+ mt_validate(mt);
mtree_destroy(mt);
check_seq(mt, 30, false);
check_store_range(mt, 6, 18, xa_mk_value(6), 0);
MT_BUG_ON(mt, !mt_height(mt));
+ mt_validate(mt);
mtree_destroy(mt);
/* Overwrite across multiple levels. */
@@ -1061,6 +1064,7 @@ static noinline void __init check_ranges(struct maple_tree *mt)
check_load(mt, r[13] + 1, xa_mk_value(r[13] + 1));
check_load(mt, 135, NULL);
check_load(mt, 140, NULL);
+ mt_validate(mt);
mt_set_non_kernel(0);
MT_BUG_ON(mt, !mt_height(mt));
mtree_destroy(mt);
@@ -1285,14 +1289,20 @@ static noinline void __init check_ranges(struct maple_tree *mt)
MT_BUG_ON(mt, mt_height(mt) >= 4);
}
/* Cause a 3 child split all the way up the tree. */
- for (i = 5; i < 215; i += 10)
+ for (i = 5; i < 215; i += 10) {
check_store_range(mt, 11450 + i, 11450 + i + 1, NULL, 0);
- for (i = 5; i < 65; i += 10)
+ mt_validate(mt);
+ }
+ for (i = 5; i < 65; i += 10) {
check_store_range(mt, 11770 + i, 11770 + i + 1, NULL, 0);
+ mt_validate(mt);
+ }
MT_BUG_ON(mt, mt_height(mt) >= 4);
- for (i = 5; i < 45; i += 10)
+ for (i = 5; i < 45; i += 10) {
check_store_range(mt, 11700 + i, 11700 + i + 1, NULL, 0);
+ mt_validate(mt);
+ }
if (!MAPLE_32BIT)
MT_BUG_ON(mt, mt_height(mt) < 4);
mtree_destroy(mt);
@@ -1304,17 +1314,42 @@ static noinline void __init check_ranges(struct maple_tree *mt)
val2 = (i+1)*10;
check_store_range(mt, val, val2, xa_mk_value(val), 0);
MT_BUG_ON(mt, mt_height(mt) >= 4);
+ mt_validate(mt);
+ }
+ /* Fill parents and leaves before split. */
+ val = 7660;
+ for (i = 5; i < 490; i += 5) {
+ val += 5;
+ check_store_range(mt, val, val + 1, NULL, 0);
+ mt_validate(mt);
+ MT_BUG_ON(mt, mt_height(mt) >= 4);
}
+
+ val = 9460;
/* Fill parents and leaves before split. */
- for (i = 5; i < 455; i += 10)
- check_store_range(mt, 7800 + i, 7800 + i + 1, NULL, 0);
+ for (i = 1; i < 10; i++) {
+ val++;
+ check_store_range(mt, val, val + 1, xa_mk_value(val), 0);
+ mt_validate(mt);
+ }
- for (i = 1; i < 16; i++)
- check_store_range(mt, 8185 + i, 8185 + i + 1,
- xa_mk_value(8185+i), 0);
- MT_BUG_ON(mt, mt_height(mt) >= 4);
+ val = 8000;
+ for (i = 1; i < 14; i++) {
+ val++;
+ check_store_range(mt, val, val + 1, xa_mk_value(val), 0);
+ mt_validate(mt);
+ }
+
+
+ check_store_range(mt, 8051, 8051, xa_mk_value(8081), 0);
+ check_store_range(mt, 8052, 8052, xa_mk_value(8082), 0);
+ check_store_range(mt, 8083, 8083, xa_mk_value(8083), 0);
+ check_store_range(mt, 8084, 8084, xa_mk_value(8084), 0);
+ check_store_range(mt, 8085, 8085, xa_mk_value(8085), 0);
/* triple split across multiple levels. */
- check_store_range(mt, 8184, 8184, xa_mk_value(8184), 0);
+ check_store_range(mt, 8099, 8100, xa_mk_value(1), 0);
+
+ mt_validate(mt);
if (!MAPLE_32BIT)
MT_BUG_ON(mt, mt_height(mt) != 4);
}
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 5ea45d67556a8..feedd5ab7058f 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -35406,7 +35406,18 @@ static noinline void __init check_spanning_write(struct maple_tree *mt)
mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
for (i = 0; i <= max; i++)
mtree_test_store_range(mt, i * 10, i * 10 + 5, &i);
+
mtree_lock(mt);
+ if (MAPLE_32BIT) {
+ i = 47811;
+ do {
+ mas_set(&mas, i);
+ mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL);
+ i++;
+ mas_ascend(&mas);
+ } while (mas_data_end(&mas) < mt_slot_count(mas.node) - 1);
+ }
+
mas_set(&mas, 47606);
mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL);
mas_set(&mas, 47607);