Re: [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space
From: Changwoo Min
Date: Wed Apr 29 2026 - 08:47:40 EST
On 4/29/26 5:35 AM, Tejun Heo wrote:
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is far above what any non-pathological contention needs.
+ * Exhausting it means the bit update was lost, which corrupts the caller's view
+ * of the bitmap, so raise scx_bpf_error() to abort the scheduler.
+ */
+#define CMASK_CAS_TRIES 1024
+
+static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return true;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
Exiting a BPF scheduler when CAS retries fail seems too brutal, while it
is extremely rare to happen. What about adding a kfunc for the slow path
that runs only when the CAS retry fails? For example,
scx_bpf_cmask_test_and_clear( ) does the same thing as
cmask_test_and_clear(), but it never fails. cmask_test_and_clear() calls
scx_bpf_cmask_test_and_clear( ) only when the CAS retry fails. In this
way, we can use the fast path in the BPF implementation while ensuring
the operation succeeds.
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
+ * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 base = m->base / 64;
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = (base + wi) * 64 + __builtin_ctzll(word);
Some compiler versions (e.g., clang-18 or older) don’t support
__builtin_ctzll(). To handle this gracefully, there is already a
wrapper, ctzll(), in common.bpf.h. So, I suggest using ctzll()
for compatibility.
Reviewed-by: Changwoo Min <changwoo@xxxxxxxxxx>