Re: [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB

From: mkf

Date: Tue Apr 14 2026 - 11:39:55 EST

On Tue, 2026-04-14 at 18:57 +0800, Jiayuan Chen wrote:
> A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
> to inject custom TCP header options. When the kernel builds a TCP packet,
> it calls tcp_established_options() to calculate the header size, which
> invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
> callback.
>
> If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
> __tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
> tcp_current_mss(), which calls tcp_established_options() again,
> re-triggering the same BPF callback. This creates an infinite recursion
> that exhausts the kernel stack and causes a panic.
>
> BPF_SOCK_OPS_HDR_OPT_LEN_CB
> -> bpf_setsockopt(TCP_NODELAY)
> -> tcp_push_pending_frames()
> -> tcp_current_mss()
> -> tcp_established_options()
> -> bpf_skops_hdr_opt_len()
>                            /* infinite recursion */
> -> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>
> A similar reentrancy issue exists for TCP congestion control, which is
> guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
> tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
> bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
> bpf_setsockopt(TCP_NODELAY) calls that would trigger
> tcp_push_pending_frames() and cause the recursion.
>
> Reported-by: Quan Sun <2022090917019@xxxxxxxxxxxxxxxx>
> Reported-by: Yinhao Hu <dddddd@xxxxxxxxxxx>
> Reported-by: Kaiyan Mei <M202472210@xxxxxxxxxxx>
> Reported-by: Dongliang Mu <dzm91@xxxxxxxxxxx>
> Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@xxxxxxxxxxxxxxxx/
> Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
> Signed-off-by: Jiayuan Chen <jiayuan.chen@xxxxxxxxx>
> ---
> Documentation/networking/net_cachelines/tcp_sock.rst | 1 +
> include/linux/tcp.h                                  | 11 ++++++++++-
> net/core/filter.c                                    | 4 ++++
> net/ipv4/tcp_minisocks.c                             | 1 +
> net/ipv4/tcp_output.c                                | 3 +++
> 5 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst
> b/Documentation/networking/net_cachelines/tcp_sock.rst
> index 563daea10d6c..07d3226d90cc 100644
> --- a/Documentation/networking/net_cachelines/tcp_sock.rst
> +++ b/Documentation/networking/net_cachelines/tcp_sock.rst
> @@ -152,6 +152,7 @@ unsigned_int                  keepalive_intvl
> int                           linger2
> u8                            bpf_sock_ops_cb_flags
> u8:1                          bpf_chg_cc_inprogress
> +u8:1                          bpf_hdr_opt_len_cb_inprogress
> u16                           timeout_rehash
> u32                           rcv_ooopack
> u32                           rcv_rtt_last_tsecr
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index f72eef31fa23..2bfb73cf922e 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -475,12 +475,21 @@ struct tcp_sock {
> u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
> * values defined in uapi/linux/tcp.h
> */
> - u8 bpf_chg_cc_inprogress:1; /* In the middle of
> + u8 bpf_chg_cc_inprogress:1, /* In the middle of
> * bpf_setsockopt(TCP_CONGESTION),
> * it is to avoid the bpf_tcp_cc->init()
> * to recur itself by calling
> * bpf_setsockopt(TCP_CONGESTION, "itself").
> */
> + bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
> + * callback so that a nested
> + * bpf_setsockopt(TCP_NODELAY) or
> + * bpf_setsockopt(TCP_CORK) cannot
> + * trigger tcp_push_pending_frames(),
> + * which would call tcp_current_mss()
> + * -> bpf_skops_hdr_opt_len(), causing
> + * infinite recursion.
> + */
> #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
> #else
> #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 78b548158fb0..518699429a7a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
> if (sk->sk_protocol != IPPROTO_TCP)
> return -EINVAL;
>
> + if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
> +     tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
> + return -EBUSY;
> +
TCP_CORK is not support in sol_tcp_sockopt(), return -EINVAL by default. and put the check here
could also prevent us from calling getsockopt(TCP_NODELAY) below.

> switch (optname) {
> case TCP_NODELAY:
> case TCP_MAXSEG:
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index dafb63b923d0..fb06c464ac16 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
> RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
>
> newtp->bpf_chg_cc_inprogress = 0;
> + newtp->bpf_hdr_opt_len_cb_inprogress = 0;
> tcp_bpf_clone(sk, newsk);
>
> __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 326b58ff1118..c9654e690e1a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
> unsigned int *remaining)
> {
> struct bpf_sock_ops_kern sock_ops;
> + struct tcp_sock *tp = tcp_sk(sk);
> int err;
>
> if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
> @@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
> if (skb)
> bpf_skops_init_skb(&sock_ops, skb, 0);
>
> + tp->bpf_hdr_opt_len_cb_inprogress = 1;
we check the BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG before calling BPF_CGROUP_RUN_PROG_SOCK_OPS_SK,
could this flag use for the same purpose? so we don't need to add an extra field.

if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
!*remaining)
return;
> err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
> + tp->bpf_hdr_opt_len_cb_inprogress = 0;
>
> if (err || sock_ops.remaining_opt_len == *remaining)
> return;

--
Thanks,
KaFai