Re: [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB

From: mkf

Date: Tue Apr 14 2026 - 11:39:55 EST


On Tue, 2026-04-14 at 18:57 +0800, Jiayuan Chen wrote:
> A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
> to inject custom TCP header options. When the kernel builds a TCP packet,
> it calls tcp_established_options() to calculate the header size, which
> invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
> callback.
>
> If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
> __tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
> tcp_current_mss(), which calls tcp_established_options() again,
> re-triggering the same BPF callback. This creates an infinite recursion
> that exhausts the kernel stack and causes a panic.
>
> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>   -> bpf_setsockopt(TCP_NODELAY)
>     -> tcp_push_pending_frames()
>       -> tcp_current_mss()
>         -> tcp_established_options()
>           -> bpf_skops_hdr_opt_len()
>             /* infinite recursion */
>             -> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>
> A similar reentrancy issue exists for TCP congestion control, which is
> guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
> tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
> bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
> bpf_setsockopt(TCP_NODELAY) calls that would trigger
> tcp_push_pending_frames() and cause the recursion.
>
> Reported-by: Quan Sun <2022090917019@xxxxxxxxxxxxxxxx>
> Reported-by: Yinhao Hu <dddddd@xxxxxxxxxxx>
> Reported-by: Kaiyan Mei <M202472210@xxxxxxxxxxx>
> Reported-by: Dongliang Mu <dzm91@xxxxxxxxxxx>
> Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@xxxxxxxxxxxxxxxx/
> Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
> Signed-off-by: Jiayuan Chen <jiayuan.chen@xxxxxxxxx>
> ---
>  Documentation/networking/net_cachelines/tcp_sock.rst |  1 +
>  include/linux/tcp.h                                  | 11 ++++++++++-
>  net/core/filter.c                                    |  4 ++++
>  net/ipv4/tcp_minisocks.c                             |  1 +
>  net/ipv4/tcp_output.c                                |  3 +++
>  5 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst
> b/Documentation/networking/net_cachelines/tcp_sock.rst
> index 563daea10d6c..07d3226d90cc 100644
> --- a/Documentation/networking/net_cachelines/tcp_sock.rst
> +++ b/Documentation/networking/net_cachelines/tcp_sock.rst
> @@ -152,6 +152,7 @@ unsigned_int                  keepalive_intvl
>  int                           linger2
>  u8                            bpf_sock_ops_cb_flags
>  u8:1                          bpf_chg_cc_inprogress
> +u8:1                          bpf_hdr_opt_len_cb_inprogress
>  u16                           timeout_rehash
>  u32                           rcv_ooopack
>  u32                           rcv_rtt_last_tsecr
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index f72eef31fa23..2bfb73cf922e 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -475,12 +475,21 @@ struct tcp_sock {
>   u8 bpf_sock_ops_cb_flags;  /* Control calling BPF programs
>   * values defined in uapi/linux/tcp.h
>   */
> - u8 bpf_chg_cc_inprogress:1; /* In the middle of
> + u8 bpf_chg_cc_inprogress:1, /* In the middle of
>     * bpf_setsockopt(TCP_CONGESTION),
>     * it is to avoid the bpf_tcp_cc->init()
>     * to recur itself by calling
>     * bpf_setsockopt(TCP_CONGESTION, "itself").
>     */
> + bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
> +   * callback so that a nested
> +   * bpf_setsockopt(TCP_NODELAY) or
> +   * bpf_setsockopt(TCP_CORK) cannot
> +   * trigger tcp_push_pending_frames(),
> +   * which would call tcp_current_mss()
> +   * -> bpf_skops_hdr_opt_len(), causing
> +   * infinite recursion.
> +   */
>  #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
>  #else
>  #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 78b548158fb0..518699429a7a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
>   if (sk->sk_protocol != IPPROTO_TCP)
>   return -EINVAL;
>  
> + if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
> +     tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
> + return -EBUSY;
> +
TCP_CORK is not supported in sol_tcp_sockopt(); it returns -EINVAL by default.
Also, putting the check here would prevent us from calling
getsockopt(TCP_NODELAY) below.

>   switch (optname) {
>   case TCP_NODELAY:
>   case TCP_MAXSEG:
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index dafb63b923d0..fb06c464ac16 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>   RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
>  
>   newtp->bpf_chg_cc_inprogress = 0;
> + newtp->bpf_hdr_opt_len_cb_inprogress = 0;
>   tcp_bpf_clone(sk, newsk);
>  
>   __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 326b58ff1118..c9654e690e1a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
>     unsigned int *remaining)
>  {
>   struct bpf_sock_ops_kern sock_ops;
> + struct tcp_sock *tp = tcp_sk(sk);
>   int err;
>  
>   if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
> @@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
>   if (skb)
>   bpf_skops_init_skb(&sock_ops, skb, 0);
>  
> + tp->bpf_hdr_opt_len_cb_inprogress = 1;
We check BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG before calling
BPF_CGROUP_RUN_PROG_SOCK_OPS_SK. Could this flag be used for the same purpose,
so that we don't need to add an extra field?

if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
!*remaining)
return;
>   err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
> + tp->bpf_hdr_opt_len_cb_inprogress = 0;
>  
>   if (err || sock_ops.remaining_opt_len == *remaining)
>   return;

--
Thanks,
KaFai