[PATCH net-next 1/6] net: devmem: support TX through netkit leased queues
From: Bobby Eshleman
Date: Mon Mar 16 2026 - 18:31:54 EST
From: Bobby Eshleman <bobbyeshleman@xxxxxxxx>
When a netkit virtual device leases queues from a physical NIC, devmem
TX bindings created on the netkit device should use the physical NIC
for DMA operations rather than the virtual device, which has no DMA
capability.
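For reference, userspace drives this through the existing devmem TX uAPI
(Documentation/networking/devmem.rst): bind the dmabuf with the bind-tx
netlink op against the netkit ifindex, then pass the returned binding id
via cmsg on sendmsg(). Abridged sketch; error handling and the payload
iov are omitted (with devmem TX, iov_base values are interpreted as
offsets into the bound dmabuf):

  #include <linux/types.h>
  #include <sys/socket.h>

  static void send_devmem(int sock_fd, __u32 tx_dmabuf_id)
  {
          char ctrl[CMSG_SPACE(sizeof(__u32))] = {};
          struct msghdr msg = {
                  .msg_control = ctrl,
                  .msg_controllen = sizeof(ctrl),
          };
          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

          cmsg->cmsg_level = SOL_SOCKET;
          cmsg->cmsg_type = SCM_DEVMEM_DMABUF;
          cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
          /* id from the bind-tx reply issued on the netkit device */
          *(__u32 *)CMSG_DATA(cmsg) = tx_dmabuf_id;

          sendmsg(sock_fd, &msg, MSG_ZEROCOPY);
  }

With this patch, that flow works unchanged on a netkit device; only the
DMA mapping target moves to the physical NIC underneath.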
In netdev_nl_bind_tx_doit(), walk the virtual device's leased RX queues
to discover the underlying physical device, and require that it supports
netmem TX. Use that device for the DMA device lookup and pass it as the
real_tx_dev in the binding. When real_tx_dev is set, it is also used for
NUMA-local allocations.
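The discovery walk is open-coded in netdev_nl_bind_tx_doit() (see the
netdev-genl.c hunk below); as a standalone helper it would read roughly
as follows (helper name illustrative only):

  static struct net_device *find_lease_tx_dev(struct net_device *dev)
  {
          struct netdev_rx_queue *lease_rxq;
          int i;

          for (i = 0; i < dev->real_num_rx_queues; i++) {
                  lease_rxq = READ_ONCE(__netif_get_rx_queue(dev, i)->lease);
                  if (!lease_rxq)
                          continue;
                  /* physical device backing the leased queue */
                  return lease_rxq->dev;
          }

          return NULL;
  }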
Extend validate_xmit_unreadable_skb() to support the netkit case, where
the skb is validated twice: once on the netkit guest device and again on
the physical NIC after a BPF redirect or IP forwarding. Both invocations
must pass for the skb to be transmitted.
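The resulting accept/drop logic reduces to the following (helper form
for illustration only; the patch open-codes it in
validate_xmit_unreadable_skb()):

  static bool devmem_tx_dev_ok(struct net_devmem_dmabuf_binding *binding,
                               struct net_device *dev)
  {
          struct net_device *real_tx_dev = READ_ONCE(binding->real_tx_dev);

          if (real_tx_dev)
                  /* netkit: pass on the guest dev or the physical dev */
                  return real_tx_dev->netmem_tx &&
                         (READ_ONCE(binding->dev) == dev ||
                          real_tx_dev == dev);

          /* no lease: binding dev must be the xmit dev with netmem TX */
          return READ_ONCE(binding->dev) == dev && dev->netmem_tx;
  }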
Signed-off-by: Bobby Eshleman <bobbyeshleman@xxxxxxxx>
---
net/core/dev.c | 26 +++++++++++++++++++-------
net/core/devmem.c | 16 ++++++++++------
net/core/devmem.h | 6 ++++--
net/core/netdev-genl.c | 38 +++++++++++++++++++++++++++++++++-----
4 files changed, 66 insertions(+), 20 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index ca4b26dfb1bd..105bd27be024 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3981,24 +3981,36 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
struct net_device *dev)
{
+ struct net_devmem_dmabuf_binding *binding;
struct skb_shared_info *shinfo;
+ struct net_device *real_tx_dev;
struct net_iov *niov;
if (likely(skb_frags_readable(skb)))
goto out;
- if (!dev->netmem_tx)
- goto out_free;
-
shinfo = skb_shinfo(skb);
+ if (shinfo->nr_frags == 0)
+ goto out;
- if (shinfo->nr_frags > 0) {
- niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
- if (net_is_devmem_iov(niov) &&
- READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev)
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (!net_is_devmem_iov(niov))
+ goto out;
+
+ binding = net_devmem_iov_binding(niov);
+ real_tx_dev = READ_ONCE(binding->real_tx_dev);
+
+ if (real_tx_dev) {
+ if (!real_tx_dev->netmem_tx)
+ goto out_free;
+ if (READ_ONCE(binding->dev) != dev && real_tx_dev != dev)
goto out_free;
+ goto out;
}
+ if (READ_ONCE(binding->dev) != dev || !dev->netmem_tx)
+ goto out_free;
+
out:
return skb;
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 7ede81509968..a4148cba5b5f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -181,12 +181,13 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
}
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
{
+ struct net_device *node_dev = real_tx_dev ?: dev;
struct net_devmem_dmabuf_binding *binding;
static u32 id_alloc_next;
struct scatterlist *sg;
@@ -205,13 +206,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
return ERR_CAST(dmabuf);
binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
- dev_to_node(&dev->dev));
+ dev_to_node(&node_dev->dev));
if (!binding) {
err = -ENOMEM;
goto err_put_dmabuf;
}
binding->dev = dev;
+ binding->real_tx_dev = real_tx_dev;
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
err = percpu_ref_init(&binding->ref,
@@ -254,7 +256,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
* allocate MTU sized chunks here. Leave that for future work...
*/
binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
- dev_to_node(&dev->dev));
+ dev_to_node(&node_dev->dev));
if (!binding->chunk_pool) {
err = -ENOMEM;
goto err_tx_vec;
@@ -268,7 +270,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
struct net_iov *niov;
owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
- dev_to_node(&dev->dev));
+ dev_to_node(&node_dev->dev));
if (!owner) {
err = -ENOMEM;
goto err_free_chunks;
@@ -280,7 +282,8 @@ net_devmem_bind_dmabuf(struct net_device *dev,
owner->binding = binding;
err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
- dma_addr, len, dev_to_node(&dev->dev),
+ dma_addr, len,
+ dev_to_node(&node_dev->dev),
owner);
if (err) {
kfree(owner);
@@ -397,7 +400,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
*/
dst_dev = dst_dev_rcu(dst);
if (unlikely(!dst_dev) ||
- unlikely(dst_dev != READ_ONCE(binding->dev))) {
+ unlikely(dst_dev != READ_ONCE(binding->dev) &&
+ dst_dev != READ_ONCE(binding->real_tx_dev))) {
err = -ENODEV;
goto out_unlock;
}
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 1c5c18581fcb..ffcf97a33633 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding {
struct dma_buf_attachment *attachment;
struct sg_table *sgt;
struct net_device *dev;
+ /* Phys dev behind a virtual dev (e.g. netkit) with a queue lease. */
+ struct net_device *real_tx_dev;
struct gen_pool *chunk_pool;
/* Protect dev */
struct mutex lock;
@@ -84,7 +86,7 @@ struct dmabuf_genpool_chunk_owner {
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
@@ -165,7 +167,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
}
static inline struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 7d073894ca74..2b34924dc30f 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1037,7 +1037,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_rxq_bitmap;
}
- binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+ binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE,
dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
@@ -1082,6 +1082,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
{
struct net_devmem_dmabuf_binding *binding;
+ struct net_device *real_tx_dev = NULL;
+ struct netdev_rx_queue *lease_rxq;
struct netdev_nl_sock *priv;
struct net_device *netdev;
struct device *dma_dev;
@@ -1089,6 +1091,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *rsp;
int err = 0;
void *hdr;
+ int i;
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
@@ -1124,16 +1127,41 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
- if (!netdev->netmem_tx) {
+ for (i = 0; i < netdev->real_num_rx_queues; i++) {
+ lease_rxq = READ_ONCE(__netif_get_rx_queue(netdev, i)->lease);
+
+ if (!lease_rxq)
+ continue;
+
+ real_tx_dev = lease_rxq->dev;
+ break;
+ }
+
+ if (real_tx_dev) {
+ if (!netif_device_present(real_tx_dev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!real_tx_dev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver for queue lease device does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+ }
+
+ if (!real_tx_dev && !netdev->netmem_tx) {
err = -EOPNOTSUPP;
NL_SET_ERR_MSG(info->extack,
"Driver does not support netmem TX");
goto err_unlock_netdev;
}
- dma_dev = netdev_queue_get_dma_dev(netdev, 0);
- binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
- dmabuf_fd, priv, info->extack);
+ dma_dev = netdev_queue_get_dma_dev(real_tx_dev ?: netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, real_tx_dev, dma_dev,
+ DMA_TO_DEVICE, dmabuf_fd, priv,
+ info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock_netdev;
--
2.52.0