Re: [REGRESSION][BISECTED] Performance Regression in IOMMU/VT-d Since Kernel 6.10
From: Baolu Lu
Date: Wed Jul 02 2025 - 05:01:05 EST
On 7/2/2025 1:14 PM, Baolu Lu wrote:
On 7/2/25 01:11, Ioanna Alifieraki wrote:
#regzbot introduced: 129dab6e1286
Hello everyone,
We've identified a performance regression that starts with linux
kernel 6.10 and persists through 6.16(tested at commit e540341508ce).
Bisection pointed to commit:
129dab6e1286 ("iommu/vt-d: Use cache_tag_flush_range_np() in
iotlb_sync_map").
The issue occurs when running fio against two NVMe devices located
under the same PCIe bridge (dual-port NVMe configuration). Performance
drops compared to configurations where the devices are on different
bridges.
Observed Performance:
- Before the commit: ~6150 MiB/s, regardless of NVMe device placement.
- After the commit:
-- Same PCIe bridge: ~4985 MiB/s
-- Different PCIe bridges: ~6150 MiB/s
Currently we can only reproduce the issue on a Z3 metal instance on
gcp. I suspect the issue can be reproducible if you have a dual port
nvme on any machine.
At [1] there's a more detailed description of the issue and details
on the reproducer.
This test was running on bare metal hardware instead of any
virtualization guest, right? If that's the case,
cache_tag_flush_range_np() is almost a no-op.
Can you please show me the capability register of the IOMMU by:
#cat /sys/bus/pci/devices/[pci_dev_name]/iommu/intel-iommu/cap
Also, can you please try whether the below changes make any difference?
I've also attached a patch file to this email so you can apply the
change more easily.
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7aa3932251b2..f60201ee4be0 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1796,6 +1796,18 @@ static int domain_setup_first_level(struct
intel_iommu *iommu,
(pgd_t *)pgd, flags, old);
}
+static bool domain_need_iotlb_sync_map(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
+ return true;
+
+ if (rwbf_quirk || cap_rwbf(iommu->cap))
+ return true;
+
+ return false;
+}
+
static int dmar_domain_attach_device(struct dmar_domain *domain,
struct device *dev)
{
@@ -1833,6 +1845,8 @@ static int dmar_domain_attach_device(struct
dmar_domain *domain,
if (ret)
goto out_block_translation;
+ domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu);
+
return 0;
out_block_translation:
@@ -3945,7 +3959,10 @@ static bool risky_device(struct pci_dev *pdev)
static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
- cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ if (dmar_domain->iotlb_sync_map)
+ cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
return 0;
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3ddbcc603de2..7ab2c34a5ecc 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -614,6 +614,9 @@ struct dmar_domain {
u8 has_mappings:1; /* Has mappings configured through
* iommu_map() interface.
*/
+ u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
+ * buffer when creating mappings.
+ */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
--
2.43.0
Thanks,
baoluFrom ddc4210a3365147df978bd0bf45d824b9c869877 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
Date: Wed, 2 Jul 2025 16:51:48 +0800
Subject: [PATCH 1/1] iommu/vt-d: Avoid unnecessary cache_tag_flush_range_np()
For test purpose only!
Signed-off-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
---
drivers/iommu/intel/iommu.c | 19 ++++++++++++++++++-
drivers/iommu/intel/iommu.h | 3 +++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7aa3932251b2..f60201ee4be0 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1796,6 +1796,18 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
(pgd_t *)pgd, flags, old);
}
+static bool domain_need_iotlb_sync_map(struct dmar_domain *domain,
+ struct intel_iommu *iommu)
+{
+ if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
+ return true;
+
+ if (rwbf_quirk || cap_rwbf(iommu->cap))
+ return true;
+
+ return false;
+}
+
static int dmar_domain_attach_device(struct dmar_domain *domain,
struct device *dev)
{
@@ -1833,6 +1845,8 @@ static int dmar_domain_attach_device(struct dmar_domain *domain,
if (ret)
goto out_block_translation;
+ domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu);
+
return 0;
out_block_translation:
@@ -3945,7 +3959,10 @@ static bool risky_device(struct pci_dev *pdev)
static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
- cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ if (dmar_domain->iotlb_sync_map)
+ cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
return 0;
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3ddbcc603de2..7ab2c34a5ecc 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -614,6 +614,9 @@ struct dmar_domain {
u8 has_mappings:1; /* Has mappings configured through
* iommu_map() interface.
*/
+ u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
+ * buffer when creating mappings.
+ */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
--
2.43.0