Re: [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices

From: Farhan Ali

Date: Wed Jun 03 2026 - 19:37:29 EST

On 6/3/2026 3:20 PM, Alex Williamson wrote:

On Wed, 3 Jun 2026 11:24:12 -0700
Farhan Ali <alifm@xxxxxxxxxxxxx> wrote:

@@ -81,6 +52,47 @@ static bool is_driver_supported(struct pci_driver *driver)
return true;
}
+static int zpci_store_pci_error(struct pci_dev *pdev,
+ struct zpci_ccdf_err *ccdf)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+ int i;
+
+ guard(mutex)(&zdev->pending_errs_lock);
+ if (!zdev->pending_errs.mediated_recovery)
+ return -EINVAL;
+
+ if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
+ pr_err("%s: Maximum number (%d) of pending error events queued\n",
+ pci_name(pdev), ZPCI_ERR_PENDING_MAX);

Is this really an err condition or just a warn? Nothing is
fundamentally broken here, the queue is just full and we're losing
errors. Maybe this should be a warn?

Can this create a DoS if a device continues to generate errors and
nobody is consuming them? Userspace could ignore the error. This
should probably be _ratelimited.

Architecturally once a device is an error state, then it doesn't generate any further error events. It normally will have only one event pending, we used a value of 4 for ZPCI_ERR_PENDING_MAX just to be cautious. Though it could be improved to be ratelimited.

pr_err + pci_name suggests this should be a pci_ or dev_ call and since
the pci variant doesn't exist, use dev_warn_ratelimited().

+ return -ENOMEM;
+ }
+
+ i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
+ memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
+ zdev->pending_errs.tail++;
+ zdev->pending_errs.count++;
+ return 0;
+}
+
+void zpci_start_mediated_recovery(struct zpci_dev *zdev)
+{
+ guard(mutex)(&zdev->pending_errs_lock);
+ zdev->pending_errs.mediated_recovery = true;
+}
+EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
+
+void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
+{
+ guard(mutex)(&zdev->pending_errs_lock);
+ zdev->pending_errs.mediated_recovery = false;
+ if (zdev->pending_errs.count)
+ pr_info("Unhandled PCI error events count=%d for PCI function 0x%x\n",
+ zdev->pending_errs.count, zdev->fid);

It seems like there's always a race that an error could occur as the
user is closing the device. Is this really worth logging at anything
more than a dbg level, pci_dbg() in this case?

I wanted to avoid pci_* log messages as it would require getting a handle of struct pci_dev and there were some concerns indicated by sashiko as mentioned in the thread

https://lore.kernel.org/all/c6363f08f898c7f4bb3e291ab2d3f7d8e3280459.camel@xxxxxxxxxxxxx/

+ memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
+}
+EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
+
static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
struct pci_driver *driver)
{
@@ -175,12 +187,15 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
* and the platform determines which functions are affected for
* multi-function devices.
*/
-static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
+static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
+ struct zpci_ccdf_err *ccdf)
{
pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
struct zpci_dev *zdev = to_zpci(pdev);
+ bool mediated_recovery = false;
char *status_str = "success";
struct pci_driver *driver;
+ int rc;
/*
* Ensure that the PCI function is not removed concurrently, no driver
@@ -194,13 +209,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
}
pdev->error_state = pci_channel_io_frozen;
- if (is_passed_through(pdev)) {
- pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
- pci_name(pdev));
- status_str = "failed (pass-through)";
- goto out_unlock;
- }
-
driver = to_pci_driver(pdev->dev.driver);
if (!is_driver_supported(driver)) {
if (!driver) {
@@ -216,12 +224,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
goto out_unlock;
}
+ rc = zpci_store_pci_error(pdev, ccdf);
+ if (!rc || rc == -ENOMEM)
+ mediated_recovery = true;

This is a convoluted way to get the state of
zdev->pending_errs.mediated_recovery, which becomes invalid out of
pending_errs_lock anyway.

We had some discussion around this on a previous version, to simplify the error recovery to be driven by host or userspace recovery we decided with this approach https://lore.kernel.org/all/6dd6de60d72a70ff85ecab2a9d14f45a617f05e7.camel@xxxxxxxxxxxxx/.

+
ers_res = zpci_event_notify_error_detected(pdev, driver);
if (ers_result_indicates_abort(ers_res)) {
status_str = "failed (abort on detection)";
goto out_unlock;
}
+ if (mediated_recovery) {
+ pr_info("%s: Leaving recovery of pass-through device to user-space\n",
+ pci_name(pdev));
+ ers_res = PCI_ERS_RESULT_RECOVERED;
+ status_str = "in progress";
+ goto out_unlock;
+ }

Since zdev->pending_errs.mediated_recovery is only valid while holding
pending_errs_lock, this is really no better than the
is_passed_through() test.

+
if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
ers_res = zpci_event_do_error_state_clear(pdev, driver);
if (ers_result_indicates_abort(ers_res)) {
@@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
* @pdev: PCI function for which to report
* @es: PCI channel failure state to report
*/
-static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
+static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
+ struct zpci_ccdf_err *ccdf)
{
struct pci_driver *driver;
pci_dev_lock(pdev);
pdev->error_state = es;
- /**
- * While vfio-pci's error_detected callback notifies user-space QEMU
- * reacts to this by freezing the guest. In an s390 environment PCI
- * errors are rarely fatal so this is overkill. Instead in the future
- * we will inject the error event and let the guest recover the device
- * itself.
- */
- if (is_passed_through(pdev))
- goto out;
+
+ zpci_store_pci_error(pdev, ccdf);
driver = to_pci_driver(pdev->dev.driver);
if (driver && driver->err_handler && driver->err_handler->error_detected)
driver->err_handler->error_detected(pdev, pdev->error_state);

How do you intend to stage this versus QEMU changes? This seems like a
big regression if we're suddenly triggering the eventfd that causes
QEMU to halt. Do you need userspace to opt-in to mediated recovery
rather than automatically enabling it on open? Thanks,

Alex

AFAIU userspace registering an eventfd to receive notification for error events is an opt-in? And yes for QEMU the current behavior halts the guest, but even today on an error device becomes unusable and requires manual intervention. I am not sure if we need to add another opt-in mechanism for QEMU.

Thanks

Farhan