Re: [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices

From: Alex Williamson

Date: Wed Jun 03 2026 - 18:24:28 EST

On Wed, 3 Jun 2026 11:24:12 -0700
Farhan Ali <alifm@xxxxxxxxxxxxx> wrote:
> @@ -81,6 +52,47 @@ static bool is_driver_supported(struct pci_driver *driver)
> return true;
> }
>
> +static int zpci_store_pci_error(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> +{
> + struct zpci_dev *zdev = to_zpci(pdev);
> + int i;
> +
> + guard(mutex)(&zdev->pending_errs_lock);
> + if (!zdev->pending_errs.mediated_recovery)
> + return -EINVAL;
> +
> + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
> + pr_err("%s: Maximum number (%d) of pending error events queued\n",
> + pci_name(pdev), ZPCI_ERR_PENDING_MAX);

Is this really an err condition or just a warn? Nothing is
fundamentally broken here, the queue is just full and we're losing
errors. Maybe this should be a warn?

Can this create a DoS if a device continues to generate errors and
nobody is consuming them? Userspace could ignore the error. This
should probably be _ratelimited.

pr_err + pci_name suggests this should be a pci_ or dev_ call and since
the pci variant doesn't exist, use dev_warn_ratelimited().

> + return -ENOMEM;
> + }
> +
> + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
> + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
> + zdev->pending_errs.tail++;
> + zdev->pending_errs.count++;
> + return 0;
> +}
> +
> +void zpci_start_mediated_recovery(struct zpci_dev *zdev)
> +{
> + guard(mutex)(&zdev->pending_errs_lock);
> + zdev->pending_errs.mediated_recovery = true;
> +}
> +EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
> +
> +void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
> +{
> + guard(mutex)(&zdev->pending_errs_lock);
> + zdev->pending_errs.mediated_recovery = false;
> + if (zdev->pending_errs.count)
> + pr_info("Unhandled PCI error events count=%d for PCI function 0x%x\n",
> + zdev->pending_errs.count, zdev->fid);

It seems like there's always a race that an error could occur as the
user is closing the device. Is this really worth logging at anything
more than a dbg level, pci_dbg() in this case?

> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> +}
> +EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
> +
> static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
> struct pci_driver *driver)
> {
> @@ -175,12 +187,15 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
> * and the platform determines which functions are affected for
> * multi-function devices.
> */
> -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> {
> pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
> struct zpci_dev *zdev = to_zpci(pdev);
> + bool mediated_recovery = false;
> char *status_str = "success";
> struct pci_driver *driver;
> + int rc;
>
> /*
> * Ensure that the PCI function is not removed concurrently, no driver
> @@ -194,13 +209,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> }
> pdev->error_state = pci_channel_io_frozen;
>
> - if (is_passed_through(pdev)) {
> - pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
> - pci_name(pdev));
> - status_str = "failed (pass-through)";
> - goto out_unlock;
> - }
> -
> driver = to_pci_driver(pdev->dev.driver);
> if (!is_driver_supported(driver)) {
> if (!driver) {
> @@ -216,12 +224,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> goto out_unlock;
> }
>
> + rc = zpci_store_pci_error(pdev, ccdf);
> + if (!rc || rc == -ENOMEM)
> + mediated_recovery = true;

This is a convoluted way to get the state of
zdev->pending_errs.mediated_recovery, which becomes invalid out of
pending_errs_lock anyway.

> +
> ers_res = zpci_event_notify_error_detected(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> status_str = "failed (abort on detection)";
> goto out_unlock;
> }
>
> + if (mediated_recovery) {
> + pr_info("%s: Leaving recovery of pass-through device to user-space\n",
> + pci_name(pdev));
> + ers_res = PCI_ERS_RESULT_RECOVERED;
> + status_str = "in progress";
> + goto out_unlock;
> + }

Since zdev->pending_errs.mediated_recovery is only valid while holding
pending_errs_lock, this is really no better than the
is_passed_through() test.

> +
> if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
> ers_res = zpci_event_do_error_state_clear(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> @@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> * @pdev: PCI function for which to report
> * @es: PCI channel failure state to report
> */
> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
> + struct zpci_ccdf_err *ccdf)
> {
> struct pci_driver *driver;
>
> pci_dev_lock(pdev);
> pdev->error_state = es;
> - /**
> - * While vfio-pci's error_detected callback notifies user-space QEMU
> - * reacts to this by freezing the guest. In an s390 environment PCI
> - * errors are rarely fatal so this is overkill. Instead in the future
> - * we will inject the error event and let the guest recover the device
> - * itself.
> - */
> - if (is_passed_through(pdev))
> - goto out;
> +
> + zpci_store_pci_error(pdev, ccdf);
> driver = to_pci_driver(pdev->dev.driver);
> if (driver && driver->err_handler && driver->err_handler->error_detected)
> driver->err_handler->error_detected(pdev, pdev->error_state);

How do you intend to stage this versus QEMU changes? This seems like a
big regression if we're suddenly triggering the eventfd that causes
QEMU to halt. Do you need userspace to opt-in to mediated recovery
rather than automatically enabling it on open? Thanks,

Alex