Re: [PATCH v14 4/7] s390/pci: Store PCI error information for passthrough devices
From: Niklas Schnelle
Date: Wed Apr 29 2026 - 07:42:29 EST
On Tue, 2026-04-21 at 09:30 -0700, Farhan Ali wrote:
> For a passthrough device we need co-operation from user space to recover
> the device. This would require to bubble up any error information to user
> space. Let's store this error information for passthrough devices, so it
> can be retrieved later.
>
> We can now have userspace drivers (vfio-pci based) on s390x. The userspace
> drivers will not have any KVM fd and so no kzdev associated with them. So
> we need to update the logic for detecting passthrough devices to not depend
> on struct kvm_zdev.
>
> Reviewed-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
> Signed-off-by: Farhan Ali <alifm@xxxxxxxxxxxxx>
> ---
> arch/s390/include/asm/pci.h | 30 ++++++++
> arch/s390/pci/pci.c | 1 +
> arch/s390/pci/pci_event.c | 117 +++++++++++++++++--------------
> drivers/vfio/pci/vfio_pci_zdev.c | 9 ++-
> 4 files changed, 105 insertions(+), 52 deletions(-)
>
--- snip ---
> +
> +void zpci_start_mediated_recovery(struct zpci_dev *zdev)
> +{
> + guard(mutex)(&zdev->pending_errs_lock);
> + zdev->pending_errs.mediated_recovery = true;
> +}
> +EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
> +
> +void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
> +{
> + struct pci_dev *pdev = NULL;
> +
> + guard(mutex)(&zdev->pending_errs_lock);
> + zdev->pending_errs.mediated_recovery = false;
> + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
> + if (zdev->pending_errs.count)
> + pr_info("%s: Unhandled PCI error events count=%d",
> + pci_name(pdev), zdev->pending_errs.count);
Sashiko notes a possible ABBA locking issue here between
pending_errs_lock and the pci_bus_sem inside pci_get_slot(). I wonder
if that would also be visible with lockdep? Also, Sashiko notes that
zdev->zbus->bus could be NULL. I don't think this is possible at the
current call sites via vfio-pci though. Similarly, I don't think
pci_get_slot() can really return NULL at the current call sites. This
makes me wonder, however: would it maybe be cleaner to pass a struct pci_dev
here so you don't need the pci_get_slot() at all? Both
vfio_pci_zdev_open_device() and vfio_pci_zdev_close_device() have that
readily available via vdev->pdev.
> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> + pci_dev_put(pdev);
> +}
> +EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
> +
> static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
> struct pci_driver *driver)
> {
> @@ -175,7 +190,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
> * and the platform determines which functions are affected for
> * multi-function devices.
> */
> -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> {
> pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
> struct zpci_dev *zdev = to_zpci(pdev);
> @@ -194,13 +210,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> }
> pdev->error_state = pci_channel_io_frozen;
>
> - if (is_passed_through(pdev)) {
> - pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
> - pci_name(pdev));
> - status_str = "failed (pass-through)";
> - goto out_unlock;
> - }
> -
> driver = to_pci_driver(pdev->dev.driver);
> if (!is_driver_supported(driver)) {
> if (!driver) {
> @@ -216,12 +225,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> goto out_unlock;
> }
>
> + zpci_store_pci_error(pdev, ccdf);
> +
> ers_res = zpci_event_notify_error_detected(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> status_str = "failed (abort on detection)";
> goto out_unlock;
> }
>
> + scoped_guard(mutex, &zdev->pending_errs_lock) {
> + if (zdev->pending_errs.mediated_recovery) {
> + pr_info("%s: Leaving recovery of pass-through device to user-space\n",
> + pci_name(pdev));
> + ers_res = PCI_ERS_RESULT_RECOVERED;
> + status_str = "in progress";
> + goto out_unlock;
> + }
> + }
> +
Sashiko notes that mixing goto unlock and lock guards within one
function is discouraged. Here it's not that hard to read since it is
just a scoped guard but I think we should still not mix it.
However, if we also convert the device_lock() to a guard lock here, the
goto would still make sense to go to the zpci_report_status(), and that
is still a bit odd with guarded locks. So I think an alternative simple
fix, which makes this overall cleaner too, is to put the whole
scoped_guard() block above into a helper function; then you can do the
goto out_unlock on that helper returning PCI_ERS_RESULT_RECOVERED, in
line with e.g. zpci_event_notify_error_detected(). That way you don't
need to touch existing locks and you get to keep the guard locking.
> if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
> ers_res = zpci_event_do_error_state_clear(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> @@ -266,25 +287,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> * @pdev: PCI function for which to report
> * @es: PCI channel failure state to report
> */
> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
> + struct zpci_ccdf_err *ccdf)
> {
> struct pci_driver *driver;
>
> pci_dev_lock(pdev);
> pdev->error_state = es;
> - /**
> - * While vfio-pci's error_detected callback notifies user-space QEMU
> - * reacts to this by freezing the guest. In an s390 environment PCI
> - * errors are rarely fatal so this is overkill. Instead in the future
> - * we will inject the error event and let the guest recover the device
> - * itself.
> - */
> - if (is_passed_through(pdev))
> - goto out;
> +
> + zpci_store_pci_error(pdev, ccdf);
> driver = to_pci_driver(pdev->dev.driver);
> if (driver && driver->err_handler && driver->err_handler->error_detected)
> driver->err_handler->error_detected(pdev, pdev->error_state);
> -out:
> +
> pci_dev_unlock(pdev);
> }
>
> @@ -330,12 +345,12 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
> break;
> case 0x0040: /* Service Action or Error Recovery Failed */
> case 0x003b:
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
> break;
> default: /* PCI function left in the error state attempt to recover */
> - ers_res = zpci_event_attempt_error_recovery(pdev);
> + ers_res = zpci_event_attempt_error_recovery(pdev, ccdf);
> if (ers_res != PCI_ERS_RESULT_RECOVERED)
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
> break;
> }
> pci_dev_put(pdev);
> diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
> index 0990fdb146b7..0658095ac5b1 100644
> --- a/drivers/vfio/pci/vfio_pci_zdev.c
> +++ b/drivers/vfio/pci/vfio_pci_zdev.c
> @@ -148,6 +148,8 @@ int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
> if (!zdev)
> return -ENODEV;
>
> + zpci_start_mediated_recovery(zdev);
> +
> if (!vdev->vdev.kvm)
> return 0;
Sashiko notes that mediated recovery stays true if kvm_register()
fails later in this function. I think Sashiko is right there.
>
> @@ -161,7 +163,12 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
> {
> struct zpci_dev *zdev = to_zpci(vdev->pdev);
>
> - if (!zdev || !vdev->vdev.kvm)
> + if (!zdev)
> + return;
> +
> + zpci_stop_mediated_recovery(zdev);
> +
> + if (!vdev->vdev.kvm)
> return;
>
> if (zpci_kvm_hook.kvm_unregister)