From ebfa7213e32faafd5532d6f5b3cb873018b671ae Mon Sep 17 00:00:00 2001
From: Shameer Kolothum
Date: Thu, 10 Oct 2024 06:19:31 +0000
Subject: [PATCH] smmuv3: Add support for page fault handling

Handle page fault from host and send response back.

Signed-off-by: Shameer Kolothum
---
 backends/iommufd.c           |  20 +++-
 hw/arm/smmu-common.c         |  39 ++++++--
 hw/arm/smmuv3.c              | 188 ++++++++++++++++++++++++++++++++++-
 hw/vfio/iommufd.c            |   2 +-
 include/hw/arm/smmu-common.h |  24 ++++-
 include/sysemu/iommufd.h     |   2 +-
 6 files changed, 263 insertions(+), 12 deletions(-)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index ee6f5bcf65..e9ce82297b 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -228,7 +228,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
                                 uint32_t pt_id, uint32_t flags,
                                 uint32_t data_type, uint32_t data_len,
                                 void *data_ptr, uint32_t *out_hwpt,
-                                Error **errp)
+                                uint32_t *out_fault_fd, Error **errp)
 {
     int ret, fd = be->fd;
     struct iommu_hwpt_alloc alloc_hwpt = {
@@ -241,6 +241,24 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
         .data_uptr = (uintptr_t)data_ptr,
     };
 
+    if (flags & IOMMU_HWPT_FAULT_ID_VALID) {
+
+        struct iommu_fault_alloc cmd = {
+            .size = sizeof(cmd),
+        };
+
+        ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd);
+        if (ret) {
+            error_setg_errno(errp, errno, "IOMMU_FAULT_QUEUE_ALLOC failed");
+            return false;
+        } else {
+            alloc_hwpt.fault_id = cmd.out_fault_id;
+            if (out_fault_fd) {
+                *out_fault_fd = cmd.out_fault_fd;
+            }
+        }
+    }
+
     ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
     trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
                                      data_len, (uintptr_t)data_ptr,
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index d0bc620606..c382fa16e5 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -670,7 +670,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev,
     if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
                                     idev->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT,
                                     IOMMU_HWPT_DATA_NONE, 0, NULL,
-                                    &s2_hwpt_id, errp)) {
+                                    &s2_hwpt_id, NULL, errp)) {
         error_setg(errp, "failed to allocate an S2 hwpt");
         return false;
     }
@@ -695,7 +695,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev,
                                     viommu->core->viommu_id, 0,
                                     IOMMU_HWPT_DATA_ARM_SMMUV3,
                                     sizeof(abort_data), &abort_data,
-                                    &viommu->abort_hwpt_id, errp)) {
+                                    &viommu->abort_hwpt_id, NULL, errp)) {
         error_setg(errp, "failed to allocate an abort pagetable");
         goto free_viommu_core;
     }
@@ -704,7 +704,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev,
                                     viommu->core->viommu_id, 0,
                                     IOMMU_HWPT_DATA_ARM_SMMUV3,
                                     sizeof(bypass_data), &bypass_data,
-                                    &viommu->bypass_hwpt_id, errp)) {
+                                    &viommu->bypass_hwpt_id, NULL, errp)) {
         error_setg(errp, "failed to allocate a bypass pagetable");
         goto free_abort_hwpt;
     }
@@ -882,6 +882,25 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort)
         hwpt_id = sdev->viommu->bypass_hwpt_id;
     }
 
+    /* ToDo: May be better to move the below to smmuv3. */
+    if (s1_hwpt->out_fault_fd) {
+        struct io_uring *ring = &s1_hwpt->fault_ring;
+        struct io_uring_sqe *sqe;
+        struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1};
+        s1_hwpt->exiting = true;
+        /* Send out a timeout sqe for the read handler to exit */
+        sqe = io_uring_get_sqe(ring);
+        io_uring_prep_timeout(sqe, &ts, 0, 0);
+        io_uring_submit(ring);
+        qemu_mutex_lock(&s1_hwpt->fault_mutex);
+        qemu_cond_signal(&s1_hwpt->fault_cond);
+        qemu_mutex_unlock(&s1_hwpt->fault_mutex);
+        qemu_thread_join(&s1_hwpt->read_fault_thread);
+        qemu_thread_join(&s1_hwpt->write_fault_thread);
+        qemu_cond_destroy(&s1_hwpt->fault_cond);
+        qemu_mutex_destroy(&s1_hwpt->fault_mutex);
+        io_uring_queue_exit(&s1_hwpt->fault_ring);
+    }
+
     if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) {
         return;
     }
@@ -892,11 +911,13 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort)
 }
 
 int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type,
-                                uint32_t data_len, void *data)
+                                uint32_t data_len, void *data,
+                                bool req_fault_fd)
 {
     SMMUViommu *viommu = sdev->viommu;
     SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt;
     HostIOMMUDeviceIOMMUFD *idev = sdev->idev;
+    uint32_t flags = 0;
 
     if (!idev || !viommu) {
         return -ENOENT;
@@ -912,12 +933,18 @@ int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type,
     }
 
     s1_hwpt->smmu = sdev->smmu;
+    s1_hwpt->sdev = sdev;
     s1_hwpt->viommu = viommu;
     s1_hwpt->iommufd = idev->iommufd;
 
+    if (req_fault_fd) {
+        flags |= IOMMU_HWPT_FAULT_ID_VALID;
+    }
+
     if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
-                                    viommu->core->viommu_id, 0, data_type,
-                                    data_len, data, &s1_hwpt->hwpt_id, NULL)) {
+                                    viommu->core->viommu_id, flags, data_type,
+                                    data_len, data, &s1_hwpt->hwpt_id,
+                                    &s1_hwpt->out_fault_fd, NULL)) {
         goto free;
     }
 
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 8d8dcccd48..30c0ae4c3b 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -34,6 +34,9 @@
 #include "hw/arm/smmuv3.h"
 #include "smmuv3-internal.h"
 #include "smmu-internal.h"
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
 
 #define PTW_RECORD_FAULT(cfg)   (((cfg)->stage == 1) ? (cfg)->record_faults : \
                                  (cfg)->s2cfg.record_faults)
@@ -1258,6 +1261,165 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
     }
 }
 
+static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt,
+                                      struct iommu_hwpt_pgfault *fault)
+{
+    PendFaultEntry *pend;
+    SMMUDevice *sdev = hwpt->sdev;
+    SMMUv3State *s3 = sdev->smmu;
+    uint32_t sid = smmu_get_sid(sdev);
+    SMMUEventInfo info = {0};
+
+    info.sid = sid;
+    info.type = SMMU_EVT_F_TRANSLATION;
+    info.u.f_translation.addr = fault->addr;
+    info.u.f_translation.stall = true;
+    info.u.f_translation.ssid = fault->pasid;
+    info.u.f_translation.stag = fault->grpid;
+
+    if (fault->flags & IOMMU_PGFAULT_FLAGS_PASID_VALID) {
+        info.u.f_translation.ssv = true;
+    }
+    if (fault->perm & IOMMU_PGFAULT_PERM_READ) {
+        info.u.f_translation.rnw = true;
+    }
+    if (fault->perm & IOMMU_PGFAULT_PERM_PRIV) {
+        info.u.f_translation.pnu = true;
+    }
+    if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) {
+        info.u.f_translation.ind = true;
+    }
+
+    pend = g_new0(PendFaultEntry, 1);
+    memcpy(&pend->fault, fault, sizeof(*fault));
+    qemu_mutex_lock(&hwpt->fault_mutex);
+    QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry);
+    qemu_mutex_unlock(&hwpt->fault_mutex);
+    smmuv3_record_event(s3, &info);
+}
+
+static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid,
+                                       uint32_t stag, uint32_t code)
+{
+    SMMUDevice *sdev = smmu_find_sdev(bs, sid);
+    PageRespEntry *msg;
+    PendFaultEntry *pend, *tmp;
+    SMMUS1Hwpt *hwpt;
+    bool found = false;
+
+    if (!sdev) {
+        return;
+    }
+
+    hwpt = sdev->s1_hwpt;
+    msg = g_new0(PageRespEntry, 1);
+
+    /* Kernel expects addr and pasid info for page response */
+    qemu_mutex_lock(&hwpt->fault_mutex);
+    QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) {
+        if (pend->fault.grpid == stag) {
+            QTAILQ_REMOVE(&hwpt->pendfault, pend, entry);
+            msg->resp.cookie = pend->fault.cookie;
+            msg->resp.code = code;
+            QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry);
+            qemu_cond_signal(&hwpt->fault_cond);
+
+            g_free(pend);
+            found = true;
+            break;
+        }
+    }
+    qemu_mutex_unlock(&hwpt->fault_mutex);
+    if (!found) {
+        warn_report("No matching fault for resume(stag 0x%x), drop!", stag);
+        g_free(msg);
+    }
+}
+
+static void *write_fault_handler(void *opaque)
+{
+    SMMUS1Hwpt *hwpt = opaque;
+    PageRespEntry *msg, *tmp;
+    struct iommu_hwpt_page_response *resp;
+    int ret;
+
+    resp = g_new0(struct iommu_hwpt_page_response, 1);
+    while (!hwpt->exiting) {
+        /* Check we have any pending responses */
+        qemu_mutex_lock(&hwpt->fault_mutex);
+        if (!hwpt->exiting) {
+            qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex);
+        }
+        QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) {
+            QTAILQ_REMOVE(&hwpt->pageresp, msg, entry);
+            memcpy(resp, &msg->resp, sizeof(*resp));
+            g_free(msg);
+
+            ret = write(hwpt->out_fault_fd, resp, sizeof(*resp));
+            if (ret != sizeof(*resp)) {
+                warn_report("Write resp[cookie 0x%x] fail %d",
+                            resp->cookie, ret);
+            }
+        }
+        qemu_mutex_unlock(&hwpt->fault_mutex);
+    }
+    g_free(resp);
+    return NULL;
+}
+
+static void *read_fault_handler(void *opaque)
+{
+    SMMUS1Hwpt *hwpt = opaque;
+    struct io_uring_sqe *sqe;
+    struct io_uring_cqe *cqe;
+    struct iommu_hwpt_pgfault *fault;
+    struct io_uring *ring = &hwpt->fault_ring;
+    void *data;
+    int ret;
+
+    fault = g_new0(struct iommu_hwpt_pgfault, 1);
+    while (!hwpt->exiting) {
+        sqe = io_uring_get_sqe(ring);
+        io_uring_prep_read(sqe, hwpt->out_fault_fd, fault,
+                           sizeof(*fault), 0);
+        io_uring_sqe_set_data(sqe, fault);
+        io_uring_submit(ring);
+
+        ret = io_uring_wait_cqe(ring, &cqe);
+        if (ret == 0) {
+            if (cqe->res == sizeof(*fault)) {
+                data = io_uring_cqe_get_data(cqe);
+                smmuv3_report_iommu_fault(hwpt, data);
+            }
+            io_uring_cqe_seen(ring, cqe);
+        } else {
+            warn_report("Read fault[hwpt_id 0x%x] failed %d",
+                        hwpt->hwpt_id, ret);
+        }
+    }
+    g_free(fault);
+    return NULL;
+}
+
+static void create_fault_handlers(SMMUS1Hwpt *hwpt)
+{
+    if (!hwpt->out_fault_fd) {
+        warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id);
+        return;
+    }
+
+    io_uring_queue_init(1024, &hwpt->fault_ring, 0);
+    qemu_mutex_init(&hwpt->fault_mutex);
+    qemu_cond_init(&hwpt->fault_cond);
+    QTAILQ_INIT(&hwpt->pageresp);
+    QTAILQ_INIT(&hwpt->pendfault);
+    qemu_thread_create(&hwpt->read_fault_thread, "io fault read",
+                       read_fault_handler,
+                       hwpt, QEMU_THREAD_JOINABLE);
+    qemu_thread_create(&hwpt->write_fault_thread, "io fault write",
+                       write_fault_handler,
+                       hwpt, QEMU_THREAD_JOINABLE);
+}
 static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid)
 {
 #ifdef __linux__
@@ -1266,6 +1428,7 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid)
     struct iommu_hwpt_arm_smmuv3 nested_data = {};
     SMMUv3State *s = sdev->smmu;
     SMMUState *bs = &s->smmu_state;
+    bool req_fault_fd = false;
     uint32_t config;
     STE ste;
     int ret;
@@ -1309,13 +1472,22 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid)
     /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */
     nested_data.ste[1] &= 0x380000ffULL;
 
+    if (STE_S1CDMAX(&ste)) {
+        req_fault_fd = true;
+    }
+
     ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3,
-                                      sizeof(nested_data), &nested_data);
+                                      sizeof(nested_data), &nested_data,
+                                      req_fault_fd);
     if (ret) {
         error_report("Unable to install nested STE=%16LX:%16LX, ret=%d",
                      nested_data.ste[1], nested_data.ste[0], ret);
     }
 
+    if (req_fault_fd) {
+        create_fault_handlers(sdev->s1_hwpt);
+    }
+
     trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]);
 #endif
 }
@@ -1631,10 +1803,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
         case SMMU_CMD_TLBI_EL2_VA:
         case SMMU_CMD_TLBI_EL2_VAA:
         case SMMU_CMD_PRI_RESP:
-        case SMMU_CMD_RESUME:
         case SMMU_CMD_STALL_TERM:
             trace_smmuv3_unhandled_cmd(type);
            break;
+        case SMMU_CMD_RESUME:
+        {
+            uint32_t sid = CMD_SID(&cmd);
+            uint16_t stag = CMD_RESUME_STAG(&cmd);
+            uint8_t action = CMD_RESUME_AC(&cmd);
+            uint32_t code = IOMMUFD_PAGE_RESP_INVALID;
+
+            if (action) {
+                code = IOMMUFD_PAGE_RESP_SUCCESS;
+            }
+            smmuv3_notify_stall_resume(bs, sid, stag, code);
+            break;
+        }
         default:
             cmd_error = SMMU_CERROR_ILL;
             break;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 528023b95b..c0eb87c78c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -344,7 +344,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
     if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid,
                                     container->ioas_id, flags,
                                     IOMMU_HWPT_DATA_NONE, 0, NULL,
-                                    &hwpt_id, errp)) {
+                                    &hwpt_id, NULL, errp)) {
         return -EINVAL;
     }
 
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index e30539a8d4..087a11efc7 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -138,13 +138,34 @@ typedef struct SMMUVdev {
     uint32_t sid;
 }SMMUVdev;
 
+typedef struct PendFaultEntry {
+    struct iommu_hwpt_pgfault fault;
+    QTAILQ_ENTRY(PendFaultEntry) entry;
+} PendFaultEntry;
+
+typedef struct PageRespEntry {
+    struct iommu_hwpt_page_response resp;
+    QTAILQ_ENTRY(PageRespEntry) entry;
+} PageRespEntry;
+
 typedef struct SMMUS1Hwpt {
+    void *sdev;
     void *smmu;
     IOMMUFDBackend *iommufd;
     SMMUViommu *viommu;
     uint32_t hwpt_id;
+    uint32_t out_fault_fd;
     QLIST_HEAD(, SMMUDevice) device_list;
     QLIST_ENTRY(SMMUViommu) next;
+    /* fault handling */
+    struct io_uring fault_ring;
+    QemuThread read_fault_thread;
+    QemuThread write_fault_thread;
+    QemuMutex fault_mutex;
+    QemuCond fault_cond;
+    QTAILQ_HEAD(, PageRespEntry) pageresp;
+    QTAILQ_HEAD(, PendFaultEntry) pendfault;
+    bool exiting;
 } SMMUS1Hwpt;
 
 typedef struct SMMUDevice {
@@ -258,7 +279,8 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type,
                       uint32_t data_len, void *data);
 void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort);
 int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type,
-                                uint32_t data_len, void *data);
+                                uint32_t data_len, void *data,
+                                bool req_fault_fd);
 int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type,
                                uint32_t len, uint32_t *num, void *reqs);
 int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type,
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 0f2c826036..b279184974 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -62,7 +62,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
                                 uint32_t pt_id, uint32_t flags,
                                 uint32_t data_type, uint32_t data_len,
                                 void *data_ptr, uint32_t *out_hwpt,
-                                Error **errp);
+                                uint32_t *out_fault_fd, Error **errp);
 bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id,
                                         bool start, Error **errp);
 bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
-- 
2.41.0.windows.1