!343 Add migration support for VFIO devices

From: @imxcc Reviewed-by: @kevinzhu1 Signed-off-by: @kevinzhu1
2021-07-29 06:30:45 +00:00 · 2021-07-29 06:30:45 +00:00 · 3308e532d1
commit 3308e532d1
parent 0ca7fd6709 4daf6a89f5
23 changed files with 5423 additions and 1 deletions
--- a/hw-net-fix-vmxnet3-live-migration.patch
+++ b/hw-net-fix-vmxnet3-live-migration.patch
@ -0,0 +1,136 @@
 From b8b9f58ee5d3cff0a1e7cca770fe632043efb728 Mon Sep 17 00:00:00 2001
 From: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
 Date: Fri, 5 Jul 2019 04:07:11 +0300
 Subject: [PATCH] hw/net: fix vmxnet3 live migration
 At some point vmxnet3 live migration stopped working and git-bisect
 didn't help finding a working version.
 The issue is the PCI configuration space is not being migrated
 successfully and MSIX remains masked at destination.
 Remove the migration differentiation between PCI and PCIe since
 the logic resides now inside VMSTATE_PCI_DEVICE.
 Remove also the VMXNET3_COMPAT_FLAG_DISABLE_PCIE based differentiation
 since at 'realize' time is decided if the device is PCI or PCIe,
 then the above macro is enough.
 Use the opportunity to move to the standard VMSTATE_MSIX
 instead of the deprecated SaveVMHandlers.
 Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
 Message-Id: <20190705010711.23277-1-marcel.apfelbaum@gmail.com>
 Tested-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
 Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 ---
 hw/net/vmxnet3.c | 52 ++----------------------------------------------
 1 file changed, 2 insertions(+), 50 deletions(-)
 diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
 index ecc4f5bcf0..bf8e6ca4c9 100644
 --- a/hw/net/vmxnet3.c
 +++ b/hw/net/vmxnet3.c
@@ -2153,21 +2153,6 @@ vmxnet3_cleanup_msi(VMXNET3State *s)
     msi_uninit(d);
 }
 -static void
 -vmxnet3_msix_save(QEMUFile *f, void *opaque)
 -{
 -    PCIDevice *d = PCI_DEVICE(opaque);
 -    msix_save(d, f);
 -}
 -
 -static int
 -vmxnet3_msix_load(QEMUFile *f, void *opaque, int version_id)
 -{
 -    PCIDevice *d = PCI_DEVICE(opaque);
 -    msix_load(d, f);
 -    return 0;
 -}
 -
 static const MemoryRegionOps b0_ops = {
     .read = vmxnet3_io_bar0_read,
     .write = vmxnet3_io_bar0_write,
@@ -2188,11 +2173,6 @@ static const MemoryRegionOps b1_ops = {
     },
 };
 -static SaveVMHandlers savevm_vmxnet3_msix = {
 -    .save_state = vmxnet3_msix_save,
 -    .load_state = vmxnet3_msix_load,
 -};
 -
 static uint64_t vmxnet3_device_serial_num(VMXNET3State *s)
 {
     uint64_t dsn_payload;
@@ -2215,7 +2195,6 @@ static uint64_t vmxnet3_device_serial_num(VMXNET3State *s)
 static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp)
 {
 -    DeviceState *dev = DEVICE(pci_dev);
     VMXNET3State *s = VMXNET3(pci_dev);
     int ret;
@@ -2261,8 +2240,6 @@ static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp)
         pcie_dev_ser_num_init(pci_dev, VMXNET3_DSN_OFFSET,
                               vmxnet3_device_serial_num(s));
     }
 -
 -    register_savevm_live(dev, "vmxnet3-msix", -1, 1, &savevm_vmxnet3_msix, s);
 }
 static void vmxnet3_instance_init(Object *obj)
@@ -2452,29 +2429,6 @@ static const VMStateDescription vmstate_vmxnet3_int_state = {
     }
 };
 -static bool vmxnet3_vmstate_need_pcie_device(void *opaque)
 -{
 -    VMXNET3State *s = VMXNET3(opaque);
 -
 -    return !(s->compat_flags & VMXNET3_COMPAT_FLAG_DISABLE_PCIE);
 -}
 -
 -static bool vmxnet3_vmstate_test_pci_device(void *opaque, int version_id)
 -{
 -    return !vmxnet3_vmstate_need_pcie_device(opaque);
 -}
 -
 -static const VMStateDescription vmstate_vmxnet3_pcie_device = {
 -    .name = "vmxnet3/pcie",
 -    .version_id = 1,
 -    .minimum_version_id = 1,
 -    .needed = vmxnet3_vmstate_need_pcie_device,
 -    .fields = (VMStateField[]) {
 -        VMSTATE_PCI_DEVICE(parent_obj, VMXNET3State),
 -        VMSTATE_END_OF_LIST()
 -    }
 -};
 -
 static const VMStateDescription vmstate_vmxnet3 = {
     .name = "vmxnet3",
     .version_id = 1,
@@ -2482,9 +2436,8 @@ static const VMStateDescription vmstate_vmxnet3 = {
     .pre_save = vmxnet3_pre_save,
     .post_load = vmxnet3_post_load,
     .fields = (VMStateField[]) {
 -            VMSTATE_STRUCT_TEST(parent_obj, VMXNET3State,
 -                                vmxnet3_vmstate_test_pci_device, 0,
 -                                vmstate_pci_device, PCIDevice),
 +            VMSTATE_PCI_DEVICE(parent_obj, VMXNET3State),
 +            VMSTATE_MSIX(parent_obj, VMXNET3State),
             VMSTATE_BOOL(rx_packets_compound, VMXNET3State),
             VMSTATE_BOOL(rx_vlan_stripping, VMXNET3State),
             VMSTATE_BOOL(lro_supported, VMXNET3State),
@@ -2520,7 +2473,6 @@ static const VMStateDescription vmstate_vmxnet3 = {
     },
     .subsections = (const VMStateDescription*[]) {
         &vmxstate_vmxnet3_mcast_list,
 -        &vmstate_vmxnet3_pcie_device,
         NULL
     }
 };
 -- 
 2.27.0
--- a/include-Make-headers-more-self-contained.patch
+++ b/include-Make-headers-more-self-contained.patch
--- a/linux-headers-Update-against-Add-migration-support-f.patch
+++ b/linux-headers-Update-against-Add-migration-support-f.patch
@ -0,0 +1,517 @@
 From 7ab9ce4016ec48e0af8010f742ee39fc84342d00 Mon Sep 17 00:00:00 2001
 From: Jinhao Gao <gaojinhao@huawei.com>
 Date: Fri, 23 Jul 2021 14:55:12 +0800
 Subject: [PATCH] linux headers: Update against "Add migration support for VFIO
 devices"
 Update linux-headers/linux/vfio.h against Linux 5.9-rc7 for the
 VFIO migration support series.
 Signed-off-by: Jinhao Gao <gaojinhao@huawei.com>
 Signed-off-by: Shenming Lu <lushenming@huawei.com>
 ---
 linux-headers/linux/vfio.h | 420 +++++++++++++++++++++++++++++++++++--
 1 file changed, 405 insertions(+), 15 deletions(-)
 diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
 index 24f505199f..a90672494d 100644
 --- a/linux-headers/linux/vfio.h
 +++ b/linux-headers/linux/vfio.h
@@ -295,15 +295,39 @@ struct vfio_region_info_cap_type {
 	__u32 subtype;	/* type specific */
 };
 +/*
 + * List of region types, global per bus driver.
 + * If you introduce a new type, please add it here.
 + */
 +
 +/* PCI region type containing a PCI vendor part */
 #define VFIO_REGION_TYPE_PCI_VENDOR_TYPE	(1 << 31)
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK	(0xffff)
 +#define VFIO_REGION_TYPE_GFX                    (1)
 +#define VFIO_REGION_TYPE_CCW			(2)
 +#define VFIO_REGION_TYPE_MIGRATION              (3)
 +
 +/* sub-types for VFIO_REGION_TYPE_PCI_* */
 -/* 8086 Vendor sub-types */
 +/* 8086 vendor PCI sub-types */
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION	(1)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 -#define VFIO_REGION_TYPE_GFX                    (1)
 +/* 10de vendor PCI sub-types */
 +/*
 + * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
 + */
 +#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
 +
 +/* 1014 vendor PCI sub-types */
 +/*
 + * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
 + * to do TLB invalidation on a GPU.
 + */
 +#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
 +
 +/* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 /**
@@ -353,24 +377,237 @@ struct vfio_region_gfx_edid {
 #define VFIO_DEVICE_GFX_LINK_STATE_DOWN  2
 };
 -#define VFIO_REGION_TYPE_CCW			(2)
 -/* ccw sub-types */
 +/* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD	(1)
 +#define VFIO_REGION_SUBTYPE_CCW_SCHIB		(2)
 +#define VFIO_REGION_SUBTYPE_CCW_CRW		(3)
 -/*
 - * 10de vendor sub-type
 - *
 - * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
 - */
 -#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
 +/* sub-types for VFIO_REGION_TYPE_MIGRATION */
 +#define VFIO_REGION_SUBTYPE_MIGRATION           (1)
 /*
 - * 1014 vendor sub-type
 + * The structure vfio_device_migration_info is placed at the 0th offset of
 + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
 + * migration information. Field accesses from this structure are only supported
 + * at their native width and alignment. Otherwise, the result is undefined and
 + * vendor drivers should return an error.
  *
 - * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
 - * to do TLB invalidation on a GPU.
 + * device_state: (read/write)
 + *      - The user application writes to this field to inform the vendor driver
 + *        about the device state to be transitioned to.
 + *      - The vendor driver should take the necessary actions to change the
 + *        device state. After successful transition to a given state, the
 + *        vendor driver should return success on write(device_state, state)
 + *        system call. If the device state transition fails, the vendor driver
 + *        should return an appropriate -errno for the fault condition.
 + *      - On the user application side, if the device state transition fails,
 + *	  that is, if write(device_state, state) returns an error, read
 + *	  device_state again to determine the current state of the device from
 + *	  the vendor driver.
 + *      - The vendor driver should return previous state of the device unless
 + *        the vendor driver has encountered an internal error, in which case
 + *        the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
 + *      - The user application must use the device reset ioctl to recover the
 + *        device from VFIO_DEVICE_STATE_ERROR state. If the device is
 + *        indicated to be in a valid device state by reading device_state, the
 + *        user application may attempt to transition the device to any valid
 + *        state reachable from the current state or terminate itself.
 + *
 + *      device_state consists of 3 bits:
 + *      - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
 + *        it indicates the _STOP state. When the device state is changed to
 + *        _STOP, driver should stop the device before write() returns.
 + *      - If bit 1 is set, it indicates the _SAVING state, which means that the
 + *        driver should start gathering device state information that will be
 + *        provided to the VFIO user application to save the device's state.
 + *      - If bit 2 is set, it indicates the _RESUMING state, which means that
 + *        the driver should prepare to resume the device. Data provided through
 + *        the migration region should be used to resume the device.
 + *      Bits 3 - 31 are reserved for future use. To preserve them, the user
 + *      application should perform a read-modify-write operation on this
 + *      field when modifying the specified bits.
 + *
 + *  +------- _RESUMING
 + *  |+------ _SAVING
 + *  ||+----- _RUNNING
 + *  |||
 + *  000b => Device Stopped, not saving or resuming
 + *  001b => Device running, which is the default state
 + *  010b => Stop the device & save the device state, stop-and-copy state
 + *  011b => Device running and save the device state, pre-copy state
 + *  100b => Device stopped and the device state is resuming
 + *  101b => Invalid state
 + *  110b => Error state
 + *  111b => Invalid state
 + *
 + * State transitions:
 + *
 + *              _RESUMING  _RUNNING    Pre-copy    Stop-and-copy   _STOP
 + *                (100b)     (001b)     (011b)        (010b)       (000b)
 + * 0. Running or default state
 + *                             |
 + *
 + * 1. Normal Shutdown (optional)
 + *                             |------------------------------------->|
 + *
 + * 2. Save the state or suspend
 + *                             |------------------------->|---------->|
 + *
 + * 3. Save the state during live migration
 + *                             |----------->|------------>|---------->|
 + *
 + * 4. Resuming
 + *                  |<---------|
 + *
 + * 5. Resumed
 + *                  |--------->|
 + *
 + * 0. Default state of VFIO device is _RUNNING when the user application starts.
 + * 1. During normal shutdown of the user application, the user application may
 + *    optionally change the VFIO device state from _RUNNING to _STOP. This
 + *    transition is optional. The vendor driver must support this transition but
 + *    must not require it.
 + * 2. When the user application saves state or suspends the application, the
 + *    device state transitions from _RUNNING to stop-and-copy and then to _STOP.
 + *    On state transition from _RUNNING to stop-and-copy, driver must stop the
 + *    device, save the device state and send it to the application through the
 + *    migration region. The sequence to be followed for such transition is given
 + *    below.
 + * 3. In live migration of user application, the state transitions from _RUNNING
 + *    to pre-copy, to stop-and-copy, and to _STOP.
 + *    On state transition from _RUNNING to pre-copy, the driver should start
 + *    gathering the device state while the application is still running and send
 + *    the device state data to application through the migration region.
 + *    On state transition from pre-copy to stop-and-copy, the driver must stop
 + *    the device, save the device state and send it to the user application
 + *    through the migration region.
 + *    Vendor drivers must support the pre-copy state even for implementations
 + *    where no data is provided to the user before the stop-and-copy state. The
 + *    user must not be required to consume all migration data before the device
 + *    transitions to a new state, including the stop-and-copy state.
 + *    The sequence to be followed for above two transitions is given below.
 + * 4. To start the resuming phase, the device state should be transitioned from
 + *    the _RUNNING to the _RESUMING state.
 + *    In the _RESUMING state, the driver should use the device state data
 + *    received through the migration region to resume the device.
 + * 5. After providing saved device data to the driver, the application should
 + *    change the state from _RESUMING to _RUNNING.
 + *
 + * reserved:
 + *      Reads on this field return zero and writes are ignored.
 + *
 + * pending_bytes: (read only)
 + *      The number of pending bytes still to be migrated from the vendor driver.
 + *
 + * data_offset: (read only)
 + *      The user application should read data_offset field from the migration
 + *      region. The user application should read the device data from this
 + *      offset within the migration region during the _SAVING state or write
 + *      the device data during the _RESUMING state. See below for details of
 + *      sequence to be followed.
 + *
 + * data_size: (read/write)
 + *      The user application should read data_size to get the size in bytes of
 + *      the data copied in the migration region during the _SAVING state and
 + *      write the size in bytes of the data copied in the migration region
 + *      during the _RESUMING state.
 + *
 + * The format of the migration region is as follows:
 + *  ------------------------------------------------------------------
 + * |vfio_device_migration_info|    data section                      |
 + * |                          |     ///////////////////////////////  |
 + * ------------------------------------------------------------------
 + *   ^                              ^
 + *  offset 0-trapped part        data_offset
 + *
 + * The structure vfio_device_migration_info is always followed by the data
 + * section in the region, so data_offset will always be nonzero. The offset
 + * from where the data is copied is decided by the kernel driver. The data
 + * section can be trapped, mmapped, or partitioned, depending on how the kernel
 + * driver defines the data section. The data section partition can be defined
 + * as mapped by the sparse mmap capability. If mmapped, data_offset must be
 + * page aligned, whereas initial section which contains the
 + * vfio_device_migration_info structure, might not end at the offset, which is
 + * page aligned. The user is not required to access through mmap regardless
 + * of the capabilities of the region mmap.
 + * The vendor driver should determine whether and how to partition the data
 + * section. The vendor driver should return data_offset accordingly.
 + *
 + * The sequence to be followed while in pre-copy state and stop-and-copy state
 + * is as follows:
 + * a. Read pending_bytes, indicating the start of a new iteration to get device
 + *    data. Repeated read on pending_bytes at this stage should have no side
 + *    effects.
 + *    If pending_bytes == 0, the user application should not iterate to get data
 + *    for that device.
 + *    If pending_bytes > 0, perform the following steps.
 + * b. Read data_offset, indicating that the vendor driver should make data
 + *    available through the data section. The vendor driver should return this
 + *    read operation only after data is available from (region + data_offset)
 + *    to (region + data_offset + data_size).
 + * c. Read data_size, which is the amount of data in bytes available through
 + *    the migration region.
 + *    Read on data_offset and data_size should return the offset and size of
 + *    the current buffer if the user application reads data_offset and
 + *    data_size more than once here.
 + * d. Read data_size bytes of data from (region + data_offset) from the
 + *    migration region.
 + * e. Process the data.
 + * f. Read pending_bytes, which indicates that the data from the previous
 + *    iteration has been read. If pending_bytes > 0, go to step b.
 + *
 + * The user application can transition from the _SAVING|_RUNNING
 + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
 + * number of pending bytes. The user application should iterate in _SAVING
 + * (stop-and-copy) until pending_bytes is 0.
 + *
 + * The sequence to be followed while _RESUMING device state is as follows:
 + * While data for this device is available, repeat the following steps:
 + * a. Read data_offset from where the user application should write data.
 + * b. Write migration data starting at the migration region + data_offset for
 + *    the length determined by data_size from the migration source.
 + * c. Write data_size, which indicates to the vendor driver that data is
 + *    written in the migration region. Vendor driver must return this write
 + *    operations on consuming data. Vendor driver should apply the
 + *    user-provided migration region data to the device resume state.
 + *
 + * If an error occurs during the above sequences, the vendor driver can return
 + * an error code for next read() or write() operation, which will terminate the
 + * loop. The user application should then take the next necessary action, for
 + * example, failing migration or terminating the user application.
 + *
 + * For the user application, data is opaque. The user application should write
 + * data in the same order as the data is received and the data should be of
 + * same transaction size at the source.
  */
 -#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
 +
 +struct vfio_device_migration_info {
 +	__u32 device_state;         /* VFIO device state */
 +#define VFIO_DEVICE_STATE_STOP      (0)
 +#define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
 +#define VFIO_DEVICE_STATE_SAVING    (1 << 1)
 +#define VFIO_DEVICE_STATE_RESUMING  (1 << 2)
 +#define VFIO_DEVICE_STATE_MASK      (VFIO_DEVICE_STATE_RUNNING | \
 +				     VFIO_DEVICE_STATE_SAVING |  \
 +				     VFIO_DEVICE_STATE_RESUMING)
 +/* With _RESUMING set, a state is valid only if no other state bit is set. */
 +#define VFIO_DEVICE_STATE_VALID(state) \
 +	((state) & VFIO_DEVICE_STATE_RESUMING ? \
 +	((state) & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
 +/* The error state is encoded as _SAVING | _RESUMING (110b). */
 +#define VFIO_DEVICE_STATE_IS_ERROR(state) \
 +	(((state) & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
 +						VFIO_DEVICE_STATE_RESUMING))
 +/* Keeps the reserved bits and replaces the state bits with the error state. */
 +#define VFIO_DEVICE_STATE_SET_ERROR(state) \
 +	(((state) & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
 +					       VFIO_DEVICE_STATE_RESUMING)
 +
 +	__u32 reserved;             /* reads return zero, writes are ignored */
 +	__u64 pending_bytes;        /* read only: bytes still to be migrated */
 +	__u64 data_offset;          /* read only: offset of data in the region */
 +	__u64 data_size;            /* read/write: size in bytes of data copied */
 +};
 /*
  * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
@@ -570,6 +807,7 @@ enum {
 enum {
 	VFIO_CCW_IO_IRQ_INDEX,
 +	VFIO_CCW_CRW_IRQ_INDEX,
 	VFIO_CCW_NUM_IRQS
 };
@@ -700,6 +938,43 @@ struct vfio_device_ioeventfd {
 #define VFIO_DEVICE_IOEVENTFD		_IO(VFIO_TYPE, VFIO_BASE + 16)
 +/**
 + * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
 + *			       struct vfio_device_feature)
 + *
 + * Get, set, or probe feature data of the device.  The feature is selected
 + * using the FEATURE_MASK portion of the flags field.  Support for a feature
 + * can be probed by setting both the FEATURE_MASK and PROBE bits.  A probe
 + * may optionally include the GET and/or SET bits to determine read vs write
 + * access of the feature respectively.  Probing a feature will return success
 + * if the feature is supported and all of the optionally indicated GET/SET
 + * methods are supported.  The format of the data portion of the structure is
 + * specific to the given feature.  The data portion is not required for
 + * probing.  GET and SET are mutually exclusive, except for use with PROBE.
 + *
 + * Return 0 on success, -errno on failure.
 + */
 +struct vfio_device_feature {
 +	__u32	argsz;
 +	__u32	flags;	/* feature index (FEATURE_MASK) plus GET/SET/PROBE bits */
 +#define VFIO_DEVICE_FEATURE_MASK	(0xffff) /* 16-bit feature index */
 +#define VFIO_DEVICE_FEATURE_GET		(1 << 16) /* Get feature into data[] */
 +#define VFIO_DEVICE_FEATURE_SET		(1 << 17) /* Set feature from data[] */
 +#define VFIO_DEVICE_FEATURE_PROBE	(1 << 18) /* Probe feature support */
 +	__u8	data[];	/* feature-specific payload; not required for probing */
 +};
 +
 +#define VFIO_DEVICE_FEATURE		_IO(VFIO_TYPE, VFIO_BASE + 17)
 +
 +/*
 + * Provide support for setting a PCI VF Token, which is used as a shared
 + * secret between PF and VF drivers.  This feature may only be set on a
 + * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing
 + * open VFs.  Data provided when setting this feature is a 16-byte array
 + * (__u8 b[16]), representing a UUID.
 + */
 +#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN	(0)
 +
 /* -------- API for Type1 VFIO IOMMU -------- */
 /**
@@ -714,7 +989,54 @@ struct vfio_iommu_type1_info {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
 -	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
 +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
 +	__u64	iova_pgsizes;	/* Bitmap of supported page sizes */
 +	__u32   cap_offset;	/* Offset within info struct of first cap */
 +};
 +
 +/*
 + * The IOVA capability allows to report the valid IOVA range(s)
 + * excluding any non-relaxable reserved regions exposed by
 + * devices attached to the container. Any DMA map attempt
 + * outside the valid iova range will return error.
 + *
 + * The structures below define version 1 of this capability.
 + */
 +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
 +
 +struct vfio_iova_range {
 +	__u64	start;	/* start of one valid IOVA range */
 +	__u64	end;	/* end of that range; NOTE(review): inclusive or not is unstated */
 +};
 +
 +struct vfio_iommu_type1_info_cap_iova_range {
 +	struct	vfio_info_cap_header header;
 +	__u32	nr_iovas;	/* presumably the entry count of iova_ranges[] - TODO confirm */
 +	__u32	reserved;
 +	struct	vfio_iova_range iova_ranges[];	/* valid IOVA range(s) for DMA map */
 +};
 +
 +/*
 + * The migration capability allows to report supported features for migration.
 + *
 + * The structures below define version 1 of this capability.
 + *
 + * The existence of this capability indicates that IOMMU kernel driver supports
 + * dirty page logging.
 + *
 + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty
 + * page logging.
 + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
 + * size in bytes that can be used by user applications when getting the dirty
 + * bitmap.
 + */
 +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  2
 +
 +struct vfio_iommu_type1_info_cap_migration {
 +	struct	vfio_info_cap_header header;
 +	__u32	flags;
 +	__u64	pgsize_bitmap;	/* supported page sizes for dirty page logging */
 +	__u64	max_dirty_bitmap_size;		/* in bytes */
 };
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
@@ -737,6 +1059,12 @@ struct vfio_iommu_type1_dma_map {
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 +struct vfio_bitmap {
 +	__u64        pgsize;	/* page size for bitmap in bytes */
 +	__u64        size;	/* in bytes */
 +	__u64 *data;	/* one bit per page */
 +};
 +
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  *							struct vfio_dma_unmap)
@@ -746,12 +1074,23 @@ struct vfio_iommu_type1_dma_map {
  * field.  No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
 + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
 + * before unmapping IO virtual addresses. When this flag is set, the user must
 + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
 + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field.
 + * A bit in the bitmap represents one page, of user provided page size in
 + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set
 + * indicates that the page at that offset from iova is dirty. A Bitmap of the
 + * pages in the range of unmapped size is returned in the user-provided
 + * vfio_bitmap.data.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) /* get dirty bitmap before unmap */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 +	__u8    data[];	/* struct vfio_bitmap when GET_DIRTY_BITMAP is set */
 };
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
@@ -763,6 +1102,57 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
 +/**
 + * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
 + *                                     struct vfio_iommu_type1_dirty_bitmap)
 + * IOCTL is used for dirty pages logging.
 + * Caller should set flag depending on which operation to perform, details as
 + * below:
 + *
 + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
 + * the IOMMU driver to log pages that are dirtied or potentially dirtied by
 + * the device; designed to be used when a migration is in progress. Dirty pages
 + * are logged until logging is disabled by user application by calling the IOCTL
 + * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
 + *
 + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
 + * the IOMMU driver to stop logging dirtied pages.
 + *
 + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
 + * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
 + * The user must specify the IOVA range and the pgsize through the structure
 + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
 + * supports getting a bitmap of the smallest supported pgsize only and can be
 + * modified in future to get a bitmap of any specified supported pgsize. The
 + * user must provide a zeroed memory area for the bitmap memory and specify its
 + * size in bitmap.size. One bit is used to represent one page consecutively
 + * starting from iova offset. The user should provide page size in bitmap.pgsize
 + * field. A bit set in the bitmap indicates that the page at that offset from
 + * iova is dirty. The caller must set argsz to a value including the size of
 + * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the
 + * actual bitmap. If dirty pages logging is not enabled, an error will be
 + * returned.
 + *
 + * Only one of the flags _START, _STOP and _GET may be specified at a time.
 + *
 + */
 +struct vfio_iommu_type1_dirty_bitmap {
 +	__u32        argsz;	/* for _GET_BITMAP: includes the _get struct, not the bitmap */
 +	__u32        flags;	/* only one of _START, _STOP, _GET_BITMAP at a time */
 +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START	(1 << 0)
 +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP	(1 << 1)
 +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP	(1 << 2)
 +	__u8         data[];	/* vfio_iommu_type1_dirty_bitmap_get for _GET_BITMAP */
 +};
 +
 +struct vfio_iommu_type1_dirty_bitmap_get {
 +	__u64              iova;	/* IO virtual address */
 +	__u64              size;	/* Size of iova range */
 +	struct vfio_bitmap bitmap;	/* user-zeroed bitmap, one bit per page from iova */
 +};
 +
 +#define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
 +
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 /*
 -- 
 2.27.0
--- a/memory-Set-DIRTY_MEMORY_MIGRATION-when-IOMMU-is-enab.patch
+++ b/memory-Set-DIRTY_MEMORY_MIGRATION-when-IOMMU-is-enab.patch
@ -0,0 +1,35 @@
 From 0ae8b3e05294fee99870efa9b58e22e16f31caf9 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:20 +0530
 Subject: [PATCH] memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled
 mr->ram_block is NULL when mr->is_iommu is true, then fr.dirty_log_mask
 wasn't set correctly due to which memory listener's log_sync doesn't
 get called.
 This patch returns log_mask with DIRTY_MEMORY_MIGRATION set when
 IOMMU is enabled.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
 Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/memory.c b/memory.c
 index 5d8c9a9234..44713efc66 100644
 --- a/memory.c
 +++ b/memory.c
@@ -1825,7 +1825,7 @@ bool memory_region_is_ram_device(MemoryRegion *mr)
 uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
 {
     uint8_t mask = mr->dirty_log_mask;
 -    if (global_dirty_log && mr->ram_block) {
 +    if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) {
         mask |= (1 << DIRTY_MEMORY_MIGRATION);
     }
     return mask;
 -- 
 2.27.0
--- a/migration-register_savevm_live-doesn-t-need-dev.patch
+++ b/migration-register_savevm_live-doesn-t-need-dev.patch
@ -0,0 +1,201 @@
 From 0f7cde69416f85ec3d3f57769ae38db3d72fda8c Mon Sep 17 00:00:00 2001
 From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
 Date: Thu, 22 Aug 2019 12:54:33 +0100
 Subject: [PATCH] migration: register_savevm_live doesn't need dev
 Commit 78dd48df3 removed the last caller of register_savevm_live for an
 instantiable device (rather than a single system wide device);
 so trim out the parameter.
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Message-Id: <20190822115433.12070-1-dgilbert@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 ---
 docs/devel/migration.rst       |  3 +--
 hw/ppc/spapr.c                 |  2 +-
 hw/s390x/s390-skeys.c          |  2 +-
 hw/s390x/s390-stattrib.c       |  2 +-
 hw/s390x/tod.c                 |  2 +-
 include/migration/register.h   |  3 +--
 migration/block-dirty-bitmap.c |  2 +-
 migration/block.c              |  2 +-
 migration/ram.c                |  2 +-
 migration/savevm.c             | 23 +----------------------
 net/slirp.c                    |  2 +-
 11 files changed, 11 insertions(+), 34 deletions(-)
 diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst
 index 220059679a..cc6f839fce 100644
 --- a/docs/devel/migration.rst
 +++ b/docs/devel/migration.rst
@@ -183,8 +183,7 @@ another to load the state back.
 .. code:: c
 -  int register_savevm_live(DeviceState *dev,
 -                           const char *idstr,
 +  int register_savevm_live(const char *idstr,
                            int instance_id,
                            int version_id,
                            SaveVMHandlers *ops,
 diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
 index b0f37c34a4..289967c3de 100644
 --- a/hw/ppc/spapr.c
 +++ b/hw/ppc/spapr.c
@@ -3069,7 +3069,7 @@ static void spapr_machine_init(MachineState *machine)
      * interface, this is a legacy from the sPAPREnvironment structure
      * which predated MachineState but had a similar function */
     vmstate_register(NULL, 0, &vmstate_spapr, spapr);
 -    register_savevm_live(NULL, "spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
 +    register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
                          &savevm_htab_handlers, spapr);
     qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine),
 diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c
 index e5bd92c0c7..fb7d57865d 100644
 --- a/hw/s390x/s390-skeys.c
 +++ b/hw/s390x/s390-skeys.c
@@ -388,7 +388,7 @@ static inline void s390_skeys_set_migration_enabled(Object *obj, bool value,
     ss->migration_enabled = value;
     if (ss->migration_enabled) {
 -        register_savevm_live(NULL, TYPE_S390_SKEYS, 0, 1,
 +        register_savevm_live(TYPE_S390_SKEYS, 0, 1,
                              &savevm_s390_storage_keys, ss);
     } else {
         unregister_savevm(DEVICE(ss), TYPE_S390_SKEYS, ss);
 diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
 index 766f2015a4..5ee15d5e82 100644
 --- a/hw/s390x/s390-stattrib.c
 +++ b/hw/s390x/s390-stattrib.c
@@ -382,7 +382,7 @@ static void s390_stattrib_instance_init(Object *obj)
 {
     S390StAttribState *sas = S390_STATTRIB(obj);
 -    register_savevm_live(NULL, TYPE_S390_STATTRIB, 0, 0,
 +    register_savevm_live(TYPE_S390_STATTRIB, 0, 0,
                          &savevm_s390_stattrib_handlers, sas);
     object_property_add_bool(obj, "migration-enabled",
 diff --git a/hw/s390x/tod.c b/hw/s390x/tod.c
 index a9fca8eb0b..d6b22bb966 100644
 --- a/hw/s390x/tod.c
 +++ b/hw/s390x/tod.c
@@ -100,7 +100,7 @@ static void s390_tod_realize(DeviceState *dev, Error **errp)
     S390TODState *td = S390_TOD(dev);
     /* Legacy migration interface */
 -    register_savevm_live(NULL, "todclock", 0, 1, &savevm_tod, td);
 +    register_savevm_live("todclock", 0, 1, &savevm_tod, td);
 }
 static void s390_tod_class_init(ObjectClass *oc, void *data)
 diff --git a/include/migration/register.h b/include/migration/register.h
 index 8b2bc5b129..f3ba10b6ef 100644
 --- a/include/migration/register.h
 +++ b/include/migration/register.h
@@ -68,8 +68,7 @@ typedef struct SaveVMHandlers {
     int (*resume_prepare)(MigrationState *s, void *opaque);
 } SaveVMHandlers;
 -int register_savevm_live(DeviceState *dev,
 -                         const char *idstr,
 +int register_savevm_live(const char *idstr,
                          uint32_t instance_id,
                          int version_id,
                          const SaveVMHandlers *ops,
 diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
 index 4a896a09eb..11e8feb595 100644
 --- a/migration/block-dirty-bitmap.c
 +++ b/migration/block-dirty-bitmap.c
@@ -733,7 +733,7 @@ void dirty_bitmap_mig_init(void)
 {
     QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
 -    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
 +    register_savevm_live("dirty-bitmap", 0, 1,
                          &savevm_dirty_bitmap_handlers,
                          &dirty_bitmap_mig_state);
 }
 diff --git a/migration/block.c b/migration/block.c
 index 91f98ef44a..ec15d1d6b3 100644
 --- a/migration/block.c
 +++ b/migration/block.c
@@ -1030,6 +1030,6 @@ void blk_mig_init(void)
     QSIMPLEQ_INIT(&block_mig_state.blk_list);
     qemu_mutex_init(&block_mig_state.lock);
 -    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
 +    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                          &block_mig_state);
 }
 diff --git a/migration/ram.c b/migration/ram.c
 index d6657a8093..2077ba5be4 100644
 --- a/migration/ram.c
 +++ b/migration/ram.c
@@ -5125,5 +5125,5 @@ static SaveVMHandlers savevm_ram_handlers = {
 void ram_mig_init(void)
 {
     qemu_mutex_init(&XBZRLE.lock);
 -    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
 +    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
 }
 diff --git a/migration/savevm.c b/migration/savevm.c
 index f0974380e5..cdb79222a4 100644
 --- a/migration/savevm.c
 +++ b/migration/savevm.c
@@ -683,8 +683,7 @@ static void savevm_state_handler_insert(SaveStateEntry *nse)
    of the system, so instance_id should be removed/replaced.
    Meanwhile pass -1 as instance_id if you do not already have a clearly
    distinguishing id for all instances of your device class. */
 -int register_savevm_live(DeviceState *dev,
 -                         const char *idstr,
 +int register_savevm_live(const char *idstr,
                          uint32_t instance_id,
                          int version_id,
                          const SaveVMHandlers *ops,
@@ -703,26 +702,6 @@ int register_savevm_live(DeviceState *dev,
         se->is_ram = 1;
     }
 -    if (dev) {
 -        char *id = qdev_get_dev_path(dev);
 -        if (id) {
 -            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 -                sizeof(se->idstr)) {
 -                error_report("Path too long for VMState (%s)", id);
 -                g_free(id);
 -                g_free(se);
 -
 -                return -1;
 -            }
 -            g_free(id);
 -
 -            se->compat = g_new0(CompatEntry, 1);
 -            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
 -            se->compat->instance_id = instance_id == -1 ?
 -                         calculate_compat_instance_id(idstr) : instance_id;
 -            instance_id = -1;
 -        }
 -    }
     pstrcat(se->idstr, sizeof(se->idstr), idstr);
     if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 diff --git a/net/slirp.c b/net/slirp.c
 index b34cb29276..f42f496641 100644
 --- a/net/slirp.c
 +++ b/net/slirp.c
@@ -576,7 +576,7 @@ static int net_slirp_init(NetClientState *peer, const char *model,
      * specific version?
      */
     g_assert(slirp_state_version() == 4);
 -    register_savevm_live(NULL, "slirp", 0, slirp_state_version(),
 +    register_savevm_live("slirp", 0, slirp_state_version(),
                          &savevm_slirp_state, s->slirp);
     s->poll_notifier.notify = net_slirp_poll_notify;
 -- 
 2.27.0
--- a/qapi-Add-VFIO-devices-migration-stats-in-Migration-s.patch
+++ b/qapi-Add-VFIO-devices-migration-stats-in-Migration-s.patch
@ -0,0 +1,214 @@
 From f97eaa27e2fb6b985f090af9acaa780bb6a2ee5b Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:27 +0530
 Subject: [PATCH] qapi: Add VFIO devices migration stats in Migration stats
 Added amount of bytes transferred to the VM at destination by all VFIO
 devices
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c              | 19 +++++++++++++++++++
 hw/vfio/migration.c           |  9 +++++++++
 include/hw/vfio/vfio-common.h |  3 +++
 migration/migration.c         | 17 +++++++++++++++++
 monitor/hmp-cmds.c            |  6 ++++++
 qapi/migration.json           | 17 +++++++++++++++++
 6 files changed, 71 insertions(+)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 4ce1c10734..a86a4c4506 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -291,6 +291,25 @@ const MemoryRegionOps vfio_region_ops = {
  * Device state interfaces
  */
 +bool vfio_mig_active(void)
 +{
 +    VFIOGroup *group;
 +    VFIODevice *vbasedev;
 +
 +    if (QLIST_EMPTY(&vfio_group_list)) {
 +        return false;
 +    }
 +    /* True only if groups exist and no device has a migration blocker. */
 +    QLIST_FOREACH(group, &vfio_group_list, next) {
 +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
 +            if (vbasedev->migration_blocker) {
 +                return false;
 +            }
 +        }
 +    }
 +    return true;
 +}
 +
 static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container)
 {
     VFIOGroup *group;
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index 0bdf6a1820..b77c66557e 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -45,6 +45,8 @@
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
 +static int64_t bytes_transferred;
 +
 static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
                                   off_t off, bool iswrite)
 {
@@ -255,6 +257,7 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
         *size = data_size;
     }
 +    bytes_transferred += data_size;
     return ret;
 }
@@ -785,6 +788,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
     case MIGRATION_STATUS_CANCELLING:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_FAILED:
 +        bytes_transferred = 0;
         ret = vfio_migration_set_state(vbasedev,
                       ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING),
                       VFIO_DEVICE_STATE_RUNNING);
@@ -866,6 +870,11 @@ err:
 /* ---------------------------------------------------------------------- */
 +int64_t vfio_mig_bytes_transferred(void)
 +{
 +    return bytes_transferred; /* running total kept by vfio_save_buffer() */
 +}
 +
 int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
 {
     VFIOContainer *container = vbasedev->group->container;
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 8fd0212264..048731e81f 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -203,6 +203,9 @@ extern const MemoryRegionOps vfio_region_ops;
 typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
 extern VFIOGroupList vfio_group_list;
 +bool vfio_mig_active(void);
 +int64_t vfio_mig_bytes_transferred(void);
 +
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
                          struct vfio_region_info **info);
 diff --git a/migration/migration.c b/migration/migration.c
 index b0b9430822..9faf5f63a6 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -49,6 +49,10 @@
 #include "monitor/monitor.h"
 #include "net/announce.h"
 +#ifdef CONFIG_VFIO
 +#include "hw/vfio/vfio-common.h"
 +#endif
 +
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 /* Amount of time to allocate to each "chunk" of bandwidth-throttled
@@ -908,6 +912,17 @@ static void populate_disk_info(MigrationInfo *info)
     }
 }
 +static void populate_vfio_info(MigrationInfo *info)
 +{
 +#ifdef CONFIG_VFIO
 +    if (vfio_mig_active()) { /* report stats only when VFIO is active */
 +        info->has_vfio = true; /* flags the optional 'vfio' member as set */
 +        info->vfio = g_malloc0(sizeof(*info->vfio));
 +        info->vfio->transferred = vfio_mig_bytes_transferred();
 +    }
 +#endif /* CONFIG_VFIO */
 +}
 +
 static void fill_source_migration_info(MigrationInfo *info)
 {
     MigrationState *s = migrate_get_current();
@@ -941,6 +956,7 @@ static void fill_source_migration_info(MigrationInfo *info)
         populate_ram_info(info, s);
         populate_disk_info(info);
 +        populate_vfio_info(info);
         break;
     case MIGRATION_STATUS_COLO:
         info->has_status = true;
@@ -956,6 +972,7 @@ static void fill_source_migration_info(MigrationInfo *info)
         info->setup_time = s->setup_time;
         populate_ram_info(info, s);
 +        populate_vfio_info(info);
         break;
     case MIGRATION_STATUS_FAILED:
         info->has_status = true;
 diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
 index e5a7a88ba2..cecaae0a47 100644
 --- a/monitor/hmp-cmds.c
 +++ b/monitor/hmp-cmds.c
@@ -370,6 +370,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
         }
         monitor_printf(mon, "]\n");
     }
 +
 +    if (info->has_vfio) {
 +        monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n",
 +                       info->vfio->transferred >> 10);
 +    }
 +
     qapi_free_MigrationInfo(info);
     qapi_free_MigrationCapabilityStatusList(caps);
 }
 diff --git a/qapi/migration.json b/qapi/migration.json
 index 587ef65872..1f0eb19ac6 100644
 --- a/qapi/migration.json
 +++ b/qapi/migration.json
@@ -141,6 +141,18 @@
             'active', 'postcopy-active', 'postcopy-paused',
             'postcopy-recover', 'completed', 'failed', 'colo',
             'pre-switchover', 'device' ] }
 +##
 +# @VfioStats:
 +#
 +# Detailed VFIO devices migration statistics
 +#
 +# @transferred: amount of bytes transferred to the target VM by VFIO devices
 +#
 +# Since: 5.2
 +#
 +##
 +{ 'struct': 'VfioStats',
 +  'data': {'transferred': 'int' } }
 ##
 # @MigrationInfo:
@@ -202,11 +214,16 @@
 #
 # @socket-address: Only used for tcp, to know what the real port is (Since 4.0)
 #
 +# @vfio: @VfioStats containing detailed VFIO devices migration statistics,
 +#        only returned if VFIO device is present, migration is supported by all
 +#        VFIO devices and status is 'active' or 'completed' (since 5.2)
 +#
 # Since: 0.14.0
 ##
 { 'struct': 'MigrationInfo',
   'data': {'*status': 'MigrationStatus', '*ram': 'MigrationStats',
            '*disk': 'MigrationStats',
 +           '*vfio': 'VfioStats',
            '*xbzrle-cache': 'XBZRLECacheStats',
            '*total-time': 'int',
            '*expected-downtime': 'int',
 -- 
 2.27.0
--- a/qemu.spec
+++ b/qemu.spec
@ -1,6 +1,6 @@
 Name: qemu
 Version: 4.1.0
-Release: 72
+Release: 73
 Epoch: 2
 Summary: QEMU is a generic and open source machine emulator and virtualizer
 License: GPLv2 and BSD and MIT and CC-BY-SA-4.0
@ -476,6 +476,28 @@ Patch0463: virtio-input-fix-memory-leak-on-unrealize.patch
 Patch0464: target-arm-only-set-ID_PFR1_EL1.GIC-for-AArch32-gues.patch
 Patch0465: target-arm-clear-EL2-and-EL3-only-when-kvm-is-not-en.patch
 Patch0466: target-arm-Update-the-ID-registers-of-Kunpeng-920.patch
 Patch0467: hw-net-fix-vmxnet3-live-migration.patch
 Patch0468: include-Make-headers-more-self-contained.patch
 Patch0469: migration-register_savevm_live-doesn-t-need-dev.patch
 Patch0470: vmstate-add-qom-interface-to-get-id.patch
 Patch0471: linux-headers-Update-against-Add-migration-support-f.patch
 Patch0472: vfio-Add-function-to-unmap-VFIO-region.patch
 Patch0473: vfio-Add-vfio_get_object-callback-to-VFIODeviceOps.patch
 Patch0474: vfio-Add-save-and-load-functions-for-VFIO-PCI-device.patch
 Patch0475: vfio-Add-migration-region-initialization-and-finaliz.patch
 Patch0476: vfio-Add-VM-state-change-handler-to-know-state-of-VM.patch
 Patch0477: vfio-Add-migration-state-change-notifier.patch
 Patch0478: vfio-Register-SaveVMHandlers-for-VFIO-device.patch
 Patch0479: vfio-Add-save-state-functions-to-SaveVMHandlers.patch
 Patch0480: vfio-Add-load-state-functions-to-SaveVMHandlers.patch
 Patch0481: memory-Set-DIRTY_MEMORY_MIGRATION-when-IOMMU-is-enab.patch
 Patch0482: vfio-Get-migration-capability-flags-for-container.patch
 Patch0483: vfio-Add-function-to-start-and-stop-dirty-pages-trac.patch
 Patch0484: vfio-Add-vfio_listener_log_sync-to-mark-dirty-pages.patch
 Patch0485: vfio-Dirty-page-tracking-when-vIOMMU-is-enabled.patch
 Patch0486: vfio-Add-ioctl-to-get-dirty-pages-bitmap-during-dma-.patch
 Patch0487: vfio-Make-vfio-pci-device-migration-capable.patch
 Patch0488: qapi-Add-VFIO-devices-migration-stats-in-Migration-s.patch
 BuildRequires: flex
 BuildRequires: gcc
@ -870,6 +892,30 @@ getent passwd qemu >/dev/null || \
 %endif
 %changelog
 * Thu Jul 29 2021 imxcc <xingchaochao@huawei.com>
 - hw/net: fix vmxnet3 live migration
 - include: Make headers more self-contained
 - migration: register_savevm_live doesn't need dev
 - vmstate: add qom interface to get id
 - linux headers: Update against "Add migration support for VFIO devices"
 - vfio: Add function to unmap VFIO region
 - vfio: Add vfio_get_object callback to VFIODeviceOps
 - vfio: Add save and load functions for VFIO PCI devices
 - vfio: Add migration region initialization and finalize function
 - vfio: Add VM state change handler to know state of VM
 - vfio: Add migration state change notifier
 - vfio: Register SaveVMHandlers for VFIO device
 - vfio: Add save state functions to SaveVMHandlers
 - vfio: Add load state functions to SaveVMHandlers
 - memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled
 - vfio: Get migration capability flags for container
 - vfio: Add function to start and stop dirty pages tracking
 - vfio: Add vfio_listener_log_sync to mark dirty pages
 - vfio: Dirty page tracking when vIOMMU is enabled
 - vfio: Add ioctl to get dirty pages bitmap during dma unmap
 - vfio: Make vfio-pci device migration capable
 - qapi: Add VFIO devices migration stats in Migration stats
 * Wed Jul 28 2021 imxcc <xingchaochao@huawei.com>
 - object: return self in object_ref()
 - file-posix: Fix leaked fd in raw_open_common() error path
--- a/vfio-Add-VM-state-change-handler-to-know-state-of-VM.patch
+++ b/vfio-Add-VM-state-change-handler-to-know-state-of-VM.patch
@ -0,0 +1,258 @@
 From 3a875293ae00266e1c82a5c382066efc4acc64ce Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:15 +0530
 Subject: [PATCH] vfio: Add VM state change handler to know state of VM
 VM state change handler is called on change in VM's state. Based on
 VM state, VFIO device state should be changed.
 Added read/write helper functions for migration region.
 Added function to set device_state.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 [aw: lx -> HWADDR_PRIx, remove redundant parens]
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 Signed-off-by: Shenming Lu <lushenming@huawei.com>
 ---
 hw/vfio/migration.c           | 160 ++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   2 +
 include/hw/vfio/vfio-common.h |   4 +
 3 files changed, 166 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index fd7faf423c..ca82c78536 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -10,6 +10,7 @@
 #include "qemu/osdep.h"
 #include <linux/vfio.h>
 +#include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-common.h"
 #include "cpu.h"
 #include "migration/migration.h"
@@ -22,6 +23,157 @@
 #include "exec/ram_addr.h"
 #include "pci.h"
 #include "trace.h"
 +#include "hw/hw.h"
 +
 +/* Single pread()/pwrite() of @count bytes at @off; 0 or negative errno. */
 +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
 +                                  off_t off, bool iswrite)
 +{
 +    int ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
 +                        pread(vbasedev->fd, val, count, off);
 +    if (ret < count) {
 +        int err = (ret < 0) ? errno : EINVAL; /* short I/O sets no errno */
 +        error_report("vfio_mig_%s %d byte %s: failed at offset 0x%"
 +                     HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count,
 +                     vbasedev->name, off, strerror(err));
 +        return -err;
 +    }
 +    return 0;
 +}
 +
 +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
 +                       off_t off, bool iswrite)
 +{
 +    int ret, done = 0;
 +    __u8 *tbuf = buf;
 +    /* Moves @count bytes at @off piecewise; returns bytes done or an error. */
 +    while (count) {
 +        int bytes = 0;
 +        /* Pick the largest of 8/4/2/1 that fits @count and divides @off. */
 +        if (count >= 8 && !(off % 8)) {
 +            bytes = 8;
 +        } else if (count >= 4 && !(off % 4)) {
 +            bytes = 4;
 +        } else if (count >= 2 && !(off % 2)) {
 +            bytes = 2;
 +        } else {
 +            bytes = 1;
 +        }
 +
 +        ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
 +        if (ret) {
 +            return ret; /* first failing chunk aborts the whole transfer */
 +        }
 +
 +        count -= bytes;
 +        done += bytes;
 +        off += bytes;
 +        tbuf += bytes;
 +    }
 +    return done;
 +}
 +
 +#define vfio_mig_read(f, v, c, o)       vfio_mig_rw(f, (__u8 *)v, c, o, false)
 +#define vfio_mig_write(f, v, c, o)      vfio_mig_rw(f, (__u8 *)v, c, o, true)
 +
 +#define VFIO_MIG_STRUCT_OFFSET(f)       \
 +                                 offsetof(struct vfio_device_migration_info, f)
 +/*
 + * Change the device_state register for device @vbasedev. Bits set in @mask
 + * are preserved, bits set in @value are set, and bits not set in either @mask
 + * or @value are cleared in device_state. If the register cannot be accessed,
 + * the resulting state would be invalid, or the device enters an error state,
 + * an error is returned.
 + * Returns 0 on success (caching the new state), a negative value on failure.
 + */
 +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
 +                                    uint32_t value)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +    VFIORegion *region = &migration->region;
 +    off_t dev_state_off = region->fd_offset +
 +                          VFIO_MIG_STRUCT_OFFSET(device_state);
 +    uint32_t device_state;
 +    int ret;
 +
 +    ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
 +                        dev_state_off);
 +    if (ret < 0) {
 +        return ret;
 +    }
 +
 +    device_state = (device_state & mask) | value;
 +
 +    if (!VFIO_DEVICE_STATE_VALID(device_state)) {
 +        return -EINVAL;
 +    }
 +
 +    ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state),
 +                         dev_state_off);
 +    if (ret < 0) {
 +        int rret;
 +
 +        rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state),
 +                             dev_state_off);
 +        /* Re-read the state; rret is a byte count, negative on failure. */
 +        if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) {
 +            hw_error("%s: Device in error state 0x%x", vbasedev->name,
 +                     device_state);
 +            return rret < 0 ? rret : -EIO;
 +        }
 +        return ret;
 +    }
 +
 +    migration->device_state = device_state;
 +    trace_vfio_migration_set_state(vbasedev->name, device_state);
 +    return 0;
 +}
 +
 +/* VM change-state handler: set or clear the device's _RUNNING state bit. */
 +static void vfio_vmstate_change(void *opaque, int running, RunState state)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    uint32_t value, mask;
 +    int ret;
 +
 +    if (vbasedev->migration->vm_running == running) {
 +        return;
 +    }
 +
 +    if (running) {
 +        /*
 +         * Here device state can have one of _SAVING, _RESUMING or _STOP bit.
 +         * Transition from _SAVING to _RUNNING can happen if there is migration
 +         * failure, in that case clear _SAVING bit.
 +         * Transition from _RESUMING to _RUNNING occurs during resuming
 +         * phase, in that case clear _RESUMING bit.
 +         * In both the above cases, set _RUNNING bit.
 +         */
 +        mask = ~VFIO_DEVICE_STATE_MASK;
 +        value = VFIO_DEVICE_STATE_RUNNING;
 +    } else {
 +        /* State is _RUNNING or _SAVING|_RUNNING here; reset _RUNNING bit. */
 +        mask = ~VFIO_DEVICE_STATE_RUNNING;
 +        value = 0;
 +    }
 +
 +    ret = vfio_migration_set_state(vbasedev, mask, value);
 +    if (ret) {
 +        /*
 +         * Migration should be aborted in this case, but vm_state_notify()
 +         * cannot report failures; flag the outgoing stream if one exists.
 +         */
 +        error_report("%s: Failed to set device state 0x%x", vbasedev->name,
 +                     (migration->device_state & mask) | value);
 +        if (migrate_get_current()->to_dst_file) {
 +            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
 +        }
 +    }
 +    vbasedev->migration->vm_running = running;
 +    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
 +            (migration->device_state & mask) | value);
 +}
 static void vfio_migration_exit(VFIODevice *vbasedev)
 {
@@ -38,6 +190,7 @@ static int vfio_migration_init(VFIODevice *vbasedev,
 {
     int ret;
     Object *obj;
 +    VFIOMigration *migration;
     if (!vbasedev->ops->vfio_get_object) {
         return -EINVAL;
@@ -64,6 +217,10 @@ static int vfio_migration_init(VFIODevice *vbasedev,
         ret = -EINVAL;
         goto err;
     }
 +
 +    migration = vbasedev->migration;
 +    migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
 +                                                           vbasedev);
     return 0;
 err:
@@ -111,6 +268,9 @@ add_blocker:
 void vfio_migration_finalize(VFIODevice *vbasedev)
 {
     if (vbasedev->migration) {
 +        VFIOMigration *migration = vbasedev->migration;
 +
 +        qemu_del_vm_change_state_handler(migration->vm_state);
         vfio_migration_exit(vbasedev);
     }
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index fd034ac536..1626862315 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -146,3 +146,5 @@ vfio_display_edid_write_error(void) ""
 # migration.c
 vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
 +vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
 +vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index e0482c2bac..533d6737ac 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -29,6 +29,7 @@
 #ifdef CONFIG_LINUX
 #include <linux/vfio.h>
 #endif
 +#include "sysemu/sysemu.h"
 #define VFIO_MSG_PREFIX "vfio %s: "
@@ -58,7 +59,10 @@ typedef struct VFIORegion {
 } VFIORegion;
 typedef struct VFIOMigration {
 +    VMChangeStateEntry *vm_state;
     VFIORegion region;
 +    uint32_t device_state;
 +    int vm_running;
 } VFIOMigration;
 typedef struct VFIOAddressSpace {
 -- 
 2.27.0
--- a/vfio-Add-function-to-start-and-stop-dirty-pages-trac.patch
+++ b/vfio-Add-function-to-start-and-stop-dirty-pages-trac.patch
@ -0,0 +1,83 @@
 From 4363ea5cded9c6d2838a9564b067f583a6ef077f Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:22 +0530
 Subject: [PATCH] vfio: Add function to start and stop dirty pages tracking
 Call VFIO_IOMMU_DIRTY_PAGES ioctl to start and stop dirty pages tracking
 for VFIO devices.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/migration.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index 0d2bd9e5cd..0bdf6a1820 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -11,6 +11,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/cutils.h"
 #include <linux/vfio.h>
 +#include <sys/ioctl.h>
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-common.h"
@@ -391,10 +392,40 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
     return qemu_file_get_error(f);
 }
 +/* Start or stop dirty page tracking on @vbasedev's IOMMU container. */
 +static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +    VFIOContainer *container = vbasedev->group->container;
 +    struct vfio_iommu_type1_dirty_bitmap dirty = {
 +        .argsz = sizeof(dirty),
 +    };
 +
 +    if (start) {
 +        /* Tracking is only started while the device is in _SAVING state. */
 +        if (!(migration->device_state & VFIO_DEVICE_STATE_SAVING)) {
 +            return -EINVAL;
 +        }
 +        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
 +    } else {
 +        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
 +    }
 +
 +    if (ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty)) {
 +        int err = errno; /* save first: error_report() may clobber errno */
 +        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
 +                     dirty.flags, err);
 +        return -err;
 +    }
 +    return 0;
 +}
 +
 static void vfio_migration_cleanup(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
 +    vfio_set_dirty_page_tracking(vbasedev, false);
 +
     if (migration->region.mmaps) {
         vfio_region_unmap(&migration->region);
     }
@@ -435,6 +466,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
         return ret;
     }
 +    ret = vfio_set_dirty_page_tracking(vbasedev, true);
 +    if (ret) {
 +        return ret;
 +    }
 +
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
     ret = qemu_file_get_error(f);
 -- 
 2.27.0
--- a/vfio-Add-function-to-unmap-VFIO-region.patch
+++ b/vfio-Add-function-to-unmap-VFIO-region.patch
@ -0,0 +1,103 @@
 From 68cc2be61588d14de2313342ee87eb0bb2b990e0 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:11 +0530
 Subject: [PATCH] vfio: Add function to unmap VFIO region
 This function will be used for migration region.
 Migration region is mmaped when migration starts and will be unmapped when
 migration is complete.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c              | 32 ++++++++++++++++++++++++++++----
 hw/vfio/trace-events          |  1 +
 include/hw/vfio/vfio-common.h |  1 +
 3 files changed, 30 insertions(+), 4 deletions(-)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index a859298fda..4c32b1bb99 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -906,6 +906,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
     return 0;
 }
 +static void vfio_subregion_unmap(VFIORegion *region, int index)
 +{ /* Tears down the mapping held in slot mmaps[@index] of @region. */
 +    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
 +                            region->mmaps[index].offset,
 +                            region->mmaps[index].offset +
 +                            region->mmaps[index].size - 1); /* inclusive end */
 +    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
 +    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
 +    object_unparent(OBJECT(&region->mmaps[index].mem));
 +    region->mmaps[index].mmap = NULL; /* mark the slot as unmapped */
 +}
 +
 int vfio_region_mmap(VFIORegion *region)
 {
     int i, prot = 0;
@@ -936,10 +948,7 @@ int vfio_region_mmap(VFIORegion *region)
             region->mmaps[i].mmap = NULL;
             for (i--; i >= 0; i--) {
 -                memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
 -                munmap(region->mmaps[i].mmap, region->mmaps[i].size);
 -                object_unparent(OBJECT(&region->mmaps[i].mem));
 -                region->mmaps[i].mmap = NULL;
 +                vfio_subregion_unmap(region, i);
             }
             return ret;
@@ -964,6 +973,21 @@ int vfio_region_mmap(VFIORegion *region)
     return 0;
 }
 +void vfio_region_unmap(VFIORegion *region)
 +{
 +    int i;
 +
 +    if (!region->mem) { /* no region->mem: nothing to unmap */
 +        return;
 +    }
 +    /* Unmap every sub-region whose mmap pointer is still set. */
 +    for (i = 0; i < region->nr_mmaps; i++) {
 +        if (region->mmaps[i].mmap) {
 +            vfio_subregion_unmap(region, i);
 +        }
 +    }
 +}
 +
 void vfio_region_exit(VFIORegion *region)
 {
     int i;
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index b1ef55a33f..8cdc27946c 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -111,6 +111,7 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg
 vfio_region_exit(const char *name, int index) "Device %s, region %d"
 vfio_region_finalize(const char *name, int index) "Device %s, region %d"
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
 +vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
 vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 9107bd41c0..93493891ba 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -171,6 +171,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                       int index, const char *name);
 int vfio_region_mmap(VFIORegion *region);
 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
 +void vfio_region_unmap(VFIORegion *region);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 -- 
 2.27.0
--- a/vfio-Add-ioctl-to-get-dirty-pages-bitmap-during-dma-.patch
+++ b/vfio-Add-ioctl-to-get-dirty-pages-bitmap-during-dma-.patch
@ -0,0 +1,162 @@
 From 1333031bd3b488ed4904a61fd292cd5aa93f8c5b Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:25 +0530
 Subject: [PATCH] vfio: Add ioctl to get dirty pages bitmap during dma unmap
 With vIOMMU, IO virtual address range can get unmapped while in pre-copy
 phase of migration. In that case, unmap ioctl should return pages pinned
 in that range and QEMU should find its corresponding guest physical
 addresses and report those dirty.
 Suggested-by: Alex Williamson <alex.williamson@redhat.com>
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast]
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 4 deletions(-)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 8773b998ac..4ce1c10734 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -320,11 +320,95 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container)
     return true;
 }
 +static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
 +{
 +    VFIOGroup *group;
 +    VFIODevice *vbasedev;
 +    MigrationState *ms = migrate_get_current();
 +
 +    if (!migration_is_setup_or_active(ms->state)) {
 +        return false;
 +    }
 +
 +    QLIST_FOREACH(group, &container->group_list, container_next) {
 +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
 +            VFIOMigration *migration = vbasedev->migration;
 +
 +            if (!migration) {
 +                return false;
 +            }
 +
 +            if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
 +                (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
 +                continue;
 +            } else {
 +                return false;
 +            }
 +        }
 +    }
 +    return true;
 +}
 +
 +static int vfio_dma_unmap_bitmap(VFIOContainer *container,
 +                                 hwaddr iova, ram_addr_t size,
 +                                 IOMMUTLBEntry *iotlb)
 +{
 +    struct vfio_iommu_type1_dma_unmap *unmap;
 +    struct vfio_bitmap *bitmap;
 +    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
 +    int ret;
 +
 +    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
 +
 +    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
 +    unmap->iova = iova;
 +    unmap->size = size;
 +    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
 +    bitmap = (struct vfio_bitmap *)&unmap->data;
 +
 +    /*
 +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
 +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
 +     * TARGET_PAGE_SIZE.
 +     */
 +
 +    bitmap->pgsize = TARGET_PAGE_SIZE;
 +    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
 +                   BITS_PER_BYTE;
 +
 +    if (bitmap->size > container->max_dirty_bitmap_size) {
 +        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
 +                     (uint64_t)bitmap->size);
 +        ret = -E2BIG;
 +        goto unmap_exit;
 +    }
 +
 +    bitmap->data = g_try_malloc0(bitmap->size);
 +    if (!bitmap->data) {
 +        ret = -ENOMEM;
 +        goto unmap_exit;
 +    }
 +
 +    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
 +    if (!ret) {
 +        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
 +                iotlb->translated_addr, pages);
 +    } else {
 +        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
 +    }
 +
 +    g_free(bitmap->data);
 +unmap_exit:
 +    g_free(unmap);
 +    return ret;
 +}
 +
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
 static int vfio_dma_unmap(VFIOContainer *container,
 -                          hwaddr iova, ram_addr_t size)
 +                          hwaddr iova, ram_addr_t size,
 +                          IOMMUTLBEntry *iotlb)
 {
     struct vfio_iommu_type1_dma_unmap unmap = {
         .argsz = sizeof(unmap),
@@ -333,6 +417,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
         .size = size,
     };
 +    if (iotlb && container->dirty_pages_supported &&
 +        vfio_devices_all_running_and_saving(container)) {
 +        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
 +    }
 +
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -380,7 +469,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
      * the VGA ROM space.
      */
     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
 -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
 +        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
         return 0;
     }
@@ -530,7 +619,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
                          iotlb->addr_mask + 1, vaddr, ret);
         }
     } else {
 -        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
 +        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx") = %d (%m)",
@@ -816,7 +905,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
     }
     if (try_unmap) {
 -        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
 +        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx") = %d (%m)",
 -- 
 2.27.0
--- a/vfio-Add-load-state-functions-to-SaveVMHandlers.patch
+++ b/vfio-Add-load-state-functions-to-SaveVMHandlers.patch
@ -0,0 +1,266 @@
 From ddef5d5257987f2f415ce41fdc482feda61aa796 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:19 +0530
 Subject: [PATCH] vfio: Add load state functions to SaveVMHandlers
 Sequence  during _RESUMING device state:
 While data for this device is available, repeat below steps:
 a. read data_offset from where user application should write data.
 b. write data of data_size to migration region from data_offset.
 c. write data_size which indicates vendor driver that data is written in
   staging buffer.
 For user, data is opaque. User should write data in the same order as
 received.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/migration.c  | 195 +++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events |   4 +
 2 files changed, 199 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index f78a77e1e3..954c064435 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -257,6 +257,77 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
     return ret;
 }
 +static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
 +                            uint64_t data_size)
 +{
 +    VFIORegion *region = &vbasedev->migration->region;
 +    uint64_t data_offset = 0, size, report_size;
 +    int ret;
 +
 +    do {
 +        ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
 +                      region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
 +        if (ret < 0) {
 +            return ret;
 +        }
 +
 +        if (data_offset + data_size > region->size) {
 +            /*
 +             * If data_size is greater than the data section of migration region
 +             * then iterate the write buffer operation. This case can occur if
 +             * size of migration region at destination is smaller than size of
 +             * migration region at source.
 +             */
 +            report_size = size = region->size - data_offset;
 +            data_size -= size;
 +        } else {
 +            report_size = size = data_size;
 +            data_size = 0;
 +        }
 +
 +        trace_vfio_load_state_device_data(vbasedev->name, data_offset, size);
 +
 +        while (size) {
 +            void *buf;
 +            uint64_t sec_size;
 +            bool buf_alloc = false;
 +
 +            buf = get_data_section_size(region, data_offset, size, &sec_size);
 +
 +            if (!buf) {
 +                buf = g_try_malloc(sec_size);
 +                if (!buf) {
 +                    error_report("%s: Error allocating buffer ", __func__);
 +                    return -ENOMEM;
 +                }
 +                buf_alloc = true;
 +            }
 +
 +            qemu_get_buffer(f, buf, sec_size);
 +
 +            if (buf_alloc) {
 +                ret = vfio_mig_write(vbasedev, buf, sec_size,
 +                        region->fd_offset + data_offset);
 +                g_free(buf);
 +
 +                if (ret < 0) {
 +                    return ret;
 +                }
 +            }
 +            size -= sec_size;
 +            data_offset += sec_size;
 +        }
 +
 +        ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size),
 +                        region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
 +        if (ret < 0) {
 +            return ret;
 +        }
 +    } while (data_size);
 +
 +    return 0;
 +}
 +
 static int vfio_update_pending(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
@@ -293,6 +364,33 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
     return qemu_file_get_error(f);
 }
 +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    uint64_t data;
 +
 +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
 +        int ret;
 +
 +        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
 +        if (ret) {
 +            error_report("%s: Failed to load device config space",
 +                         vbasedev->name);
 +            return ret;
 +        }
 +    }
 +
 +    data = qemu_get_be64(f);
 +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
 +        error_report("%s: Failed loading device config space, "
 +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
 +        return -EINVAL;
 +    }
 +
 +    trace_vfio_load_device_config_state(vbasedev->name);
 +    return qemu_file_get_error(f);
 +}
 +
 static void vfio_migration_cleanup(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
@@ -483,12 +581,109 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     return ret;
 }
 +static int vfio_load_setup(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    int ret = 0;
 +
 +    if (migration->region.mmaps) {
 +        ret = vfio_region_mmap(&migration->region);
 +        if (ret) {
 +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
 +                         vbasedev->name, migration->region.nr,
 +                         strerror(-ret));
 +            error_report("%s: Falling back to slow path", vbasedev->name);
 +        }
 +    }
 +
 +    ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK,
 +                                   VFIO_DEVICE_STATE_RESUMING);
 +    if (ret) {
 +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
 +        if (migration->region.mmaps) {
 +            vfio_region_unmap(&migration->region);
 +        }
 +    }
 +    return ret;
 +}
 +
 +static int vfio_load_cleanup(void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +
 +    vfio_migration_cleanup(vbasedev);
 +    trace_vfio_load_cleanup(vbasedev->name);
 +    return 0;
 +}
 +
 +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    int ret = 0;
 +    uint64_t data;
 +
 +    data = qemu_get_be64(f);
 +    while (data != VFIO_MIG_FLAG_END_OF_STATE) {
 +
 +        trace_vfio_load_state(vbasedev->name, data);
 +
 +        switch (data) {
 +        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
 +        {
 +            ret = vfio_load_device_config_state(f, opaque);
 +            if (ret) {
 +                return ret;
 +            }
 +            break;
 +        }
 +        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
 +        {
 +            data = qemu_get_be64(f);
 +            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
 +                return ret;
 +            } else {
 +                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
 +                             vbasedev->name, data);
 +                return -EINVAL;
 +            }
 +            break;
 +        }
 +        case VFIO_MIG_FLAG_DEV_DATA_STATE:
 +        {
 +            uint64_t data_size = qemu_get_be64(f);
 +
 +            if (data_size) {
 +                ret = vfio_load_buffer(f, vbasedev, data_size);
 +                if (ret < 0) {
 +                    return ret;
 +                }
 +            }
 +            break;
 +        }
 +        default:
 +            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
 +            return -EINVAL;
 +        }
 +
 +        data = qemu_get_be64(f);
 +        ret = qemu_file_get_error(f);
 +        if (ret) {
 +            return ret;
 +        }
 +    }
 +    return ret;
 +}
 +
 static SaveVMHandlers savevm_vfio_handlers = {
     .save_setup = vfio_save_setup,
     .save_cleanup = vfio_save_cleanup,
     .save_live_pending = vfio_save_pending,
     .save_live_iterate = vfio_save_iterate,
     .save_live_complete_precopy = vfio_save_complete_precopy,
 +    .load_setup = vfio_load_setup,
 +    .load_cleanup = vfio_load_cleanup,
 +    .load_state = vfio_load_state,
 };
 /* ---------------------------------------------------------------------- */
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 9a1c5e17d9..4f08f5a633 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -157,3 +157,7 @@ vfio_save_device_config_state(const char *name) " (%s)"
 vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
 vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_complete_precopy(const char *name) " (%s)"
 +vfio_load_device_config_state(const char *name) " (%s)"
 +vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 +vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
 +vfio_load_cleanup(const char *name) " (%s)"
 -- 
 2.27.0
--- a/vfio-Add-migration-region-initialization-and-finaliz.patch
+++ b/vfio-Add-migration-region-initialization-and-finaliz.patch
@ -0,0 +1,209 @@
 From b7128f8aa03482634c07691cef69e7ed2d35200e Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:14 +0530
 Subject: [PATCH] vfio: Add migration region initialization and finalize
 function
 Whether the VFIO device supports migration or not is decided based on
 migration region query. If migration region query is successful and migration
 region initialization is successful then migration is supported else
 migration is blocked.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 Signed-off-by: Shenming Lu <lushenming@huawei.com>
 ---
 hw/vfio/Makefile.objs         |   2 +-
 hw/vfio/migration.c           | 122 ++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   3 +
 include/hw/vfio/vfio-common.h |   9 +++
 4 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 hw/vfio/migration.c
 diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
 index abad8b818c..36033d1437 100644
 --- a/hw/vfio/Makefile.objs
 +++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,4 @@
 -obj-y += common.o spapr.o
 +obj-y += common.o spapr.o migration.o
 obj-$(CONFIG_VFIO_PCI) += pci.o pci-quirks.o display.o
 obj-$(CONFIG_VFIO_CCW) += ccw.o
 obj-$(CONFIG_VFIO_PLATFORM) += platform.o
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 new file mode 100644
 index 0000000000..fd7faf423c
 --- /dev/null
 +++ b/hw/vfio/migration.c
@@ -0,0 +1,122 @@
 +/*
 + * Migration support for VFIO devices
 + *
 + * Copyright NVIDIA, Inc. 2020
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2. See
 + * the COPYING file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include <linux/vfio.h>
 +
 +#include "hw/vfio/vfio-common.h"
 +#include "cpu.h"
 +#include "migration/migration.h"
 +#include "migration/qemu-file.h"
 +#include "migration/register.h"
 +#include "migration/blocker.h"
 +#include "migration/misc.h"
 +#include "qapi/error.h"
 +#include "exec/ramlist.h"
 +#include "exec/ram_addr.h"
 +#include "pci.h"
 +#include "trace.h"
 +
 +static void vfio_migration_exit(VFIODevice *vbasedev)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +
 +    vfio_region_exit(&migration->region);
 +    vfio_region_finalize(&migration->region);
 +    g_free(vbasedev->migration);
 +    vbasedev->migration = NULL;
 +}
 +
 +static int vfio_migration_init(VFIODevice *vbasedev,
 +                               struct vfio_region_info *info)
 +{
 +    int ret;
 +    Object *obj;
 +
 +    if (!vbasedev->ops->vfio_get_object) {
 +        return -EINVAL;
 +    }
 +
 +    obj = vbasedev->ops->vfio_get_object(vbasedev);
 +    if (!obj) {
 +        return -EINVAL;
 +    }
 +
 +    vbasedev->migration = g_new0(VFIOMigration, 1);
 +
 +    ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
 +                            info->index, "migration");
 +    if (ret) {
 +        error_report("%s: Failed to setup VFIO migration region %d: %s",
 +                     vbasedev->name, info->index, strerror(-ret));
 +        goto err;
 +    }
 +
 +    if (!vbasedev->migration->region.size) {
 +        error_report("%s: Invalid zero-sized VFIO migration region %d",
 +                     vbasedev->name, info->index);
 +        ret = -EINVAL;
 +        goto err;
 +    }
 +    return 0;
 +
 +err:
 +    vfio_migration_exit(vbasedev);
 +    return ret;
 +}
 +
 +/* ---------------------------------------------------------------------- */
 +
 +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
 +{
 +    struct vfio_region_info *info = NULL;
 +    Error *local_err = NULL;
 +    int ret;
 +
 +    ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION,
 +                                   VFIO_REGION_SUBTYPE_MIGRATION, &info);
 +    if (ret) {
 +        goto add_blocker;
 +    }
 +
 +    ret = vfio_migration_init(vbasedev, info);
 +    if (ret) {
 +        goto add_blocker;
 +    }
 +
 +    trace_vfio_migration_probe(vbasedev->name, info->index); /* before free */
 +    g_free(info);
 +    return 0;
 +
 +add_blocker:
 +    error_setg(&vbasedev->migration_blocker,
 +               "VFIO device doesn't support migration");
 +    g_free(info);
 +
 +    ret = migrate_add_blocker(vbasedev->migration_blocker, &local_err);
 +    if (local_err) {
 +        error_propagate(errp, local_err);
 +        error_free(vbasedev->migration_blocker);
 +        vbasedev->migration_blocker = NULL;
 +    }
 +    return ret;
 +}
 +
 +void vfio_migration_finalize(VFIODevice *vbasedev)
 +{
 +    if (vbasedev->migration) {
 +        vfio_migration_exit(vbasedev);
 +    }
 +
 +    if (vbasedev->migration_blocker) {
 +        migrate_del_blocker(vbasedev->migration_blocker);
 +        error_free(vbasedev->migration_blocker);
 +        vbasedev->migration_blocker = NULL;
 +    }
 +}
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 8cdc27946c..fd034ac536 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -143,3 +143,6 @@ vfio_display_edid_link_up(void) ""
 vfio_display_edid_link_down(void) ""
 vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u"
 vfio_display_edid_write_error(void) ""
 +
 +# migration.c
 +vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 6ea4898c4d..e0482c2bac 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -57,6 +57,10 @@ typedef struct VFIORegion {
     uint8_t nr; /* cache the region number for debug */
 } VFIORegion;
 +typedef struct VFIOMigration {
 +    VFIORegion region;
 +} VFIOMigration;
 +
 typedef struct VFIOAddressSpace {
     AddressSpace *as;
     QLIST_HEAD(, VFIOContainer) containers;
@@ -113,6 +117,8 @@ typedef struct VFIODevice {
     unsigned int num_irqs;
     unsigned int num_regions;
     unsigned int flags;
 +    VFIOMigration *migration;
 +    Error *migration_blocker;
 } VFIODevice;
 struct VFIODeviceOps {
@@ -204,4 +210,7 @@ int vfio_spapr_create_window(VFIOContainer *container,
 int vfio_spapr_remove_window(VFIOContainer *container,
                              hwaddr offset_within_address_space);
 +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp);
 +void vfio_migration_finalize(VFIODevice *vbasedev);
 +
 #endif /* HW_VFIO_VFIO_COMMON_H */
 -- 
 2.27.0
--- a/vfio-Add-migration-state-change-notifier.patch
+++ b/vfio-Add-migration-state-change-notifier.patch
@ -0,0 +1,104 @@
 From b61729a5e0ab89d29f041202b50d042405076e62 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:16 +0530
 Subject: [PATCH] vfio: Add migration state change notifier
 Added migration state change notifier to get notification on migration state
 change. These states are translated to VFIO device state and conveyed to
 vendor driver.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/migration.c           | 28 ++++++++++++++++++++++++++++
 hw/vfio/trace-events          |  1 +
 include/hw/vfio/vfio-common.h |  2 ++
 3 files changed, 31 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index ca82c78536..0c6c9b655f 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -175,6 +175,30 @@ static void vfio_vmstate_change(void *opaque, int running, RunState state)
             (migration->device_state & mask) | value);
 }
 +static void vfio_migration_state_notifier(Notifier *notifier, void *data)
 +{
 +    MigrationState *s = data;
 +    VFIOMigration *migration = container_of(notifier, VFIOMigration,
 +                                            migration_state);
 +    VFIODevice *vbasedev = migration->vbasedev;
 +    int ret;
 +
 +    trace_vfio_migration_state_notifier(vbasedev->name,
 +                                        MigrationStatus_str(s->state));
 +
 +    switch (s->state) {
 +    case MIGRATION_STATUS_CANCELLING:
 +    case MIGRATION_STATUS_CANCELLED:
 +    case MIGRATION_STATUS_FAILED:
 +        ret = vfio_migration_set_state(vbasedev,
 +                      ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING),
 +                      VFIO_DEVICE_STATE_RUNNING);
 +        if (ret) {
 +            error_report("%s: Failed to set state RUNNING", vbasedev->name);
 +        }
 +    }
 +}
 +
 static void vfio_migration_exit(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
@@ -219,8 +243,11 @@ static int vfio_migration_init(VFIODevice *vbasedev,
     }
     migration = vbasedev->migration;
 +    migration->vbasedev = vbasedev;
     migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
                                                            vbasedev);
 +    migration->migration_state.notify = vfio_migration_state_notifier;
 +    add_migration_state_change_notifier(&migration->migration_state);
     return 0;
 err:
@@ -270,6 +297,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
     if (vbasedev->migration) {
         VFIOMigration *migration = vbasedev->migration;
 +        remove_migration_state_change_notifier(&migration->migration_state);
         qemu_del_vm_change_state_handler(migration->vm_state);
         vfio_migration_exit(vbasedev);
     }
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 1626862315..bd3d47b005 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -148,3 +148,4 @@ vfio_display_edid_write_error(void) ""
 vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
 vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
 vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
 +vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 533d6737ac..efff0590ae 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -59,10 +59,12 @@ typedef struct VFIORegion {
 } VFIORegion;
 typedef struct VFIOMigration {
 +    struct VFIODevice *vbasedev;
     VMChangeStateEntry *vm_state;
     VFIORegion region;
     uint32_t device_state;
     int vm_running;
 +    Notifier migration_state;
 } VFIOMigration;
 typedef struct VFIOAddressSpace {
 -- 
 2.27.0
--- a/vfio-Add-save-and-load-functions-for-VFIO-PCI-device.patch
+++ b/vfio-Add-save-and-load-functions-for-VFIO-PCI-device.patch
@ -0,0 +1,106 @@
 From 92f104ca6e35acae079ca3bb432f24452058d483 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:13 +0530
 Subject: [PATCH] vfio: Add save and load functions for VFIO PCI devices
 Added functions to save and restore PCI device specific data,
 specifically config space of PCI device.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/pci.c                 | 51 +++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  2 ++
 2 files changed, 53 insertions(+)
 diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
 index de0d286fc9..b9fae3ad28 100644
 --- a/hw/vfio/pci.c
 +++ b/hw/vfio/pci.c
@@ -35,6 +35,7 @@
 #include "pci.h"
 #include "trace.h"
 #include "qapi/error.h"
 +#include "migration/qemu-file.h"
 #define TYPE_VFIO_PCI "vfio-pci"
 #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -2395,11 +2396,61 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
     return OBJECT(vdev);
 }
 +static bool vfio_msix_present(void *opaque, int version_id)
 +{
 +    PCIDevice *pdev = opaque;
 +
 +    return msix_present(pdev);
 +}
 +
 +const VMStateDescription vmstate_vfio_pci_config = {
 +    .name = "VFIOPCIDevice",
 +    .version_id = 1,
 +    .minimum_version_id = 1,
 +    .fields = (VMStateField[]) {
 +        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
 +        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
 +        VMSTATE_END_OF_LIST()
 +    }
 +};
 +
 +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
 +{
 +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 +
 +    vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL);
 +}
 +
 +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
 +{
 +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 +    PCIDevice *pdev = &vdev->pdev;
 +    int ret;
 +
 +    ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    vfio_pci_write_config(pdev, PCI_COMMAND,
 +                          pci_get_word(pdev->config + PCI_COMMAND), 2);
 +
 +    if (msi_enabled(pdev)) {
 +        vfio_msi_enable(vdev);
 +    } else if (msix_enabled(pdev)) {
 +        vfio_msix_enable(vdev);
 +    }
 +
 +    return ret;
 +}
 +
 static VFIODeviceOps vfio_pci_ops = {
     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
     .vfio_eoi = vfio_intx_eoi,
     .vfio_get_object = vfio_pci_get_object,
 +    .vfio_save_config = vfio_pci_save_config,
 +    .vfio_load_config = vfio_pci_load_config,
 };
 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 771b6d59a3..6ea4898c4d 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -120,6 +120,8 @@ struct VFIODeviceOps {
     int (*vfio_hot_reset_multi)(VFIODevice *vdev);
     void (*vfio_eoi)(VFIODevice *vdev);
     Object *(*vfio_get_object)(VFIODevice *vdev);
 +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
 +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
 };
 typedef struct VFIOGroup {
 -- 
 2.27.0
--- a/vfio-Add-save-state-functions-to-SaveVMHandlers.patch
+++ b/vfio-Add-save-state-functions-to-SaveVMHandlers.patch
@ -0,0 +1,380 @@
 From 94f106f95e887d1d706e8f771fd6ad287ddac2dc Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:18 +0530
 Subject: [PATCH] vfio: Add save state functions to SaveVMHandlers
 Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
 functions. These functions handle the pre-copy and stop-and-copy phases.
 In _SAVING|_RUNNING device state or pre-copy phase:
 - read pending_bytes. If pending_bytes > 0, go through below steps.
 - read data_offset - indicates kernel driver to write data to staging
  buffer.
 - read data_size - amount of data in bytes written by vendor driver in
  migration region.
 - read data_size bytes of data from data_offset in the migration region.
 - Write data packet to file stream as below:
 {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
 VFIO_MIG_FLAG_END_OF_STATE }
 In _SAVING device state or stop-and-copy phase
 a. read config space of device and save to migration file stream. This
   doesn't need to be from vendor driver. Any other special config state
   from driver can be saved as data in following iteration.
 b. read pending_bytes. If pending_bytes > 0, go through below steps.
 c. read data_offset - indicates kernel driver to write data to staging
   buffer.
 d. read data_size - amount of data in bytes written by vendor driver in
   migration region.
 e. read data_size bytes of data from data_offset in the migration region.
 f. Write data packet as below:
   {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
 g. iterate through steps b to f while (pending_bytes > 0)
 h. Write {VFIO_MIG_FLAG_END_OF_STATE}
 When the data region is mapped, it is the user's responsibility to read
 data_size bytes of data from data_offset before moving to the next steps.
 Added fix suggested by Artem Polyakov to reset pending_bytes in
 vfio_save_iterate().
 Added fix suggested by Zhi Wang to add 0 as data size in migration stream and
 add END_OF_STATE delimiter to indicate phase complete.
 Suggested-by: Artem Polyakov <artemp@nvidia.com>
 Suggested-by: Zhi Wang <zhi.wang.linux@gmail.com>
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/migration.c           | 276 ++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   6 +
 include/hw/vfio/vfio-common.h |   1 +
 3 files changed, 283 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index 405228fc5a..f78a77e1e3 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -148,6 +148,151 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
     return 0;
 }
 +static void *get_data_section_size(VFIORegion *region, uint64_t data_offset,
 +                                   uint64_t data_size, uint64_t *size)
 +{
 +    void *ptr = NULL;
 +    uint64_t limit = 0;
 +    int i;
 +
 +    if (!region->mmaps) {
 +        if (size) {
 +            *size = MIN(data_size, region->size - data_offset);
 +        }
 +        return ptr;
 +    }
 +
 +    for (i = 0; i < region->nr_mmaps; i++) {
 +        VFIOMmap *map = region->mmaps + i;
 +
 +        if ((data_offset >= map->offset) &&
 +            (data_offset < map->offset + map->size)) {
 +
 +            /* check if data_offset is within sparse mmap areas */
 +            ptr = map->mmap + data_offset - map->offset;
 +            if (size) {
 +                *size = MIN(data_size, map->offset + map->size - data_offset);
 +            }
 +            break;
 +        } else if ((data_offset < map->offset) &&
 +                   (!limit || limit > map->offset)) {
 +            /*
 +             * data_offset is not within sparse mmap areas, find size of
 +             * non-mapped area. Check through all list since region->mmaps list
 +             * is not sorted.
 +             */
 +            limit = map->offset;
 +        }
 +    }
 +
 +    if (!ptr && size) {
 +        *size = limit ? MIN(data_size, limit - data_offset) : data_size;
 +    }
 +    return ptr;
 +}
 +
 +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +    VFIORegion *region = &migration->region;
 +    uint64_t data_offset = 0, data_size = 0, sz;
 +    int ret;
 +
 +    ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
 +                      region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
 +    if (ret < 0) {
 +        return ret;
 +    }
 +
 +    ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size),
 +                        region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
 +    if (ret < 0) {
 +        return ret;
 +    }
 +
 +    trace_vfio_save_buffer(vbasedev->name, data_offset, data_size,
 +                           migration->pending_bytes);
 +
 +    qemu_put_be64(f, data_size);
 +    sz = data_size;
 +
 +    while (sz) {
 +        void *buf;
 +        uint64_t sec_size;
 +        bool buf_allocated = false;
 +
 +        buf = get_data_section_size(region, data_offset, sz, &sec_size);
 +
 +        if (!buf) {
 +            buf = g_try_malloc(sec_size);
 +            if (!buf) {
 +                error_report("%s: Error allocating buffer ", __func__);
 +                return -ENOMEM;
 +            }
 +            buf_allocated = true;
 +
 +            ret = vfio_mig_read(vbasedev, buf, sec_size,
 +                                region->fd_offset + data_offset);
 +            if (ret < 0) {
 +                g_free(buf);
 +                return ret;
 +            }
 +        }
 +
 +        qemu_put_buffer(f, buf, sec_size);
 +
 +        if (buf_allocated) {
 +            g_free(buf);
 +        }
 +        sz -= sec_size;
 +        data_offset += sec_size;
 +    }
 +
 +    ret = qemu_file_get_error(f);
 +
 +    if (!ret && size) {
 +        *size = data_size;
 +    }
 +
 +    return ret;
 +}
 +
 +static int vfio_update_pending(VFIODevice *vbasedev)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +    VFIORegion *region = &migration->region;
 +    uint64_t pending_bytes = 0;
 +    int ret;
 +
 +    ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes),
 +                    region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes));
 +    if (ret < 0) {
 +        migration->pending_bytes = 0;
 +        return ret;
 +    }
 +
 +    migration->pending_bytes = pending_bytes;
 +    trace_vfio_update_pending(vbasedev->name, pending_bytes);
 +    return 0;
 +}
 +
 +static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);
 +
 +    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
 +        vbasedev->ops->vfio_save_config(vbasedev, f);
 +    }
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 +
 +    trace_vfio_save_device_config_state(vbasedev->name);
 +
 +    return qemu_file_get_error(f);
 +}
 +
 static void vfio_migration_cleanup(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
@@ -210,9 +355,140 @@ static void vfio_save_cleanup(void *opaque)
     trace_vfio_save_cleanup(vbasedev->name);
 }
 +static void vfio_save_pending(QEMUFile *f, void *opaque,
 +                              uint64_t threshold_size,
 +                              uint64_t *res_precopy_only,
 +                              uint64_t *res_compatible,
 +                              uint64_t *res_postcopy_only)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    int ret;
 +
 +    ret = vfio_update_pending(vbasedev);
 +    if (ret) {
 +        return;
 +    }
 +
 +    *res_precopy_only += migration->pending_bytes;
 +
 +    trace_vfio_save_pending(vbasedev->name, *res_precopy_only,
 +                            *res_postcopy_only, *res_compatible);
 +}
 +
 +static int vfio_save_iterate(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    uint64_t data_size;
 +    int ret;
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
 +
 +    if (migration->pending_bytes == 0) {
 +        ret = vfio_update_pending(vbasedev);
 +        if (ret) {
 +            return ret;
 +        }
 +
 +        if (migration->pending_bytes == 0) {
 +            qemu_put_be64(f, 0);
 +            qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 +            /* indicates data finished, goto complete phase */
 +            return 1;
 +        }
 +    }
 +
 +    ret = vfio_save_buffer(f, vbasedev, &data_size);
 +    if (ret) {
 +        error_report("%s: vfio_save_buffer failed %s", vbasedev->name,
 +                     strerror(errno));
 +        return ret;
 +    }
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 +
 +    ret = qemu_file_get_error(f);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    /*
 +     * Reset pending_bytes as .save_live_pending is not called during savevm or
 +     * snapshot case, in such case vfio_update_pending() at the start of this
 +     * function updates pending_bytes.
 +     */
 +    migration->pending_bytes = 0;
 +    trace_vfio_save_iterate(vbasedev->name, data_size);
 +    return 0;
 +}
 +
 +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    uint64_t data_size;
 +    int ret;
 +
 +    ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_RUNNING,
 +                                   VFIO_DEVICE_STATE_SAVING);
 +    if (ret) {
 +        error_report("%s: Failed to set state STOP and SAVING",
 +                     vbasedev->name);
 +        return ret;
 +    }
 +
 +    ret = vfio_save_device_config_state(f, opaque);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    ret = vfio_update_pending(vbasedev);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    while (migration->pending_bytes > 0) {
 +        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
 +        ret = vfio_save_buffer(f, vbasedev, &data_size);
 +        if (ret < 0) {
 +            error_report("%s: Failed to save buffer", vbasedev->name);
 +            return ret;
 +        }
 +
 +        if (data_size == 0) {
 +            break;
 +        }
 +
 +        ret = vfio_update_pending(vbasedev);
 +        if (ret) {
 +            return ret;
 +        }
 +    }
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 +
 +    ret = qemu_file_get_error(f);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_SAVING, 0);
 +    if (ret) {
 +        error_report("%s: Failed to set state STOPPED", vbasedev->name);
 +        return ret;
 +    }
 +
 +    trace_vfio_save_complete_precopy(vbasedev->name);
 +    return ret;
 +}
 +
 static SaveVMHandlers savevm_vfio_handlers = {
     .save_setup = vfio_save_setup,
     .save_cleanup = vfio_save_cleanup,
 +    .save_live_pending = vfio_save_pending,
 +    .save_live_iterate = vfio_save_iterate,
 +    .save_live_complete_precopy = vfio_save_complete_precopy,
 };
 /* ---------------------------------------------------------------------- */
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 86c18def01..9a1c5e17d9 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -151,3 +151,9 @@ vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 vfio_save_setup(const char *name) " (%s)"
 vfio_save_cleanup(const char *name) " (%s)"
 +vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64
 +vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64
 +vfio_save_device_config_state(const char *name) " (%s)"
 +vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
 +vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d"
 +vfio_save_complete_precopy(const char *name) " (%s)"
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index efff0590ae..c825524606 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -65,6 +65,7 @@ typedef struct VFIOMigration {
     uint32_t device_state;
     int vm_running;
     Notifier migration_state;
 +    uint64_t pending_bytes;
 } VFIOMigration;
 typedef struct VFIOAddressSpace {
 -- 
 2.27.0
--- a/vfio-Add-vfio_get_object-callback-to-VFIODeviceOps.patch
+++ b/vfio-Add-vfio_get_object-callback-to-VFIODeviceOps.patch
@ -0,0 +1,55 @@
 From c1de789d89132b66243fbfe253f10764ce514a08 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:12 +0530
 Subject: [PATCH] vfio: Add vfio_get_object callback to VFIODeviceOps
 Hook vfio_get_object callback for PCI devices.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Suggested-by: Cornelia Huck <cohuck@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/pci.c                 | 8 ++++++++
 include/hw/vfio/vfio-common.h | 1 +
 2 files changed, 9 insertions(+)
 diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
 index d7a4e1875c..de0d286fc9 100644
 --- a/hw/vfio/pci.c
 +++ b/hw/vfio/pci.c
@@ -2388,10 +2388,18 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
     }
 }
 +static Object *vfio_pci_get_object(VFIODevice *vbasedev)
 +{
 +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 +
 +    return OBJECT(vdev);
 +}
 +
 static VFIODeviceOps vfio_pci_ops = {
     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
     .vfio_eoi = vfio_intx_eoi,
 +    .vfio_get_object = vfio_pci_get_object,
 };
 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index 93493891ba..771b6d59a3 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -119,6 +119,7 @@ struct VFIODeviceOps {
     void (*vfio_compute_needs_reset)(VFIODevice *vdev);
     int (*vfio_hot_reset_multi)(VFIODevice *vdev);
     void (*vfio_eoi)(VFIODevice *vdev);
 +    Object *(*vfio_get_object)(VFIODevice *vdev);
 };
 typedef struct VFIOGroup {
 -- 
 2.27.0
--- a/vfio-Add-vfio_listener_log_sync-to-mark-dirty-pages.patch
+++ b/vfio-Add-vfio_listener_log_sync-to-mark-dirty-pages.patch
@ -0,0 +1,182 @@
 From 3ac0647003d192579bcb6c1081b75d9c8ada78e0 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:23 +0530
 Subject: [PATCH] vfio: Add vfio_listener_log_sync to mark dirty pages
 vfio_listener_log_sync gets the list of dirty pages from the container using
 VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and marks those pages dirty when all
 devices are stopped and saving state.
 Return early for the RAM block section of mapped MMIO region.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast]
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c     | 116 +++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events |   1 +
 2 files changed, 117 insertions(+)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 35168b8f3e..4d2828fc97 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -29,6 +29,7 @@
 #include "hw/vfio/vfio.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 +#include "exec/ram_addr.h"
 #include "hw/hw.h"
 #include "qemu/error-report.h"
 #include "qemu/range.h"
@@ -36,6 +37,7 @@
 #include "sysemu/kvm.h"
 #include "trace.h"
 #include "qapi/error.h"
 +#include "migration/migration.h"
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -285,6 +287,39 @@ const MemoryRegionOps vfio_region_ops = {
     },
 };
 +/*
 + * Device state interfaces
 + */
 +
 +static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container)
 +{
 +    VFIOGroup *group;
 +    VFIODevice *vbasedev;
 +    MigrationState *ms = migrate_get_current();
 +
 +    if (!migration_is_setup_or_active(ms->state)) {
 +        return false;
 +    }
 +
 +    QLIST_FOREACH(group, &container->group_list, container_next) {
 +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
 +            VFIOMigration *migration = vbasedev->migration;
 +
 +            if (!migration) {
 +                return false;
 +            }
 +
 +            if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
 +                !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
 +                continue;
 +            } else {
 +                return false;
 +            }
 +        }
 +    }
 +    return true;
 +}
 +
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
@@ -794,9 +829,90 @@ static void vfio_listener_region_del(MemoryListener *listener,
     }
 }
 +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
 +                                 uint64_t size, ram_addr_t ram_addr)
 +{
 +    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
 +    struct vfio_iommu_type1_dirty_bitmap_get *range;
 +    uint64_t pages;
 +    int ret;
 +
 +    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 +
 +    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
 +    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
 +    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
 +    range->iova = iova;
 +    range->size = size;
 +
 +    /*
 +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
 +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
 +     * TARGET_PAGE_SIZE.
 +     */
 +    range->bitmap.pgsize = TARGET_PAGE_SIZE;
 +
 +    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
 +    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
 +                                         BITS_PER_BYTE;
 +    range->bitmap.data = g_try_malloc0(range->bitmap.size);
 +    if (!range->bitmap.data) {
 +        ret = -ENOMEM;
 +        goto err_out;
 +    }
 +
 +    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
 +    if (ret) {
 +        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
 +                " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
 +                (uint64_t)range->size, errno);
 +        goto err_out;
 +    }
 +
 +    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
 +                                            ram_addr, pages);
 +
 +    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
 +                                range->bitmap.size, ram_addr);
 +err_out:
 +    g_free(range->bitmap.data);
 +    g_free(dbitmap);
 +
 +    return ret;
 +}
 +
 +static int vfio_sync_dirty_bitmap(VFIOContainer *container,
 +                                  MemoryRegionSection *section)
 +{
 +    ram_addr_t ram_addr;
 +
 +    ram_addr = memory_region_get_ram_addr(section->mr) +
 +               section->offset_within_region;
 +
 +    return vfio_get_dirty_bitmap(container,
 +                       TARGET_PAGE_ALIGN(section->offset_within_address_space),
 +                       int128_get64(section->size), ram_addr);
 +}
 +
 +static void vfio_listerner_log_sync(MemoryListener *listener,
 +        MemoryRegionSection *section)
 +{
 +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
 +
 +    if (vfio_listener_skipped_section(section) ||
 +        !container->dirty_pages_supported) {
 +        return;
 +    }
 +
 +    if (vfio_devices_all_stopped_and_saving(container)) {
 +        vfio_sync_dirty_bitmap(container, section);
 +    }
 +}
 +
 static const MemoryListener vfio_memory_listener = {
     .region_add = vfio_listener_region_add,
     .region_del = vfio_listener_region_del,
 +    .log_sync = vfio_listerner_log_sync,
 };
 static void vfio_listener_release(VFIOContainer *container)
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 4f08f5a633..4167f35d64 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -161,3 +161,4 @@ vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
 vfio_load_cleanup(const char *name) " (%s)"
 +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
 -- 
 2.27.0
--- a/vfio-Dirty-page-tracking-when-vIOMMU-is-enabled.patch
+++ b/vfio-Dirty-page-tracking-when-vIOMMU-is-enabled.patch
@ -0,0 +1,162 @@
 From a400753d0f1a008367165aadf375abfe86a66ed7 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:24 +0530
 Subject: [PATCH] vfio: Dirty page tracking when vIOMMU is enabled
 When vIOMMU is enabled, register MAP notifier from log_sync when all
 devices in container are in stop and copy phase of migration. Call replay
 and get dirty pages from notifier callback.
 Suggested-by: Alex Williamson <alex.williamson@redhat.com>
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c     | 88 +++++++++++++++++++++++++++++++++++++++++---
 hw/vfio/trace-events |  1 +
 2 files changed, 83 insertions(+), 6 deletions(-)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 4d2828fc97..8773b998ac 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -441,8 +441,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 }
 /* Called with rcu_read_lock held.  */
 -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
 -                           bool *read_only)
 +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
 +                               ram_addr_t *ram_addr, bool *read_only)
 {
     MemoryRegion *mr;
     hwaddr xlat;
@@ -473,8 +473,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
         return false;
     }
 -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
 -    *read_only = !writable || mr->readonly;
 +    if (vaddr) {
 +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
 +    }
 +
 +    if (ram_addr) {
 +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
 +    }
 +
 +    if (read_only) {
 +        *read_only = !writable || mr->readonly;
 +    }
     return true;
 }
@@ -484,7 +493,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
 -    bool read_only;
     void *vaddr;
     int ret;
@@ -500,7 +508,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     rcu_read_lock();
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
 -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
 +        bool read_only;
 +
 +        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
             goto out;
         }
         /*
@@ -881,11 +891,77 @@ err_out:
     return ret;
 }
 +typedef struct {
 +    IOMMUNotifier n;
 +    VFIOGuestIOMMU *giommu;
 +} vfio_giommu_dirty_notifier;
 +
 +static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 +{
 +    vfio_giommu_dirty_notifier *gdn = container_of(n,
 +                                                vfio_giommu_dirty_notifier, n);
 +    VFIOGuestIOMMU *giommu = gdn->giommu;
 +    VFIOContainer *container = giommu->container;
 +    hwaddr iova = iotlb->iova + giommu->iommu_offset;
 +    ram_addr_t translated_addr;
 +
 +    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
 +
 +    if (iotlb->target_as != &address_space_memory) {
 +        error_report("Wrong target AS \"%s\", only system memory is allowed",
 +                     iotlb->target_as->name ? iotlb->target_as->name : "none");
 +        return;
 +    }
 +
 +    rcu_read_lock();
 +    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
 +        int ret;
 +
 +        ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
 +                                    translated_addr);
 +        if (ret) {
 +            error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
 +                         "0x%"HWADDR_PRIx") = %d (%m)",
 +                         container, iova,
 +                         iotlb->addr_mask + 1, ret);
 +        }
 +    }
 +    rcu_read_unlock();
 +}
 +
 static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                   MemoryRegionSection *section)
 {
     ram_addr_t ram_addr;
 +    if (memory_region_is_iommu(section->mr)) {
 +        VFIOGuestIOMMU *giommu;
 +
 +        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
 +            if (MEMORY_REGION(giommu->iommu) == section->mr &&
 +                giommu->n.start == section->offset_within_region) {
 +                Int128 llend;
 +                vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
 +                int idx = memory_region_iommu_attrs_to_index(giommu->iommu,
 +                                                       MEMTXATTRS_UNSPECIFIED);
 +
 +                llend = int128_add(int128_make64(section->offset_within_region),
 +                                   section->size);
 +                llend = int128_sub(llend, int128_one());
 +
 +                iommu_notifier_init(&gdn.n,
 +                                    vfio_iommu_map_dirty_notify,
 +                                    IOMMU_NOTIFIER_MAP,
 +                                    section->offset_within_region,
 +                                    int128_get64(llend),
 +                                    idx);
 +                memory_region_iommu_replay(giommu->iommu, &gdn.n);
 +                break;
 +            }
 +        }
 +        return 0;
 +    }
 +
     ram_addr = memory_region_get_ram_addr(section->mr) +
                section->offset_within_region;
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index 4167f35d64..575ebde6e0 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -162,3 +162,4 @@ vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
 vfio_load_cleanup(const char *name) " (%s)"
 vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
 +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
 -- 
 2.27.0
--- a/vfio-Get-migration-capability-flags-for-container.patch
+++ b/vfio-Get-migration-capability-flags-for-container.patch
@ -0,0 +1,186 @@
 From fc49c9cbf2deba53370f48ad9db2adc5f6ceb3ba Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:21 +0530
 Subject: [PATCH] vfio: Get migration capability flags for container
 Added helper functions to get IOMMU info capability chain.
 Added function to get migration capability information from that
 capability chain for IOMMU container.
 Similar change was proposed earlier:
 https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg03759.html
 Disable migration for devices if IOMMU module doesn't support migration
 capability.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Cc: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
 Cc: Eric Auger <eric.auger@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/common.c              | 90 +++++++++++++++++++++++++++++++----
 hw/vfio/migration.c           |  7 ++-
 include/hw/vfio/vfio-common.h |  3 ++
 3 files changed, 91 insertions(+), 9 deletions(-)
 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 4c32b1bb99..35168b8f3e 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
@@ -1210,6 +1210,75 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
     return 0;
 }
 +static int vfio_get_iommu_info(VFIOContainer *container,
 +                               struct vfio_iommu_type1_info **info)
 +{
 +
 +    size_t argsz = sizeof(struct vfio_iommu_type1_info);
 +
 +    *info = g_new0(struct vfio_iommu_type1_info, 1);
 +again:
 +    (*info)->argsz = argsz;
 +
 +    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
 +        g_free(*info);
 +        *info = NULL;
 +        return -errno;
 +    }
 +
 +    if (((*info)->argsz > argsz)) {
 +        argsz = (*info)->argsz;
 +        *info = g_realloc(*info, argsz);
 +        goto again;
 +    }
 +
 +    return 0;
 +}
 +
 +static struct vfio_info_cap_header *
 +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
 +{
 +    struct vfio_info_cap_header *hdr;
 +    void *ptr = info;
 +
 +    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
 +        return NULL;
 +    }
 +
 +    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
 +        if (hdr->id == id) {
 +            return hdr;
 +        }
 +    }
 +
 +    return NULL;
 +}
 +
 +static void vfio_get_iommu_info_migration(VFIOContainer *container,
 +                                         struct vfio_iommu_type1_info *info)
 +{
 +    struct vfio_info_cap_header *hdr;
 +    struct vfio_iommu_type1_info_cap_migration *cap_mig;
 +
 +    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
 +    if (!hdr) {
 +        return;
 +    }
 +
 +    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
 +                            header);
 +
 +    /*
 +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
 +     * TARGET_PAGE_SIZE to mark those dirty.
 +     */
 +    if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) {
 +        container->dirty_pages_supported = true;
 +        container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
 +        container->dirty_pgsizes = cap_mig->pgsize_bitmap;
 +    }
 +}
 +
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
 {
@@ -1273,6 +1342,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     container = g_malloc0(sizeof(*container));
     container->space = space;
     container->fd = fd;
 +    container->dirty_pages_supported = false;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
@@ -1285,7 +1355,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     case VFIO_TYPE1v2_IOMMU:
     case VFIO_TYPE1_IOMMU:
     {
 -        struct vfio_iommu_type1_info info;
 +        struct vfio_iommu_type1_info *info;
         /*
          * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
@@ -1294,15 +1364,19 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
          * existing Type1 IOMMUs generally support any IOVA we're
          * going to actually try in practice.
          */
 -        info.argsz = sizeof(info);
 -        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
 -        /* Ignore errors */
 -        if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
 +        ret = vfio_get_iommu_info(container, &info);
 +
 +        if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
             /* Assume 4k IOVA page size */
 -            info.iova_pgsizes = 4096;
 +            info->iova_pgsizes = 4096;
         }
 -        vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes);
 -        container->pgsizes = info.iova_pgsizes;
 +        vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
 +        container->pgsizes = info->iova_pgsizes;
 +
 +        if (!ret) {
 +            vfio_get_iommu_info_migration(container, info);
 +        }
 +        g_free(info);
         break;
     }
     case VFIO_SPAPR_TCE_v2_IOMMU:
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index 954c064435..0d2bd9e5cd 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -832,9 +832,14 @@ err:
 int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
 {
 +    VFIOContainer *container = vbasedev->group->container;
     struct vfio_region_info *info = NULL;
     Error *local_err = NULL;
 -    int ret;
 +    int ret = -ENOTSUP;
 +
 +    if (!container->dirty_pages_supported) {
 +        goto add_blocker;
 +    }
     ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION,
                                    VFIO_REGION_SUBTYPE_MIGRATION, &info);
 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
 index c825524606..8fd0212264 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
@@ -84,6 +84,9 @@ typedef struct VFIOContainer {
     unsigned iommu_type;
     int error;
     bool initialized;
 +    bool dirty_pages_supported;
 +    uint64_t dirty_pgsizes;
 +    uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
 -- 
 2.27.0
--- a/vfio-Make-vfio-pci-device-migration-capable.patch
+++ b/vfio-Make-vfio-pci-device-migration-capable.patch
@ -0,0 +1,73 @@
 From b20bf027d44809dd6c6376cf0b77e5c5b2057cba Mon Sep 17 00:00:00 2001
 From: Jens Freimann <jfreimann@redhat.com>
 Date: Tue, 29 Oct 2019 12:49:05 +0100
 Subject: [PATCH] vfio: Make vfio-pci device migration capable
 If the device is not a failover primary device, call
 vfio_migration_probe() and vfio_migration_finalize() to enable
 migration support for those devices that support it and, respectively,
 to tear it down again.
 Remove the migration blocker from the VFIO PCI device specific structure
 and use the migration blocker from the generic VFIO device structure.
 Note: Since the current version doesn't add the failover feature for assigned
 PCI devices, just remove the failover related code in the original patch for
 simplicity.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 Signed-off-by: Shenming Lu <lushenming@huawei.com>
 ---
 hw/vfio/pci.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
 diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
 index b9fae3ad28..a637c35e7a 100644
 --- a/hw/vfio/pci.c
 +++ b/hw/vfio/pci.c
@@ -3049,6 +3049,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 +    ret = vfio_migration_probe(&vdev->vbasedev, errp);
 +    if (ret) {
 +        error_report("%s: Migration disabled", vdev->vbasedev.name);
 +    }
 +
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
@@ -3096,6 +3101,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     }
     vfio_teardown_msi(vdev);
     vfio_bars_exit(vdev);
 +    vfio_migration_finalize(&vdev->vbasedev);
 }
 static void vfio_pci_reset(DeviceState *dev)
@@ -3204,11 +3210,6 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 -static const VMStateDescription vfio_pci_vmstate = {
 -    .name = "vfio-pci",
 -    .unmigratable = 1,
 -};
 -
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3216,7 +3217,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
     dc->reset = vfio_pci_reset;
     dc->props = vfio_pci_dev_properties;
 -    dc->vmsd = &vfio_pci_vmstate;
     dc->desc = "VFIO-based PCI device assignment";
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->realize = vfio_realize;
 -- 
 2.27.0
--- a/vfio-Register-SaveVMHandlers-for-VFIO-device.patch
+++ b/vfio-Register-SaveVMHandlers-for-VFIO-device.patch
@ -0,0 +1,183 @@
 From cd5b58f2ba20e59f2c29d955b8bbd7f5016030b7 Mon Sep 17 00:00:00 2001
 From: Kirti Wankhede <kwankhede@nvidia.com>
 Date: Mon, 26 Oct 2020 15:06:17 +0530
 Subject: [PATCH] vfio: Register SaveVMHandlers for VFIO device
 Define flags to be used as delimiter in migration stream for VFIO devices.
 Added .save_setup and .save_cleanup functions. Map & unmap migration
 region from these functions at source during saving or pre-copy phase.
 Set VFIO device state depending on VM's state. During live migration, VM is
 running when .save_setup is called, _SAVING | _RUNNING state is set for VFIO
 device. During save-restore, VM is paused, _SAVING state is set for VFIO device.
 Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
 Reviewed-by: Neo Jia <cjia@nvidia.com>
 Reviewed-by: Cornelia Huck <cohuck@redhat.com>
 Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
 Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 hw/vfio/migration.c  | 102 +++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events |   2 +
 2 files changed, 104 insertions(+)
 diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
 index 0c6c9b655f..405228fc5a 100644
 --- a/hw/vfio/migration.c
 +++ b/hw/vfio/migration.c
@@ -8,12 +8,15 @@
  */
 #include "qemu/osdep.h"
 +#include "qemu/main-loop.h"
 +#include "qemu/cutils.h"
 #include <linux/vfio.h>
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-common.h"
 #include "cpu.h"
 #include "migration/migration.h"
 +#include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
 #include "migration/blocker.h"
@@ -25,6 +28,22 @@
 #include "trace.h"
 #include "hw/hw.h"
 +/*
 + * Flags to be used as unique delimiters for VFIO devices in the migration
 + * stream. These flags are composed as:
 + * 0xffffffff => MSB 32-bit all 1s
 + * 0xef10     => Magic ID, represents emulated (virtual) function IO
 + * 0x0000     => 16-bits reserved for flags
 + *
 + * The beginning of state information is marked by _DEV_CONFIG_STATE,
 + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
 + * certain state information is marked by _END_OF_STATE.
 + */
 +#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
 +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
 +#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 +#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
 +
 static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
                                   off_t off, bool iswrite)
 {
@@ -129,6 +148,75 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
     return 0;
 }
 +static void vfio_migration_cleanup(VFIODevice *vbasedev)
 +{
 +    VFIOMigration *migration = vbasedev->migration;
 +
 +    if (migration->region.mmaps) {
 +        vfio_region_unmap(&migration->region);
 +    }
 +}
 +
 +/* ---------------------------------------------------------------------- */
 +
 +static int vfio_save_setup(QEMUFile *f, void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +    VFIOMigration *migration = vbasedev->migration;
 +    int ret;
 +
 +    trace_vfio_save_setup(vbasedev->name);
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
 +
 +    if (migration->region.mmaps) {
 +        /*
 +         * Calling vfio_region_mmap() from migration thread. Memory API called
 +         * from this function require locking the iothread when called from
 +         * outside the main loop thread.
 +         */
 +        qemu_mutex_lock_iothread();
 +        ret = vfio_region_mmap(&migration->region);
 +        qemu_mutex_unlock_iothread();
 +        if (ret) {
 +            error_report("%s: Failed to mmap VFIO migration region: %s",
 +                         vbasedev->name, strerror(-ret));
 +            error_report("%s: Falling back to slow path", vbasedev->name);
 +        }
 +    }
 +
 +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK,
 +                                   VFIO_DEVICE_STATE_SAVING);
 +    if (ret) {
 +        error_report("%s: Failed to set state SAVING", vbasedev->name);
 +        return ret;
 +    }
 +
 +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 +
 +    ret = qemu_file_get_error(f);
 +    if (ret) {
 +        return ret;
 +    }
 +
 +    return 0;
 +}
 +
 +static void vfio_save_cleanup(void *opaque)
 +{
 +    VFIODevice *vbasedev = opaque;
 +
 +    vfio_migration_cleanup(vbasedev);
 +    trace_vfio_save_cleanup(vbasedev->name);
 +}
 +
 +static SaveVMHandlers savevm_vfio_handlers = {
 +    .save_setup = vfio_save_setup,
 +    .save_cleanup = vfio_save_cleanup,
 +};
 +
 +/* ---------------------------------------------------------------------- */
 +
 static void vfio_vmstate_change(void *opaque, int running, RunState state)
 {
     VFIODevice *vbasedev = opaque;
@@ -215,6 +303,8 @@ static int vfio_migration_init(VFIODevice *vbasedev,
     int ret;
     Object *obj;
     VFIOMigration *migration;
 +    char id[256] = "";
 +    g_autofree char *path = NULL, *oid = NULL;
     if (!vbasedev->ops->vfio_get_object) {
         return -EINVAL;
@@ -244,6 +334,18 @@ static int vfio_migration_init(VFIODevice *vbasedev,
     migration = vbasedev->migration;
     migration->vbasedev = vbasedev;
 +
 +    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
 +    if (oid) {
 +        path = g_strdup_printf("%s/vfio", oid);
 +    } else {
 +        path = g_strdup("vfio");
 +    }
 +    strpadcpy(id, sizeof(id), path, '\0');
 +
 +    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
 +                         vbasedev);
 +
     migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
                                                            vbasedev);
     migration->migration_state.notify = vfio_migration_state_notifier;
 diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
 index bd3d47b005..86c18def01 100644
 --- a/hw/vfio/trace-events
 +++ b/hw/vfio/trace-events
@@ -149,3 +149,5 @@ vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
 vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
 vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 +vfio_save_setup(const char *name) " (%s)"
 +vfio_save_cleanup(const char *name) " (%s)"
 -- 
 2.27.0
--- a/vmstate-add-qom-interface-to-get-id.patch
+++ b/vmstate-add-qom-interface-to-get-id.patch
@ -0,0 +1,210 @@
 From d771fca664e40c7d7ec5dfa2c656a282bff705b7 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
 Date: Wed, 28 Aug 2019 16:00:19 +0400
 Subject: [PATCH] vmstate: add qom interface to get id
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add an interface to get the instance id, instead of depending on
 Device and qdev_get_dev_path().
 Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 ---
 MAINTAINERS                  |  2 ++
 hw/core/Makefile.objs        |  1 +
 hw/core/qdev.c               | 14 +++++++++++++
 hw/core/vmstate-if.c         | 23 +++++++++++++++++++++
 include/hw/vmstate-if.h      | 40 ++++++++++++++++++++++++++++++++++++
 include/migration/register.h |  2 ++
 include/migration/vmstate.h  |  2 ++
 tests/Makefile.include       |  1 +
 8 files changed, 85 insertions(+)
 create mode 100644 hw/core/vmstate-if.c
 create mode 100644 include/hw/vmstate-if.h
 diff --git a/MAINTAINERS b/MAINTAINERS
 index d6de200453..e2d74d7ec3 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -2135,6 +2135,8 @@ Migration
 M: Juan Quintela <quintela@redhat.com>
 M: Dr. David Alan Gilbert <dgilbert@redhat.com>
 S: Maintained
 +F: hw/core/vmstate-if.c
 +F: include/hw/vmstate-if.h
 F: include/migration/
 F: migration/
 F: scripts/vmstate-static-checker.py
 diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
 index f8481d959f..54c51583d8 100644
 --- a/hw/core/Makefile.objs
 +++ b/hw/core/Makefile.objs
@@ -8,6 +8,7 @@ common-obj-y += irq.o
 common-obj-y += hotplug.o
 common-obj-$(CONFIG_SOFTMMU) += nmi.o
 common-obj-$(CONFIG_SOFTMMU) += vm-change-state-handler.o
 +common-obj-y += vmstate-if.o
 common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 common-obj-$(CONFIG_XILINX_AXI) += stream.o
 diff --git a/hw/core/qdev.c b/hw/core/qdev.c
 index 4b32f2f46d..13931b1117 100644
 --- a/hw/core/qdev.c
 +++ b/hw/core/qdev.c
@@ -1048,9 +1048,18 @@ static void device_unparent(Object *obj)
     }
 }
 +static char *
 +device_vmstate_if_get_id(VMStateIf *obj)
 +{
 +    DeviceState *dev = DEVICE(obj);
 +
 +    return qdev_get_dev_path(dev);
 +}
 +
 static void device_class_init(ObjectClass *class, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(class);
 +    VMStateIfClass *vc = VMSTATE_IF_CLASS(class);
     class->unparent = device_unparent;
@@ -1062,6 +1071,7 @@ static void device_class_init(ObjectClass *class, void *data)
      */
     dc->hotpluggable = true;
     dc->user_creatable = true;
 +    vc->get_id = device_vmstate_if_get_id;
 }
 void device_class_set_parent_reset(DeviceClass *dc,
@@ -1119,6 +1129,10 @@ static const TypeInfo device_type_info = {
     .class_init = device_class_init,
     .abstract = true,
     .class_size = sizeof(DeviceClass),
 +    .interfaces = (InterfaceInfo[]) {
 +        { TYPE_VMSTATE_IF },
 +        { }
 +    }
 };
 static void qdev_register_types(void)
 diff --git a/hw/core/vmstate-if.c b/hw/core/vmstate-if.c
 new file mode 100644
 index 0000000000..bf453620fe
 --- /dev/null
 +++ b/hw/core/vmstate-if.c
@@ -0,0 +1,23 @@
 +/*
 + * VMState interface
 + *
 + * Copyright (c) 2009-2019 Red Hat Inc
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "hw/vmstate-if.h"
 +
 +static const TypeInfo vmstate_if_info = {
 +    .name = TYPE_VMSTATE_IF,
 +    .parent = TYPE_INTERFACE,
 +    .class_size = sizeof(VMStateIfClass),
 +};
 +
 +static void vmstate_register_types(void)
 +{
 +    type_register_static(&vmstate_if_info);
 +}
 +
 +type_init(vmstate_register_types);
 diff --git a/include/hw/vmstate-if.h b/include/hw/vmstate-if.h
 new file mode 100644
 index 0000000000..8ff7f0f292
 --- /dev/null
 +++ b/include/hw/vmstate-if.h
@@ -0,0 +1,40 @@
 +/*
 + * VMState interface
 + *
 + * Copyright (c) 2009-2019 Red Hat Inc
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#ifndef VMSTATE_IF_H
 +#define VMSTATE_IF_H
 +
 +#include "qom/object.h"
 +
 +#define TYPE_VMSTATE_IF "vmstate-if"
 +
 +#define VMSTATE_IF_CLASS(klass)                                     \
 +    OBJECT_CLASS_CHECK(VMStateIfClass, (klass), TYPE_VMSTATE_IF)
 +#define VMSTATE_IF_GET_CLASS(obj)                           \
 +    OBJECT_GET_CLASS(VMStateIfClass, (obj), TYPE_VMSTATE_IF)
 +#define VMSTATE_IF(obj)                             \
 +    INTERFACE_CHECK(VMStateIf, (obj), TYPE_VMSTATE_IF)
 +
 +typedef struct VMStateIf VMStateIf;
 +
 +typedef struct VMStateIfClass {
 +    InterfaceClass parent_class;
 +
 +    char * (*get_id)(VMStateIf *obj);
 +} VMStateIfClass;
 +
 +static inline char *vmstate_if_get_id(VMStateIf *vmif)
 +{
 +    if (!vmif) {
 +        return NULL;
 +    }
 +
 +    return VMSTATE_IF_GET_CLASS(vmif)->get_id(vmif);
 +}
 +
 +#endif /* VMSTATE_IF_H */
 diff --git a/include/migration/register.h b/include/migration/register.h
 index f3ba10b6ef..158130c8c4 100644
 --- a/include/migration/register.h
 +++ b/include/migration/register.h
@@ -14,6 +14,8 @@
 #ifndef MIGRATION_REGISTER_H
 #define MIGRATION_REGISTER_H
 +#include "hw/vmstate-if.h"
 +
 typedef struct SaveVMHandlers {
     /* This runs inside the iothread lock.  */
     SaveStateHandler *save_state;
 diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
 index 8abd2e3b80..8cc1e19fd9 100644
 --- a/include/migration/vmstate.h
 +++ b/include/migration/vmstate.h
@@ -27,6 +27,8 @@
 #ifndef QEMU_VMSTATE_H
 #define QEMU_VMSTATE_H
 +#include "hw/vmstate-if.h"
 +
 typedef struct VMStateInfo VMStateInfo;
 typedef struct VMStateDescription VMStateDescription;
 typedef struct VMStateField VMStateField;
 diff --git a/tests/Makefile.include b/tests/Makefile.include
 index 3be60ab999..1c7772a230 100644
 --- a/tests/Makefile.include
 +++ b/tests/Makefile.include
@@ -566,6 +566,7 @@ tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/irq.o \
 	hw/core/fw-path-provider.o \
 	hw/core/reset.o \
 +	hw/core/vmstate-if.o \
 	$(test-qapi-obj-y)
 tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
 	migration/vmstate.o migration/vmstate-types.o migration/qemu-file.o \
 -- 
 2.27.0