!673 Qemu update to version 6.2.0-55

From: @yezengruan 
Reviewed-by: @aven6 
Signed-off-by: @aven6
openeuler-ci-bot 2022-11-04 02:32:44 +00:00 committed by Gitee
commit 4b7a282fcc
25 changed files with 3590 additions and 1 deletion

@@ -0,0 +1,113 @@
From 49cb3c9f3cc3a567ce2e6159bf27328c64b6601d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 23 Mar 2022 12:33:25 +0100
Subject: [PATCH 10/10] KVM: x86: workaround invalid CPUID[0xD,9] info on some
AMD processors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
mainline inclusion
from mainline-v7.0.0-rc2
commit 58f7db26f21c690cf9a669c314cfd7371506084a
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 58f7db26f21c ("KVM: x86: workaround invalid CPUID[0xD,9] info
on some AMD processors")
----------------------------------------------------------------
KVM: x86: workaround invalid CPUID[0xD,9] info on some AMD processors
Some AMD processors expose the PKRU extended save state even if they do not have
the related PKU feature in CPUID. Worse, when they do they report a size of
64, whereas the expected size of the PKRU extended save state is 8, therefore
the esa->size == eax assertion does not hold.
The state is already ignored by KVM_GET_SUPPORTED_CPUID because it
was not enabled in the host XCR0. However, QEMU kvm_cpu_xsave_init()
runs before QEMU invokes arch_prctl() to enable dynamically-enabled
save states such as XTILEDATA, and KVM_GET_SUPPORTED_CPUID hides save
states that have yet to be enabled. Therefore, kvm_cpu_xsave_init()
needs to consult the host CPUID instead of KVM_GET_SUPPORTED_CPUID,
and dies with an assertion failure.
When setting up the ExtSaveArea array to match the host, ignore features that
KVM does not report as supported. This will cause QEMU to skip the incorrect
CPUID leaf instead of tripping the assertion.
Closes: https://gitlab.com/qemu-project/qemu/-/issues/916
Reported-by: Daniel P. Berrangé <berrange@redhat.com>
Analyzed-by: Yang Zhong <yang.zhong@intel.com>
Reported-by: Peter Krempa <pkrempa@redhat.com>
Tested-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 4 ++--
target/i386/cpu.h | 2 ++
target/i386/kvm/kvm-cpu.c | 19 ++++++++++++-------
3 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1bc03d3eef..551b47ab1e 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4973,8 +4973,8 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp)
return cpu_list;
}
-static uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
- bool migratable_only)
+uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
+ bool migratable_only)
{
FeatureWordInfo *wi = &feature_word_info[w];
uint64_t r = 0;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index eaa99c302f..290f1beaea 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -605,6 +605,8 @@ typedef enum FeatureWord {
} FeatureWord;
typedef uint64_t FeatureWordArray[FEATURE_WORDS];
+uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
+ bool migratable_only);
/* cpuid_features bits */
#define CPUID_FP87 (1U << 0)
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index a35a1bf9fe..5eb955ce9a 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -99,13 +99,18 @@ static void kvm_cpu_xsave_init(void)
for (i = XSTATE_SSE_BIT + 1; i < XSAVE_STATE_AREA_COUNT; i++) {
ExtSaveArea *esa = &x86_ext_save_areas[i];
- if (esa->size) {
- host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx);
- if (eax != 0) {
- assert(esa->size == eax);
- esa->offset = ebx;
- esa->ecx = ecx;
- }
+ if (!esa->size) {
+ continue;
+ }
+ if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits)
+ != esa->bits) {
+ continue;
+ }
+ host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx);
+ if (eax != 0) {
+ assert(esa->size == eax);
+ esa->offset = ebx;
+ esa->ecx = ecx;
}
}
}
--
2.27.0
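
To make the failure concrete (values taken from the commit message above; CPUID[0xD] sub-leaf 9 is the PKRU save state):

/* On the affected AMD processors, before this patch:
 *   host_cpuid(0xd, 9, &eax, ...) -> eax = 64  (size reported by hardware)
 *   x86_ext_save_areas[9].size     =  8        (architectural PKRU size)
 * so assert(esa->size == eax) fired. With the fix, the feature-word check
 * skips sub-leaf 9 because KVM does not report PKU as supported there. */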

@@ -0,0 +1,25 @@
From 7b859a86cbdde8bf17619c43a6d4ae687a20f003 Mon Sep 17 00:00:00 2001
From: dinglimin <dinglimin@cmss.chinamobile.com>
Date: Wed, 29 Jun 2022 16:26:17 +0800
Subject: [PATCH] Remove the unused local variable "records".
Signed-off-by: dinglimin <dinglimin@cmss.chinamobile.com>
---
tests/migration/guestperf/engine.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tests/migration/guestperf/engine.py b/tests/migration/guestperf/engine.py
index 87a6ab2009..59fca2c70b 100644
--- a/tests/migration/guestperf/engine.py
+++ b/tests/migration/guestperf/engine.py
@@ -65,7 +65,6 @@ def _vcpu_timing(self, pid, tid_list):
return records
def _cpu_timing(self, pid):
- records = []
now = time.time()
jiffies_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
--
2.27.0

@@ -0,0 +1,25 @@
From e7ef56975af8553690afb16f32fe74d62762b853 Mon Sep 17 00:00:00 2001
From: dinglimin <dinglimin@cmss.chinamobile.com>
Date: Wed, 29 Jun 2022 14:02:59 +0800
Subject: [PATCH] Remove this redundant return.
Signed-off-by: dinglimin <dinglimin@cmss.chinamobile.com>
---
scripts/vmstate-static-checker.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/scripts/vmstate-static-checker.py b/scripts/vmstate-static-checker.py
index 539ead62b4..6838bf7e7c 100755
--- a/scripts/vmstate-static-checker.py
+++ b/scripts/vmstate-static-checker.py
@@ -367,7 +367,6 @@ def check_machine_type(s, d):
if s["Name"] != d["Name"]:
print("Warning: checking incompatible machine types:", end=' ')
print("\"" + s["Name"] + "\", \"" + d["Name"] + "\"")
- return
def main():
--
2.27.0

@@ -0,0 +1,66 @@
From 85583352f3bc28badd4cb336517f6a4eb440d5b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:34 +0800
Subject: [PATCH 2/3] accel/kvm/kvm-all: Introduce kvm_dirty_ring_size function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce the kvm_dirty_ring_size() utility function to help calculate
the dirty ring full time.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Acked-by: Peter Xu <peterx@redhat.com>
Message-Id: <f9ce1f550bfc0e3a1f711e17b1dbc8f701700e56.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
accel/kvm/kvm-all.c | 5 +++++
accel/stubs/kvm-stub.c | 5 +++++
include/sysemu/kvm.h | 2 ++
3 files changed, 12 insertions(+)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 3bc6eb6294..d0c4310507 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2332,6 +2332,11 @@ bool kvm_dirty_ring_enabled(void)
return kvm_state->kvm_dirty_ring_size ? true : false;
}
+uint32_t kvm_dirty_ring_size(void)
+{
+ return kvm_state->kvm_dirty_ring_size;
+}
+
static int kvm_init(MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5319573e00..1128cb2928 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -152,4 +152,9 @@ bool kvm_dirty_ring_enabled(void)
{
return false;
}
+
+uint32_t kvm_dirty_ring_size(void)
+{
+ return 0;
+}
#endif
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 2623775c27..19c5c8402a 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -549,4 +549,6 @@ bool kvm_cpu_check_are_resettable(void);
bool kvm_arch_cpu_check_are_resettable(void);
bool kvm_dirty_ring_enabled(void);
+
+uint32_t kvm_dirty_ring_size(void);
#endif
--
2.27.0
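
For context, the dirty page rate limit patches later in this series use this accessor to estimate how long a vCPU needs to fill its dirty ring; a minimal sketch of that calculation (it mirrors dirtylimit_dirty_ring_full_time() in the virtual CPU throttle patch below):

/* Estimated time, in microseconds, for a vCPU dirtying memory at
 * 'dirtyrate' MB/s to fill a dirty ring of kvm_dirty_ring_size()
 * entries, each entry covering one guest page. */
static int64_t ring_full_time_us(uint64_t dirtyrate)
{
    uint64_t ring_size_MB =
        (uint64_t)kvm_dirty_ring_size() * TARGET_PAGE_SIZE >> 20;

    return ring_size_MB * 1000000 / dirtyrate;
}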

@@ -0,0 +1,106 @@
From c6f781e50e75fc2e6b819291b6c5ce6c212f018b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:30 +0800
Subject: [PATCH 1/3] accel/kvm/kvm-all: Refactor per-vcpu dirty ring reaping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an optional 'CPUState' argument to kvm_dirty_ring_reap() so that
it can also cover the single-vCPU dirty-ring-reaping scenario.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <c32001242875e83b0d9f78f396fe2dcd380ba9e8.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
accel/kvm/kvm-all.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f2ce5cd45a..3bc6eb6294 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -773,17 +773,20 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
}
/* Must be with slots_lock held */
-static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
{
int ret;
- CPUState *cpu;
uint64_t total = 0;
int64_t stamp;
stamp = get_clock();
- CPU_FOREACH(cpu) {
- total += kvm_dirty_ring_reap_one(s, cpu);
+ if (cpu) {
+ total = kvm_dirty_ring_reap_one(s, cpu);
+ } else {
+ CPU_FOREACH(cpu) {
+ total += kvm_dirty_ring_reap_one(s, cpu);
+ }
}
if (total) {
@@ -804,7 +807,7 @@ static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
* Currently for simplicity, we must hold BQL before calling this. We can
* consider to drop the BQL if we're clear with all the race conditions.
*/
-static uint64_t kvm_dirty_ring_reap(KVMState *s)
+static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
uint64_t total;
@@ -824,7 +827,7 @@ static uint64_t kvm_dirty_ring_reap(KVMState *s)
* reset below.
*/
kvm_slots_lock();
- total = kvm_dirty_ring_reap_locked(s);
+ total = kvm_dirty_ring_reap_locked(s, cpu);
kvm_slots_unlock();
return total;
@@ -871,7 +874,7 @@ static void kvm_dirty_ring_flush(void)
* vcpus out in a synchronous way.
*/
kvm_cpu_synchronize_kick_all();
- kvm_dirty_ring_reap(kvm_state);
+ kvm_dirty_ring_reap(kvm_state, NULL);
trace_kvm_dirty_ring_flush(1);
}
@@ -1415,7 +1418,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
* Not easy. Let's cross the fingers until it's fixed.
*/
if (kvm_state->kvm_dirty_ring_size) {
- kvm_dirty_ring_reap_locked(kvm_state);
+ kvm_dirty_ring_reap_locked(kvm_state, NULL);
} else {
kvm_slot_get_dirty_log(kvm_state, mem);
}
@@ -1487,7 +1490,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
qemu_mutex_lock_iothread();
- kvm_dirty_ring_reap(s);
+ kvm_dirty_ring_reap(s, NULL);
qemu_mutex_unlock_iothread();
r->reaper_iteration++;
@@ -2957,7 +2960,7 @@ int kvm_cpu_exec(CPUState *cpu)
*/
trace_kvm_dirty_ring_full(cpu->cpu_index);
qemu_mutex_lock_iothread();
- kvm_dirty_ring_reap(kvm_state);
+ kvm_dirty_ring_reap(kvm_state, NULL);
qemu_mutex_unlock_iothread();
ret = 0;
break;
--
2.27.0
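
With the refactor in place, callers pick the reaping scope explicitly; the two call patterns visible in the hunks above are:

kvm_dirty_ring_reap(kvm_state, cpu);   /* reap only this vCPU's ring */
kvm_dirty_ring_reap(kvm_state, NULL);  /* reap the rings of all vCPUs */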

@@ -0,0 +1,73 @@
From 6e057dd5df580f0e525d808f5476ee973280371d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:31 +0800
Subject: [PATCH 2/3] cpus: Introduce cpu_list_generation_id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce cpu_list_generation_id to track the generation of the CPU
list, so that CPU hotplug/unplug can be detected during dirty page
rate measurement. The counter changes whenever the CPU list changes,
which the dirty page rate measurement below relies on.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <06e1f1362b2501a471dce796abb065b04f320fa5.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
cpus-common.c | 8 ++++++++
include/exec/cpu-common.h | 1 +
2 files changed, 9 insertions(+)
diff --git a/cpus-common.c b/cpus-common.c
index 6e73d3e58d..31c6415f37 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -73,6 +73,12 @@ static int cpu_get_free_index(void)
}
CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
+static unsigned int cpu_list_generation_id;
+
+unsigned int cpu_list_generation_id_get(void)
+{
+ return cpu_list_generation_id;
+}
void cpu_list_add(CPUState *cpu)
{
@@ -84,6 +90,7 @@ void cpu_list_add(CPUState *cpu)
assert(!cpu_index_auto_assigned);
}
QTAILQ_INSERT_TAIL_RCU(&cpus, cpu, node);
+ cpu_list_generation_id++;
}
void cpu_list_remove(CPUState *cpu)
@@ -96,6 +103,7 @@ void cpu_list_remove(CPUState *cpu)
QTAILQ_REMOVE_RCU(&cpus, cpu, node);
cpu->cpu_index = UNASSIGNED_CPU_INDEX;
+ cpu_list_generation_id++;
}
CPUState *qemu_get_cpu(int index)
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 039d422bf4..cdee668f20 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -11,6 +11,7 @@
void qemu_init_cpu_list(void);
void cpu_list_lock(void);
void cpu_list_unlock(void);
+unsigned int cpu_list_generation_id_get(void);
void tcg_flush_softmmu_tlb(CPUState *cs);
--
2.27.0
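
The dirty page rate patch later in the series consumes this counter in a lock/sample/compare/retry pattern; in outline (a sketch of the loop in vcpu_calculate_dirtyrate()):

retry:
    cpu_list_lock();
    gen_id = cpu_list_generation_id_get();
    /* ... allocate per-vCPU records and take the first sample ... */
    cpu_list_unlock();

    /* ... wait out the measurement period ... */

    cpu_list_lock();
    if (gen_id != cpu_list_generation_id_get()) {
        /* a vCPU was plugged or unplugged meanwhile: discard and retry */
        g_free(records);
        cpu_list_unlock();
        goto retry;
    }
    /* ... take the second sample and compute the rates ... */
    cpu_list_unlock();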

@@ -0,0 +1,25 @@
From 58471cd8dcf8e6a66113ddf9bb4ac45c89bbd57b Mon Sep 17 00:00:00 2001
From: lifeng 71117973 <lif121@chinatelecom.cn>
Date: Wed, 2 Nov 2022 11:19:55 +0800
Subject: [PATCH 1/2] fix compilation errors of sw64 architecture on x86
platform
---
target/sw64/float_helper.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/target/sw64/float_helper.c b/target/sw64/float_helper.c
index ad1c3cce48..c8e0845afc 100644
--- a/target/sw64/float_helper.c
+++ b/target/sw64/float_helper.c
@@ -653,7 +653,6 @@ void helper_ieee_input(CPUSW64State *env, uint64_t val)
{
#ifndef CONFIG_USER_ONLY
uint32_t exp = (uint32_t)(val >> 52) & 0x7ff;
- uint64_t frac = val & 0xfffffffffffffull;
if (exp == 0x7ff) {
/* Infinity or NaN. */
--
2.27.0

@@ -0,0 +1,28 @@
From cf6be03a1f5b7595a2ecada71fa8aa30de744703 Mon Sep 17 00:00:00 2001
From: lifeng 71117973 <lif121@chinatelecom.cn>
Date: Wed, 2 Nov 2022 17:20:50 +0800
Subject: [PATCH 2/2] fixed the error that no bios file soft link was created
in the build directory when compiling the sw64 architecture
---
configure | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/configure b/configure
index 9569d7a3d0..0ae7bcf065 100755
--- a/configure
+++ b/configure
@@ -3861,7 +3861,9 @@ for bios_file in \
$source_path/pc-bios/u-boot.* \
$source_path/pc-bios/edk2-*.fd.bz2 \
$source_path/pc-bios/palcode-* \
- $source_path/pc-bios/qemu_vga.ndrv
+ $source_path/pc-bios/qemu_vga.ndrv \
+ $source_path/pc-bios/core* \
+ $source_path/pc-bios/uefi-bios-sw
do
LINKS="$LINKS pc-bios/$(basename $bios_file)"
--
2.27.0

@@ -0,0 +1,34 @@
From 4f66d261c0f20189e387de57baca17167cc542ab Mon Sep 17 00:00:00 2001
From: Andy Pei <andy.pei@intel.com>
Date: Mon, 3 Jan 2022 17:28:12 +0800
Subject: [PATCH] hw/vhost-user-blk: turn on VIRTIO_BLK_F_SIZE_MAX feature for
virtio blk device
Turn on the pre-defined feature VIRTIO_BLK_F_SIZE_MAX for the virtio-blk
device to avoid guest DMA request sizes that are too large for the hardware.
Signed-off-by: dinglimin <dinglimin@cmss.chinamobile.com>
Signed-off-by: Andy Pei <andy.pei@intel.com>
Message-Id: <1641202092-149677-1-git-send-email-andy.pei@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
---
hw/block/vhost-user-blk.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index ba13cb87e5..eb1264afc7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -252,6 +252,7 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev,
VHostUserBlk *s = VHOST_USER_BLK(vdev);
/* Turn on pre-defined features */
+ virtio_add_feature(&features, VIRTIO_BLK_F_SIZE_MAX);
virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
--
2.27.0
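
For reference (virtio specification background, not part of this patch): once VIRTIO_BLK_F_SIZE_MAX is negotiated, the guest driver must cap each request segment at the size_max field of the device configuration space, whose leading layout is:

struct virtio_blk_config {
    uint64_t capacity;   /* device size, in 512-byte sectors */
    uint32_t size_max;   /* max segment size; meaningful with VIRTIO_BLK_F_SIZE_MAX */
    uint32_t seg_max;    /* max segments per request; with VIRTIO_BLK_F_SEG_MAX */
    /* ... */
};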

@@ -0,0 +1,65 @@
From d6398243714a7a775c64e74dbd63c00863cb7e83 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Feb 2022 17:58:11 +0100
Subject: [PATCH 01/10] linux-headers: include missing changes from 5.17
mainline inclusion
from mainline-v7.0.0-rc0
commit 1ea5208febcc068449b63282d72bb719ab67a466
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 1ea5208febcc ("linux-headers: include missing changes from 5.17")
------------------------------------------------
linux-headers: include missing changes from 5.17
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
linux-headers/asm-x86/kvm.h | 3 +++
linux-headers/linux/kvm.h | 7 +++++++
2 files changed, 10 insertions(+)
diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index a6c327f8ad..2ab4f1818a 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -437,6 +437,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP 0
+
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 5d8e42b8f8..7870cd0280 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1112,6 +1112,10 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_BINARY_STATS_FD 203
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
+#define KVM_CAP_VM_GPA_BITS 207
+#define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
#define KVM_CAP_ARM_CPU_FEATURE 555
@@ -2006,4 +2010,7 @@ struct kvm_stats_desc {
#define KVM_GET_STATS_FD _IO(KVMIO, 0xce)
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
+
#endif /* __LINUX_KVM_H */
--
2.27.0
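
As background on the new constants (a sketch of intended usage, not code from this patch; the actual consumer is Patch0326, "x86: add support for KVM_CAP_XSAVE2 and AMX state migration", listed in the spec file below): querying the KVM_CAP_XSAVE2 capability returns the buffer size the kernel needs for KVM_GET_XSAVE2, which can exceed the fixed 4 KiB struct kvm_xsave once dynamically-enabled state such as XTILEDATA is in use.

int size = kvm_check_extension(kvm_state, KVM_CAP_XSAVE2);
if (size > (int)sizeof(struct kvm_xsave)) {
    struct kvm_xsave *xsave = g_malloc0(size);
    int ret = kvm_vcpu_ioctl(cpu, KVM_GET_XSAVE2, xsave);  /* cpu: CPUState * */
    if (ret < 0) {
        /* ... handle the error ... */
    }
    /* ... unpack the extended save areas, then g_free(xsave) ... */
}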

@@ -0,0 +1,399 @@
From b6d1e022b7bb06faf2dcad3062b7061b59ef68a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:32 +0800
Subject: [PATCH 3/3] migration/dirtyrate: Refactor dirty page rate calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Abstract the dirty log change logic into the function
global_dirty_log_change().
Abstract the dirty-ring-based dirty page rate calculation into the
function vcpu_calculate_dirtyrate().
Abstract the mathematical dirty page rate calculation into
do_calculate_dirtyrate(), decoupling it from DirtyStat.
Rename set_sample_page_period() to dirty_stat_wait(), which is better
understood and will be reused in dirtylimit.
Handle the CPU hotplug/unplug scenario during dirty page rate
measurement.
Export the utility functions outside migration.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <7b6f6f4748d5b3d017b31a0429e630229ae97538.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
include/sysemu/dirtyrate.h | 28 +++++
migration/dirtyrate.c | 227 +++++++++++++++++++++++--------------
migration/dirtyrate.h | 7 +-
3 files changed, 174 insertions(+), 88 deletions(-)
create mode 100644 include/sysemu/dirtyrate.h
diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
new file mode 100644
index 0000000000..4d3b9a4902
--- /dev/null
+++ b/include/sysemu/dirtyrate.h
@@ -0,0 +1,28 @@
+/*
+ * dirty page rate helper functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_DIRTYRATE_H
+#define QEMU_DIRTYRATE_H
+
+typedef struct VcpuStat {
+ int nvcpu; /* number of vcpu */
+ DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot);
+
+void global_dirty_log_change(unsigned int flag,
+ bool start);
+#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 8043bc7946..c449095fc3 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
static DirtyRateMeasureMode dirtyrate_mode =
DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
-static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
+static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
{
int64_t current_time;
@@ -60,6 +60,132 @@ static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
return msec;
}
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+ if (start) {
+ dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+ } else {
+ dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+ }
+}
+
+static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
+ int64_t calc_time_ms)
+{
+ uint64_t memory_size_MB;
+ uint64_t increased_dirty_pages =
+ dirty_pages.end_pages - dirty_pages.start_pages;
+
+ memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+
+ return memory_size_MB * 1000 / calc_time_ms;
+}
+
+void global_dirty_log_change(unsigned int flag, bool start)
+{
+ qemu_mutex_lock_iothread();
+ if (start) {
+ memory_global_dirty_log_start(flag);
+ } else {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+/*
+ * global_dirty_log_sync
+ * 1. sync dirty log from kvm
+ * 2. stop dirty tracking if needed.
+ */
+static void global_dirty_log_sync(unsigned int flag, bool one_shot)
+{
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_sync();
+ if (one_shot) {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
+{
+ CPUState *cpu;
+ DirtyPageRecord *records;
+ int nvcpu = 0;
+
+ CPU_FOREACH(cpu) {
+ nvcpu++;
+ }
+
+ stat->nvcpu = nvcpu;
+ stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+
+ records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
+
+ return records;
+}
+
+static void vcpu_dirty_stat_collect(VcpuStat *stat,
+ DirtyPageRecord *records,
+ bool start)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ record_dirtypages(records, cpu, start);
+ }
+}
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+ DirtyPageRecord *records;
+ int64_t init_time_ms;
+ int64_t duration;
+ int64_t dirtyrate;
+ int i = 0;
+ unsigned int gen_id;
+
+retry:
+ init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ cpu_list_lock();
+ gen_id = cpu_list_generation_id_get();
+ records = vcpu_dirty_stat_alloc(stat);
+ vcpu_dirty_stat_collect(stat, records, true);
+ cpu_list_unlock();
+
+ duration = dirty_stat_wait(calc_time_ms, init_time_ms);
+
+ global_dirty_log_sync(flag, one_shot);
+
+ cpu_list_lock();
+ if (gen_id != cpu_list_generation_id_get()) {
+ g_free(records);
+ g_free(stat->rates);
+ cpu_list_unlock();
+ goto retry;
+ }
+ vcpu_dirty_stat_collect(stat, records, false);
+ cpu_list_unlock();
+
+ for (i = 0; i < stat->nvcpu; i++) {
+ dirtyrate = do_calculate_dirtyrate(records[i], duration);
+
+ stat->rates[i].id = i;
+ stat->rates[i].dirty_rate = dirtyrate;
+
+ trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
+ }
+
+ g_free(records);
+
+ return duration;
+}
+
static bool is_sample_period_valid(int64_t sec)
{
if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
@@ -396,44 +522,6 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
return true;
}
-static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
- CPUState *cpu, bool start)
-{
- if (start) {
- dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
- } else {
- dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
- }
-}
-
-static void dirtyrate_global_dirty_log_start(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static void dirtyrate_global_dirty_log_stop(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_sync();
- memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
-{
- uint64_t memory_size_MB;
- int64_t time_s;
- uint64_t increased_dirty_pages =
- dirty_pages.end_pages - dirty_pages.start_pages;
-
- memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
- time_s = DirtyStat.calc_time;
-
- return memory_size_MB / time_s;
-}
-
static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
bool start)
{
@@ -444,11 +532,6 @@ static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
}
}
-static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
-{
- DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
-}
-
static inline void dirtyrate_manual_reset_protect(void)
{
RAMBlock *block = NULL;
@@ -492,71 +575,49 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
DirtyStat.start_time = start_time / 1000;
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
+ msec = dirty_stat_wait(msec, start_time);
DirtyStat.calc_time = msec / 1000;
/*
- * dirtyrate_global_dirty_log_stop do two things.
+ * do two things.
* 1. fetch dirty bitmap from kvm
* 2. stop dirty tracking
*/
- dirtyrate_global_dirty_log_stop();
+ global_dirty_log_sync(GLOBAL_DIRTY_DIRTY_RATE, true);
record_dirtypages_bitmap(&dirty_pages, false);
- do_calculate_dirtyrate_bitmap(dirty_pages);
+ DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
}
static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
{
- CPUState *cpu;
- int64_t msec = 0;
- int64_t start_time;
+ int64_t duration;
uint64_t dirtyrate = 0;
uint64_t dirtyrate_sum = 0;
- DirtyPageRecord *dirty_pages;
- int nvcpu = 0;
int i = 0;
- CPU_FOREACH(cpu) {
- nvcpu++;
- }
-
- dirty_pages = g_new(DirtyPageRecord, nvcpu);
-
- DirtyStat.dirty_ring.nvcpu = nvcpu;
- DirtyStat.dirty_ring.rates = g_new(DirtyRateVcpu, nvcpu);
-
- dirtyrate_global_dirty_log_start();
-
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, true);
- }
-
- start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- DirtyStat.start_time = start_time / 1000;
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);
- msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
- DirtyStat.calc_time = msec / 1000;
+ DirtyStat.start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
- dirtyrate_global_dirty_log_stop();
+ /* calculate vcpu dirtyrate */
+ duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000,
+ &DirtyStat.dirty_ring,
+ GLOBAL_DIRTY_DIRTY_RATE,
+ true);
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, false);
- }
+ DirtyStat.calc_time = duration / 1000;
+ /* calculate vm dirtyrate */
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
- dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
- trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
-
- DirtyStat.dirty_ring.rates[i].id = i;
+ dirtyrate = DirtyStat.dirty_ring.rates[i].dirty_rate;
DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
dirtyrate_sum += dirtyrate;
}
DirtyStat.dirty_rate = dirtyrate_sum;
- g_free(dirty_pages);
}
static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
@@ -574,7 +635,7 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
rcu_read_unlock();
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, initial_time);
+ msec = dirty_stat_wait(msec, initial_time);
DirtyStat.start_time = initial_time / 1000;
DirtyStat.calc_time = msec / 1000;
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index 69d4c5b865..594a5c0bb6 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -13,6 +13,8 @@
#ifndef QEMU_MIGRATION_DIRTYRATE_H
#define QEMU_MIGRATION_DIRTYRATE_H
+#include "sysemu/dirtyrate.h"
+
/*
* Sample 512 pages per GB as default.
*/
@@ -65,11 +67,6 @@ typedef struct SampleVMStat {
uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
} SampleVMStat;
-typedef struct VcpuStat {
- int nvcpu; /* number of vcpu */
- DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
-} VcpuStat;
-
/*
* Store calculation statistics for each measure.
*/
--
2.27.0
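
As a quick sanity check of do_calculate_dirtyrate() above: with 4 KiB pages, a vCPU that dirties 256,000 distinct pages during a 1000 ms measurement window yields

memory_size_MB = (256000 * 4096) >> 20        = 1000
dirty rate     = memory_size_MB * 1000 / 1000 = 1000 MB/s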

@@ -0,0 +1,48 @@
From 7cb2d342b9073ec9548202df6e1fb25fa4997d71 Mon Sep 17 00:00:00 2001
From: jianchunfu <jianchunfu_yewu@cmss.chinamobile.com>
Date: Thu, 30 Jun 2022 11:34:50 +0000
Subject: [PATCH] migration/dirtyrate: Replace malloc with g_new
Use the g_new() macro to handle potential memory allocation failures
in dirtyrate.
---
migration/dirtyrate.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index d65e744af9..8043bc7946 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -157,7 +157,7 @@ static void cleanup_dirtyrate_stat(struct DirtyRateConfig config)
{
/* last calc-dirty-rate qmp use dirty ring mode */
if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
- free(DirtyStat.dirty_ring.rates);
+ g_free(DirtyStat.dirty_ring.rates);
DirtyStat.dirty_ring.rates = NULL;
}
}
@@ -522,10 +522,10 @@ static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
nvcpu++;
}
- dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
+ dirty_pages = g_new(DirtyPageRecord, nvcpu);
DirtyStat.dirty_ring.nvcpu = nvcpu;
- DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
+ DirtyStat.dirty_ring.rates = g_new(DirtyRateVcpu, nvcpu);
dirtyrate_global_dirty_log_start();
@@ -556,7 +556,7 @@ static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
}
DirtyStat.dirty_rate = dirtyrate_sum;
- free(dirty_pages);
+ g_free(dirty_pages);
}
static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
--
2.27.0
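
A note on the replacement's semantics: unlike malloc(), GLib's g_new(type, n) aborts the process on allocation failure instead of returning NULL, computes the size as n * sizeof(type), and returns an already-typed pointer, so the call sites above need neither NULL checks nor casts:

DirtyPageRecord *dirty_pages = g_new(DirtyPageRecord, nvcpu);  /* never NULL */
/* ... */
g_free(dirty_pages);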

@@ -1,6 +1,6 @@
Name: qemu
Version: 6.2.0
-Release: 54
+Release: 55
Epoch: 10
Summary: QEMU is a generic and open source machine emulator and virtualizer
License: GPLv2 and BSD and MIT and CC-BY-SA-4.0
@@ -317,6 +317,30 @@ Patch0304: pci-expose-TYPE_XIO3130_DOWNSTREAM-name.patch
Patch0305: acpi-pcihp-pcie-set-power-on-cap-on-parent-slot.patch
Patch0306: hw-display-ati_2d-Fix-buffer-overflow-in-ati_2d_blt-.patch
Patch0307: ui-vnc-clipboard-fix-integer-underflow-in-vnc_client.patch
+Patch0308: Remove-the-unused-local-variable-records.patch
+Patch0309: Remove-this-redundant-return.patch
+Patch0310: hw-vhost-user-blk-turn-on-VIRTIO_BLK_F_SIZE_MAX-feat.patch
+Patch0311: migration-dirtyrate-Replace-malloc-with-g_new.patch
+Patch0312: accel-kvm-kvm-all-Refactor-per-vcpu-dirty-ring-reapi.patch
+Patch0313: cpus-Introduce-cpu_list_generation_id.patch
+Patch0314: migration-dirtyrate-Refactor-dirty-page-rate-calcula.patch
+Patch0315: softmmu-dirtylimit-Implement-vCPU-dirtyrate-calculat.patch
+Patch0316: accel-kvm-kvm-all-Introduce-kvm_dirty_ring_size-func.patch
+Patch0317: softmmu-dirtylimit-Implement-virtual-CPU-throttle.patch
+Patch0318: softmmu-dirtylimit-Implement-dirty-page-rate-limit.patch
+Patch0319: tests-Add-dirty-page-rate-limit-test.patch
+Patch0320: linux-headers-include-missing-changes-from-5.17.patch
+Patch0321: x86-Fix-the-64-byte-boundary-enumeration-for-extende.patch
+Patch0322: x86-Add-AMX-XTILECFG-and-XTILEDATA-components.patch
+Patch0323: x86-Grant-AMX-permission-for-guest.patch
+Patch0324: x86-Add-XFD-faulting-bit-for-state-components.patch
+Patch0325: x86-Add-AMX-CPUIDs-enumeration.patch
+Patch0326: x86-add-support-for-KVM_CAP_XSAVE2-and-AMX-state-mig.patch
+Patch0327: x86-Support-XFD-and-AMX-xsave-data-migration.patch
+Patch0328: target-i386-kvm-do-not-access-uninitialized-variable.patch
+Patch0329: KVM-x86-workaround-invalid-CPUID-0xD-9-info-on-some-.patch
+Patch0330: fix-compilation-errors-of-sw64-architecture-on-x86-p.patch
+Patch0331: fixed-the-error-that-no-bios-file-soft-link-was-crea.patch

BuildRequires: flex
BuildRequires: gcc
@@ -831,6 +855,11 @@ getent passwd qemu >/dev/null || \
%endif

%changelog
+* Thu Nov 03 2022 yezengruan <yezengruan@huawei.com> - 10:6.2.0-55
+- support dirty restraint on vCPU
+- support SPR AMX in Qemu
+- fix compilation errors of sw64
+
* Mon Oct 24 2022 fushanqing <fushanqing@kylinos.cn> - 10:6.2.0-54
- add '--enable-slirp' compilation options

@@ -0,0 +1,435 @@
From 39d9c1f6de01abf003980f4c2fe3c08f9e6cd60c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:36 +0800
Subject: [PATCH] softmmu/dirtylimit: Implement dirty page rate limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Implement periodic dirty page rate calculation based on the dirty
ring, and throttle the virtual CPU until it reaches the quota dirty
page rate given by the user.
Introduce the QMP commands "set-vcpu-dirty-limit",
"cancel-vcpu-dirty-limit" and "query-vcpu-dirty-limit"
to enable, disable and query the dirty page limit for virtual CPUs.
Meanwhile, introduce the corresponding HMP commands
"set_vcpu_dirty_limit", "cancel_vcpu_dirty_limit" and
"info vcpu_dirty_limit" so the feature is easier to use.
"query-vcpu-dirty-limit" only succeeds when the dirty page rate limit
is enabled, so add it to the list of skipped commands to keep
qmp-cmd-test passing.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Acked-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <4143f26706d413dd29db0b672fe58b3d3fbe34bc.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
hmp-commands-info.hx | 13 +++
hmp-commands.hx | 32 ++++++
include/monitor/hmp.h | 3 +
qapi/migration.json | 80 +++++++++++++++
softmmu/dirtylimit.c | 194 +++++++++++++++++++++++++++++++++++++
tests/qtest/qmp-cmd-test.c | 2 +
6 files changed, 324 insertions(+)
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 407a1da800..5dd3001af0 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -863,6 +863,19 @@ SRST
Display the vcpu dirty rate information.
ERST
+ {
+ .name = "vcpu_dirty_limit",
+ .args_type = "",
+ .params = "",
+ .help = "show dirty page limit information of all vCPU",
+ .cmd = hmp_info_vcpu_dirty_limit,
+ },
+
+SRST
+ ``info vcpu_dirty_limit``
+ Display the vcpu dirty page limit information.
+ERST
+
#if defined(TARGET_I386)
{
.name = "sgx",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 70a9136ac2..5bedee2d49 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1744,3 +1744,35 @@ ERST
"\n\t\t\t -b to specify dirty bitmap as method of calculation)",
.cmd = hmp_calc_dirty_rate,
},
+
+SRST
+``set_vcpu_dirty_limit``
+ Set dirty page rate limit on virtual CPU, the information about all the
+ virtual CPU dirty limit status can be observed with ``info vcpu_dirty_limit``
+ command.
+ERST
+
+ {
+ .name = "set_vcpu_dirty_limit",
+ .args_type = "dirty_rate:l,cpu_index:l?",
+ .params = "dirty_rate [cpu_index]",
+ .help = "set dirty page rate limit, use cpu_index to set limit"
+ "\n\t\t\t\t\t on a specified virtual cpu",
+ .cmd = hmp_set_vcpu_dirty_limit,
+ },
+
+SRST
+``cancel_vcpu_dirty_limit``
+ Cancel dirty page rate limit on virtual CPU, the information about all the
+ virtual CPU dirty limit status can be observed with ``info vcpu_dirty_limit``
+ command.
+ERST
+
+ {
+ .name = "cancel_vcpu_dirty_limit",
+ .args_type = "cpu_index:l?",
+ .params = "[cpu_index]",
+ .help = "cancel dirty page rate limit, use cpu_index to cancel"
+ "\n\t\t\t\t\t limit on a specified virtual cpu",
+ .cmd = hmp_cancel_vcpu_dirty_limit,
+ },
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 96d014826a..478820e54f 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -131,6 +131,9 @@ void hmp_replay_delete_break(Monitor *mon, const QDict *qdict);
void hmp_replay_seek(Monitor *mon, const QDict *qdict);
void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
void hmp_human_readable_text_helper(Monitor *mon,
HumanReadableText *(*qmp_handler)(Error **));
diff --git a/qapi/migration.json b/qapi/migration.json
index d4ebc5f028..fee266017d 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1874,6 +1874,86 @@
##
{ 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
+##
+# @DirtyLimitInfo:
+#
+# Dirty page rate limit information of a virtual CPU.
+#
+# @cpu-index: index of a virtual CPU.
+#
+# @limit-rate: upper limit of dirty page rate (MB/s) for a virtual
+# CPU, 0 means unlimited.
+#
+# @current-rate: current dirty page rate (MB/s) for a virtual CPU.
+#
+# Since: 6.2
+#
+##
+{ 'struct': 'DirtyLimitInfo',
+ 'data': { 'cpu-index': 'int',
+ 'limit-rate': 'uint64',
+ 'current-rate': 'uint64' } }
+
+##
+# @set-vcpu-dirty-limit:
+#
+# Set the upper limit of dirty page rate for virtual CPUs.
+#
+# Requires KVM with accelerator property "dirty-ring-size" set.
+# A virtual CPU's dirty page rate is a measure of its memory load.
+# To observe dirty page rates, use @calc-dirty-rate.
+#
+# @cpu-index: index of a virtual CPU, default is all.
+#
+# @dirty-rate: upper limit of dirty page rate (MB/s) for virtual CPUs.
+#
+# Since: 6.2
+#
+# Example:
+# {"execute": "set-vcpu-dirty-limit"}
+# "arguments": { "dirty-rate": 200,
+# "cpu-index": 1 } }
+#
+##
+{ 'command': 'set-vcpu-dirty-limit',
+ 'data': { '*cpu-index': 'int',
+ 'dirty-rate': 'uint64' } }
+
+##
+# @cancel-vcpu-dirty-limit:
+#
+# Cancel the upper limit of dirty page rate for virtual CPUs.
+#
+# Cancel the dirty page limit for the vCPU which has been set with
+# set-vcpu-dirty-limit command. Note that this command requires
+# support from dirty ring, same as the "set-vcpu-dirty-limit".
+#
+# @cpu-index: index of a virtual CPU, default is all.
+#
+# Since: 6.2
+#
+# Example:
+# {"execute": "cancel-vcpu-dirty-limit"}
+# "arguments": { "cpu-index": 1 } }
+#
+##
+{ 'command': 'cancel-vcpu-dirty-limit',
+ 'data': { '*cpu-index': 'int'} }
+
+##
+# @query-vcpu-dirty-limit:
+#
+# Returns information about virtual CPU dirty page rate limits, if any.
+#
+# Since: 6.2
+#
+# Example:
+# {"execute": "query-vcpu-dirty-limit"}
+#
+##
+{ 'command': 'query-vcpu-dirty-limit',
+ 'returns': [ 'DirtyLimitInfo' ] }
+
##
# @snapshot-save:
#
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index e5a4f970bd..8d98cb7f2c 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -14,8 +14,12 @@
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qapi/qapi-commands-migration.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/error.h"
#include "sysemu/dirtyrate.h"
#include "sysemu/dirtylimit.h"
+#include "monitor/hmp.h"
+#include "monitor/monitor.h"
#include "exec/memory.h"
#include "hw/boards.h"
#include "sysemu/kvm.h"
@@ -405,3 +409,193 @@ void dirtylimit_vcpu_execute(CPUState *cpu)
usleep(cpu->throttle_us_per_full);
}
}
+
+static void dirtylimit_init(void)
+{
+ dirtylimit_state_initialize();
+ dirtylimit_change(true);
+ vcpu_dirty_rate_stat_initialize();
+ vcpu_dirty_rate_stat_start();
+}
+
+static void dirtylimit_cleanup(void)
+{
+ vcpu_dirty_rate_stat_stop();
+ vcpu_dirty_rate_stat_finalize();
+ dirtylimit_change(false);
+ dirtylimit_state_finalize();
+}
+
+void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirtylimit_in_service()) {
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, 0, false);
+ } else {
+ dirtylimit_set_all(0, false);
+ }
+
+ if (!dirtylimit_state->limited_nvcpu) {
+ dirtylimit_cleanup();
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+ qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
+ "dirty limit for virtual CPU]\n");
+}
+
+void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ uint64_t dirty_rate,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ error_setg(errp, "dirty page limit feature requires KVM with"
+ " accelerator property 'dirty-ring-size' set'");
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirty_rate) {
+ qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_init();
+ }
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
+ } else {
+ dirtylimit_set_all(dirty_rate, true);
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+ qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
+ "dirty limit for virtual CPU]\n");
+}
+
+static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
+{
+ DirtyLimitInfo *info = NULL;
+
+ info = g_malloc0(sizeof(*info));
+ info->cpu_index = cpu_index;
+ info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ info->current_rate = vcpu_dirty_rate_get(cpu_index);
+
+ return info;
+}
+
+static struct DirtyLimitInfoList *dirtylimit_query_all(void)
+{
+ int i, index;
+ DirtyLimitInfo *info = NULL;
+ DirtyLimitInfoList *head = NULL, **tail = &head;
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return NULL;
+ }
+
+ for (i = 0; i < dirtylimit_state->max_cpus; i++) {
+ index = dirtylimit_state->states[i].cpu_index;
+ if (dirtylimit_vcpu_get_state(index)->enabled) {
+ info = dirtylimit_query_vcpu(index);
+ QAPI_LIST_APPEND(tail, info);
+ }
+ }
+
+ dirtylimit_state_unlock();
+
+ return head;
+}
+
+struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
+{
+ if (!dirtylimit_in_service()) {
+ return NULL;
+ }
+
+ return dirtylimit_query_all();
+}
+
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ DirtyLimitInfoList *limit, *head, *info = NULL;
+ Error *err = NULL;
+
+ if (!dirtylimit_in_service()) {
+ monitor_printf(mon, "Dirty page limit not enabled!\n");
+ return;
+ }
+
+ info = qmp_query_vcpu_dirty_limit(&err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ head = info;
+ for (limit = head; limit != NULL; limit = limit->next) {
+ monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
+ " current rate %"PRIi64 " (MB/s)\n",
+ limit->value->cpu_index,
+ limit->value->limit_rate,
+ limit->value->current_rate);
+ }
+
+ g_free(info);
+}
diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
index 7f103ea3fd..4b216a0435 100644
--- a/tests/qtest/qmp-cmd-test.c
+++ b/tests/qtest/qmp-cmd-test.c
@@ -110,6 +110,8 @@ static bool query_is_ignored(const char *cmd)
"query-sev-capabilities",
"query-sgx",
"query-sgx-capabilities",
+ /* Success depends on enabling dirty page rate limit */
+ "query-vcpu-dirty-limit",
NULL
};
int i;
--
2.27.0
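
Putting the new interface together, a QMP session could look like the following (illustrative values; assumes QEMU was started with the KVM accelerator property dirty-ring-size set, e.g. -accel kvm,dirty-ring-size=4096):

-> {"execute": "set-vcpu-dirty-limit", "arguments": {"dirty-rate": 200}}
<- {"return": {}}
-> {"execute": "query-vcpu-dirty-limit"}
<- {"return": [{"cpu-index": 0, "limit-rate": 200, "current-rate": 62}, ...]}
-> {"execute": "cancel-vcpu-dirty-limit"}
<- {"return": {}}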

@@ -0,0 +1,214 @@
From 1c1049bda8e91cc6015c32fc7cc9d0f16ad46b58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:33 +0800
Subject: [PATCH 1/3] softmmu/dirtylimit: Implement vCPU dirtyrate calculation
periodically
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce GLOBAL_DIRTY_LIMIT, a third consumer of dirty tracking, to
calculate the dirty page rate periodically for the dirty page rate
limit.
Add dirtylimit.c, which implements the periodic dirty page rate
calculation used by the dirty page rate limit.
Add dirtylimit.h to export the utility functions for the dirty page
rate limit implementation.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <5d0d641bffcb9b1c4cc3e323b6dfecb36050d948.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
include/exec/memory.h | 5 +-
include/sysemu/dirtylimit.h | 22 +++++++
softmmu/dirtylimit.c | 116 ++++++++++++++++++++++++++++++++++++
softmmu/meson.build | 1 +
4 files changed, 143 insertions(+), 1 deletion(-)
create mode 100644 include/sysemu/dirtylimit.h
create mode 100644 softmmu/dirtylimit.c
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 3e84d62e40..4326d74b95 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -69,7 +69,10 @@ static inline void fuzz_dma_read_cb(size_t addr,
/* Dirty tracking enabled because measuring dirty rate */
#define GLOBAL_DIRTY_DIRTY_RATE (1U << 1)
-#define GLOBAL_DIRTY_MASK (0x3)
+/* Dirty tracking enabled because dirty limit */
+#define GLOBAL_DIRTY_LIMIT (1U << 2)
+
+#define GLOBAL_DIRTY_MASK (0x7)
extern unsigned int global_dirty_tracking;
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
new file mode 100644
index 0000000000..da459f03d6
--- /dev/null
+++ b/include/sysemu/dirtylimit.h
@@ -0,0 +1,22 @@
+/*
+ * Dirty page rate limit common functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_DIRTYRLIMIT_H
+#define QEMU_DIRTYRLIMIT_H
+
+#define DIRTYLIMIT_CALC_TIME_MS 1000 /* 1000ms */
+
+int64_t vcpu_dirty_rate_get(int cpu_index);
+void vcpu_dirty_rate_stat_start(void);
+void vcpu_dirty_rate_stat_stop(void);
+void vcpu_dirty_rate_stat_initialize(void);
+void vcpu_dirty_rate_stat_finalize(void);
+#endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
new file mode 100644
index 0000000000..ebdc064c9d
--- /dev/null
+++ b/softmmu/dirtylimit.c
@@ -0,0 +1,116 @@
+/*
+ * Dirty page rate limit implementation code
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "qapi/qapi-commands-migration.h"
+#include "sysemu/dirtyrate.h"
+#include "sysemu/dirtylimit.h"
+#include "exec/memory.h"
+#include "hw/boards.h"
+
+struct {
+ VcpuStat stat;
+ bool running;
+ QemuThread thread;
+} *vcpu_dirty_rate_stat;
+
+static void vcpu_dirty_rate_stat_collect(void)
+{
+ VcpuStat stat;
+ int i = 0;
+
+ /* calculate vcpu dirtyrate */
+ vcpu_calculate_dirtyrate(DIRTYLIMIT_CALC_TIME_MS,
+ &stat,
+ GLOBAL_DIRTY_LIMIT,
+ false);
+
+ for (i = 0; i < stat.nvcpu; i++) {
+ vcpu_dirty_rate_stat->stat.rates[i].id = i;
+ vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
+ stat.rates[i].dirty_rate;
+ }
+
+ free(stat.rates);
+}
+
+static void *vcpu_dirty_rate_stat_thread(void *opaque)
+{
+ rcu_register_thread();
+
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);
+
+ while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ vcpu_dirty_rate_stat_collect();
+ }
+
+ /* stop log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);
+
+ rcu_unregister_thread();
+ return NULL;
+}
+
+int64_t vcpu_dirty_rate_get(int cpu_index)
+{
+ DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
+ return qatomic_read_i64(&rates[cpu_index].dirty_rate);
+}
+
+void vcpu_dirty_rate_stat_start(void)
+{
+ if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ return;
+ }
+
+ qatomic_set(&vcpu_dirty_rate_stat->running, 1);
+ qemu_thread_create(&vcpu_dirty_rate_stat->thread,
+ "dirtyrate-stat",
+ vcpu_dirty_rate_stat_thread,
+ NULL,
+ QEMU_THREAD_JOINABLE);
+}
+
+void vcpu_dirty_rate_stat_stop(void)
+{
+ qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+ qemu_mutex_unlock_iothread();
+ qemu_thread_join(&vcpu_dirty_rate_stat->thread);
+ qemu_mutex_lock_iothread();
+}
+
+void vcpu_dirty_rate_stat_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+
+ vcpu_dirty_rate_stat =
+ g_malloc0(sizeof(*vcpu_dirty_rate_stat));
+
+ vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
+ vcpu_dirty_rate_stat->stat.rates =
+ g_malloc0(sizeof(DirtyRateVcpu) * max_cpus);
+
+ vcpu_dirty_rate_stat->running = false;
+}
+
+void vcpu_dirty_rate_stat_finalize(void)
+{
+ free(vcpu_dirty_rate_stat->stat.rates);
+ vcpu_dirty_rate_stat->stat.rates = NULL;
+
+ free(vcpu_dirty_rate_stat);
+ vcpu_dirty_rate_stat = NULL;
+}
diff --git a/softmmu/meson.build b/softmmu/meson.build
index d8e03018ab..95029a5db2 100644
--- a/softmmu/meson.build
+++ b/softmmu/meson.build
@@ -15,6 +15,7 @@ specific_ss.add(when: 'CONFIG_SOFTMMU', if_true: [files(
'vl.c',
'cpu-timers.c',
'runstate-action.c',
+ 'dirtylimit.c',
)])
specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: [files(
--
2.27.0
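
Taken together, the helpers introduced here form a small lifecycle, driven by dirtylimit_init()/dirtylimit_cleanup() in the dirty page rate limit patch above; in outline:

vcpu_dirty_rate_stat_initialize();   /* allocate the per-vCPU rate array */
vcpu_dirty_rate_stat_start();        /* spawn the "dirtyrate-stat" thread */

/* the thread refreshes each vCPU's rate roughly once per second
 * (DIRTYLIMIT_CALC_TIME_MS); consumers read it at any time: */
int64_t rate = vcpu_dirty_rate_get(cpu_index);   /* MB/s */

vcpu_dirty_rate_stat_stop();
vcpu_dirty_rate_stat_finalize();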

@@ -0,0 +1,469 @@
From 7b6ab56e68fb5031ea13b82743415413b1e70e71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:35 +0800
Subject: [PATCH 3/3] softmmu/dirtylimit: Implement virtual CPU throttle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Set up a negative feedback system for the vCPU thread handling the
KVM_EXIT_DIRTY_RING_FULL exit by introducing a throttle_us_per_full
field in struct CPUState. The vCPU sleeps throttle_us_per_full
microseconds after each ring-full exit to throttle it while
dirtylimit is in service.
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <977e808e03a1cef5151cae75984658b6821be618.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
accel/kvm/kvm-all.c | 20 ++-
include/hw/core/cpu.h | 6 +
include/sysemu/dirtylimit.h | 15 ++
softmmu/dirtylimit.c | 291 ++++++++++++++++++++++++++++++++++++
softmmu/trace-events | 7 +
5 files changed, 338 insertions(+), 1 deletion(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index d0c4310507..946ccb260b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
#include "hw/boards.h"
@@ -493,6 +494,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
cpu->kvm_state = s;
cpu->vcpu_dirty = true;
cpu->dirty_pages = 0;
+ cpu->throttle_us_per_full = 0;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
@@ -1486,6 +1488,11 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
*/
sleep(1);
+ /* keep sleeping so that dirtylimit not be interfered by reaper */
+ if (dirtylimit_in_service()) {
+ continue;
+ }
+
trace_kvm_dirty_ring_reaper("wakeup");
r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
@@ -2965,8 +2972,19 @@ int kvm_cpu_exec(CPUState *cpu)
*/
trace_kvm_dirty_ring_full(cpu->cpu_index);
qemu_mutex_lock_iothread();
- kvm_dirty_ring_reap(kvm_state, NULL);
+ /*
+ * We throttle the vCPU by making it sleep once it exits the kernel
+ * due to a full dirty ring. In the dirtylimit scenario, reaping
+ * all vCPUs after a single vCPU's dirty ring fills would make that
+ * vCPU miss its sleep, so reap only the ring-full vCPU.
+ */
+ if (dirtylimit_in_service()) {
+ kvm_dirty_ring_reap(kvm_state, cpu);
+ } else {
+ kvm_dirty_ring_reap(kvm_state, NULL);
+ }
qemu_mutex_unlock_iothread();
+ dirtylimit_vcpu_execute(cpu);
ret = 0;
break;
case KVM_EXIT_SYSTEM_EVENT:
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index e948e81f1a..9631c1e2f6 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@ struct CPUState {
*/
bool throttle_thread_scheduled;
+ /*
+ * Sleep throttle_us_per_full microseconds once dirty ring is full
+ * if dirty page rate limit is enabled.
+ */
+ int64_t throttle_us_per_full;
+
bool ignore_memory_transaction_failures;
struct hax_vcpu_state *hax_vcpu;
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f03d6..8d2c1f3a6b 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,19 @@ void vcpu_dirty_rate_stat_start(void);
void vcpu_dirty_rate_stat_stop(void);
void vcpu_dirty_rate_stat_initialize(void);
void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_process(void);
+void dirtylimit_change(bool start);
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable);
+void dirtylimit_set_all(uint64_t quota,
+ bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
#endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index ebdc064c9d..e5a4f970bd 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -18,6 +18,26 @@
#include "sysemu/dirtylimit.h"
#include "exec/memory.h"
#include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stops working if the dirty page rate error
+ * value is less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
+/*
+ * Increase or decrease the vCPU sleep time linearly if the dirty
+ * page rate error percentage is over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, add or subtract a fixed vCPU sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
struct {
VcpuStat stat;
@@ -25,6 +45,30 @@ struct {
QemuThread thread;
} *vcpu_dirty_rate_stat;
+typedef struct VcpuDirtyLimitState {
+ int cpu_index;
+ bool enabled;
+ /*
+ * Quota dirty page rate, unit is MB/s
+ * zero if not enabled.
+ */
+ uint64_t quota;
+} VcpuDirtyLimitState;
+
+struct {
+ VcpuDirtyLimitState *states;
+ /* Max cpus number configured by user */
+ int max_cpus;
+ /* Number of vcpu under dirtylimit */
+ int limited_nvcpu;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit thread quit if dirtylimit_quit is true */
+static bool dirtylimit_quit;
+
static void vcpu_dirty_rate_stat_collect(void)
{
VcpuStat stat;
@@ -54,6 +98,9 @@ static void *vcpu_dirty_rate_stat_thread(void *opaque)
while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
vcpu_dirty_rate_stat_collect();
+ if (dirtylimit_in_service()) {
+ dirtylimit_process();
+ }
}
/* stop log sync */
@@ -86,9 +133,11 @@ void vcpu_dirty_rate_stat_start(void)
void vcpu_dirty_rate_stat_stop(void)
{
qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+ dirtylimit_state_unlock();
qemu_mutex_unlock_iothread();
qemu_thread_join(&vcpu_dirty_rate_stat->thread);
qemu_mutex_lock_iothread();
+ dirtylimit_state_lock();
}
void vcpu_dirty_rate_stat_initialize(void)
@@ -114,3 +163,245 @@ void vcpu_dirty_rate_stat_finalize(void)
free(vcpu_dirty_rate_stat);
vcpu_dirty_rate_stat = NULL;
}
+
+void dirtylimit_state_lock(void)
+{
+ qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+ qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+ qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+ return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+ dirtylimit_state->states =
+ g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_state->states[i].cpu_index = i;
+ }
+
+ dirtylimit_state->max_cpus = max_cpus;
+ trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+ free(dirtylimit_state->states);
+ dirtylimit_state->states = NULL;
+
+ free(dirtylimit_state);
+ dirtylimit_state = NULL;
+
+ trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+ return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+
+ return !(cpu_index < 0 ||
+ cpu_index >= ms->smp.max_cpus);
+}
+
+static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+ static uint64_t max_dirtyrate;
+ uint32_t dirty_ring_size = kvm_dirty_ring_size();
+ uint64_t dirty_ring_size_memory_MB =
+ dirty_ring_size * TARGET_PAGE_SIZE >> 20;
+
+ if (max_dirtyrate < dirtyrate) {
+ max_dirtyrate = dirtyrate;
+ }
+
+ return dirty_ring_size_memory_MB * 1000000 / max_dirtyrate;
+}
+
+static inline bool dirtylimit_done(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+ return (max - min) <= DIRTYLIMIT_TOLERANCE_RANGE;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+ return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}
+
+static void dirtylimit_set_throttle(CPUState *cpu,
+ uint64_t quota,
+ uint64_t current)
+{
+ int64_t ring_full_time_us = 0;
+ uint64_t sleep_pct = 0;
+ uint64_t throttle_us = 0;
+
+ if (current == 0) {
+ cpu->throttle_us_per_full = 0;
+ return;
+ }
+
+ ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
+
+ if (dirtylimit_need_linear_adjustment(quota, current)) {
+ if (quota < current) {
+ sleep_pct = (current - quota) * 100 / current;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full += throttle_us;
+ } else {
+ sleep_pct = (quota - current) * 100 / quota;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full -= throttle_us;
+ }
+
+ trace_dirtylimit_throttle_pct(cpu->cpu_index,
+ sleep_pct,
+ throttle_us);
+ } else {
+ if (quota < current) {
+ cpu->throttle_us_per_full += ring_full_time_us / 10;
+ } else {
+ cpu->throttle_us_per_full -= ring_full_time_us / 10;
+ }
+ }
+
+ /*
+ * TODO: with a large kvm_dirty_ring_size (e.g. 65536), the current
+ * dirty page rate may never reach the quota; should we stop
+ * increasing the sleep time in that case?
+ */
+ cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
+ ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
+
+ cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
+}
+
+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+ uint64_t quota = 0;
+ uint64_t current = 0;
+ int cpu_index = cpu->cpu_index;
+
+ quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ current = vcpu_dirty_rate_get(cpu_index);
+
+ if (!dirtylimit_done(quota, current)) {
+ dirtylimit_set_throttle(cpu, quota, current);
+ }
+
+ return;
+}
+
+void dirtylimit_process(void)
+{
+ CPUState *cpu;
+
+ if (!qatomic_read(&dirtylimit_quit)) {
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+ continue;
+ }
+ dirtylimit_adjust_throttle(cpu);
+ }
+ dirtylimit_state_unlock();
+ }
+}
+
+void dirtylimit_change(bool start)
+{
+ if (start) {
+ qatomic_set(&dirtylimit_quit, 0);
+ } else {
+ qatomic_set(&dirtylimit_quit, 1);
+ }
+}
+
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable)
+{
+ trace_dirtylimit_set_vcpu(cpu_index, quota);
+
+ if (enable) {
+ dirtylimit_state->states[cpu_index].quota = quota;
+ if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+ dirtylimit_state->limited_nvcpu++;
+ }
+ } else {
+ dirtylimit_state->states[cpu_index].quota = 0;
+ if (dirtylimit_state->states[cpu_index].enabled) {
+ dirtylimit_state->limited_nvcpu--;
+ }
+ }
+
+ dirtylimit_state->states[cpu_index].enabled = enable;
+}
+
+void dirtylimit_set_all(uint64_t quota,
+ bool enable)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_set_vcpu(i, quota, enable);
+ }
+}
+
+void dirtylimit_vcpu_execute(CPUState *cpu)
+{
+ if (dirtylimit_in_service() &&
+ dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
+ cpu->throttle_us_per_full) {
+ trace_dirtylimit_vcpu_execute(cpu->cpu_index,
+ cpu->throttle_us_per_full);
+ usleep(cpu->throttle_us_per_full);
+ }
+}
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 9c88887b3c..22606dc27b 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
system_wakeup_request(int reason) "reason=%d"
qemu_system_shutdown_request(int reason) "reason=%d"
qemu_system_powerdown_request(void) ""
+
+# dirtylimit.c
+dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
+dirtylimit_state_finalize(void) ""
+dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
+dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
+dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
--
2.27.0
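As a quick reference for the throttle arithmetic above, the adjustment logic of dirtylimit_set_throttle() can be reproduced in a standalone sketch. The numbers below are illustrative assumptions (QEMU derives ring_full_time_us from the dirty ring size and the peak observed dirty page rate); this mirrors the linear and fixed adjustment paths, it is not the QEMU code itself:

#include <stdio.h>
#include <stdint.h>

#define LINEAR_ADJUSTMENT_PCT 50 /* DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT */
#define THROTTLE_PCT_MAX      99 /* DIRTYLIMIT_THROTTLE_PCT_MAX */

/* One adjustment step: returns the new sleep time per ring-full event */
static int64_t adjust(int64_t throttle_us, uint64_t quota,
                      uint64_t current, int64_t ring_full_time_us)
{
    uint64_t min = quota < current ? quota : current;
    uint64_t max = quota > current ? quota : current;

    if (max == 0) {
        return throttle_us;
    }
    if ((max - min) * 100 / max > LINEAR_ADJUSTMENT_PCT) {
        /* Large error: scale the step with the error percentage */
        uint64_t pct = (max - min) * 100 / max;
        int64_t step = ring_full_time_us * pct / (double)(100 - pct);
        throttle_us += (quota < current) ? step : -step;
    } else {
        /* Small error: nudge by a tenth of the ring-full time */
        throttle_us += (quota < current) ? ring_full_time_us / 10
                                         : -(ring_full_time_us / 10);
    }
    if (throttle_us > ring_full_time_us * THROTTLE_PCT_MAX) {
        throttle_us = ring_full_time_us * THROTTLE_PCT_MAX;
    }
    return throttle_us > 0 ? throttle_us : 0;
}

int main(void)
{
    /* quota 100 MB/s, measured 400 MB/s, ring fills every 10 ms */
    int64_t sleep_us = adjust(0, 100, 400, 10000);
    printf("sleep per ring-full: %lld us\n", (long long)sleep_us);
    return 0;
}

With these numbers the error is 75%, so the linear path applies and the first step already sets the sleep time to three ring-full intervals (30000 us).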

View File

@ -0,0 +1,77 @@
From 550d43a946b61bdadb418e0f8bef8b98e646276d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 18 Mar 2022 16:23:47 +0100
Subject: [PATCH 09/10] target/i386: kvm: do not access uninitialized variable
on older kernels
from mainline-v7.0.0-rc1
commit 3ec5ad40081b14af28496198b4d08dbe13386790
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 3ec5ad40081b ("target/i386: kvm: do not access
uninitialized variable on older kernels")
---------------------------------------------------------
target/i386: kvm: do not access uninitialized variable on older kernels
KVM support for AMX includes a new system attribute, KVM_X86_XCOMP_GUEST_SUPP.
Commit 19db68ca68 ("x86: Grant AMX permission for guest", 2022-03-15) however
did not fully consider the behavior on older kernels. First, it warns
too aggressively. Second, it invokes the KVM_GET_DEVICE_ATTR ioctl
unconditionally and then uses the "bitmask" variable, which remains
uninitialized if the ioctl fails. Third, kvm_ioctl returns -errno rather
than -1 on errors.
While at it, explain why the ioctl is needed and KVM_GET_SUPPORTED_CPUID
is not enough.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/kvm/kvm.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 49fca5ea88..20e418463d 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -409,6 +409,12 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
}
} else if (function == 0xd && index == 0 &&
(reg == R_EAX || reg == R_EDX)) {
+ /*
+ * The value returned by KVM_GET_SUPPORTED_CPUID does not include
+ * features that still have to be enabled with the arch_prctl
+ * system call. QEMU needs the full value, which is retrieved
+ * with KVM_GET_DEVICE_ATTR.
+ */
struct kvm_device_attr attr = {
.group = 0,
.attr = KVM_X86_XCOMP_GUEST_SUPP,
@@ -417,13 +423,16 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
if (!sys_attr) {
- warn_report("cannot get sys attribute capabilities %d", sys_attr);
+ return ret;
}
int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
- if (rc == -1 && (errno == ENXIO || errno == EINVAL)) {
- warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
- "error: %d", rc);
+ if (rc < 0) {
+ if (rc != -ENXIO) {
+ warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
+ "error: %d", rc);
+ }
+ return ret;
}
ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
} else if (function == 0x80000001 && reg == R_ECX) {
--
2.27.0

View File

@ -0,0 +1,362 @@
From 8a0f4dcf94b280d5b7db7f604c42d088c928ac0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyman=20Huang=28=E9=BB=84=E5=8B=87=29?=
<huangy81@chinatelecom.cn>
Date: Sun, 26 Jun 2022 01:38:37 +0800
Subject: [PATCH] tests: Add dirty page rate limit test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add a dirty page rate limit test if the kernel supports the dirty ring.
The following QMP commands are covered by this test case:
"calc-dirty-rate", "query-dirty-rate", "set-vcpu-dirty-limit",
"cancel-vcpu-dirty-limit" and "query-vcpu-dirty-limit".
Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
Acked-by: Peter Xu <peterx@redhat.com>
Message-Id: <eed5b847a6ef0a9c02a36383dbdd7db367dd1e7e.1656177590.git.huangy81@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
tests/qtest/migration-helpers.c | 22 +++
tests/qtest/migration-helpers.h | 2 +
tests/qtest/migration-test.c | 256 ++++++++++++++++++++++++++++++++
3 files changed, 280 insertions(+)
diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index 4ee26014b7..1e594f9cb1 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -75,6 +75,28 @@ QDict *wait_command(QTestState *who, const char *command, ...)
return ret;
}
+/*
+ * Execute the QMP command and return its "return" dictionary
+ */
+QDict *qmp_command(QTestState *who, const char *command, ...)
+{
+ va_list ap;
+ QDict *resp, *ret;
+
+ va_start(ap, command);
+ resp = qtest_vqmp(who, command, ap);
+ va_end(ap);
+
+ g_assert(!qdict_haskey(resp, "error"));
+ g_assert(qdict_haskey(resp, "return"));
+
+ ret = qdict_get_qdict(resp, "return");
+ qobject_ref(ret);
+ qobject_unref(resp);
+
+ return ret;
+}
+
/*
* Send QMP command "migrate".
* Arguments are built from @fmt... (formatted like
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index d63bba9630..9bc809fb75 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -22,6 +22,8 @@ QDict *wait_command_fd(QTestState *who, int fd, const char *command, ...);
GCC_FMT_ATTR(2, 3)
QDict *wait_command(QTestState *who, const char *command, ...);
+QDict *qmp_command(QTestState *who, const char *command, ...);
+
GCC_FMT_ATTR(3, 4)
void migrate_qmp(QTestState *who, const char *uri, const char *fmt, ...);
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 7b42f6fd90..8fad247f6c 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -23,6 +23,7 @@
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qobject-input-visitor.h"
#include "qapi/qobject-output-visitor.h"
+#include "qapi/qmp/qlist.h"
#include "migration-helpers.h"
#include "tests/migration/migration-test.h"
@@ -42,6 +43,12 @@ static bool uffd_feature_thread_id;
/* A downtime where the test really should converge */
#define CONVERGE_DOWNTIME 1000
+/*
+ * Dirtylimit stops working if the dirty page rate error
+ * value is less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
+
#if defined(__linux__)
#include <sys/syscall.h>
#include <sys/vfs.h>
@@ -1394,6 +1401,253 @@ static void test_multifd_tcp_cancel(void)
test_migrate_end(from, to2, true);
}
+static void calc_dirty_rate(QTestState *who, uint64_t calc_time)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'calc-dirty-rate',"
+ "'arguments': { "
+ "'calc-time': %ld,"
+ "'mode': 'dirty-ring' }}",
+ calc_time));
+}
+
+static QDict *query_dirty_rate(QTestState *who)
+{
+ return qmp_command(who, "{ 'execute': 'query-dirty-rate' }");
+}
+
+static void dirtylimit_set_all(QTestState *who, uint64_t dirtyrate)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'set-vcpu-dirty-limit',"
+ "'arguments': { "
+ "'dirty-rate': %ld } }",
+ dirtyrate));
+}
+
+static void cancel_vcpu_dirty_limit(QTestState *who)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'cancel-vcpu-dirty-limit' }"));
+}
+
+static QDict *query_vcpu_dirty_limit(QTestState *who)
+{
+ QDict *rsp;
+
+ rsp = qtest_qmp(who, "{ 'execute': 'query-vcpu-dirty-limit' }");
+ g_assert(!qdict_haskey(rsp, "error"));
+ g_assert(qdict_haskey(rsp, "return"));
+
+ return rsp;
+}
+
+static bool calc_dirtyrate_ready(QTestState *who)
+{
+ QDict *rsp_return;
+ gchar *status;
+
+ rsp_return = query_dirty_rate(who);
+ g_assert(rsp_return);
+
+ status = g_strdup(qdict_get_str(rsp_return, "status"));
+ g_assert(status);
+
+ return g_strcmp0(status, "measuring");
+}
+
+static void wait_for_calc_dirtyrate_complete(QTestState *who,
+ int64_t time_s)
+{
+ int max_try_count = 10000;
+ usleep(time_s * 1000000);
+
+ while (!calc_dirtyrate_ready(who) && max_try_count--) {
+ usleep(1000);
+ }
+
+ /*
+ * Set the timeout to 10 s (max_try_count * 1000 us); if the
+ * dirty rate measurement has not completed by then, fail the test.
+ */
+ g_assert_cmpint(max_try_count, !=, 0);
+}
+
+static int64_t get_dirty_rate(QTestState *who)
+{
+ QDict *rsp_return;
+ gchar *status;
+ QList *rates;
+ const QListEntry *entry;
+ QDict *rate;
+ int64_t dirtyrate;
+
+ rsp_return = query_dirty_rate(who);
+ g_assert(rsp_return);
+
+ status = g_strdup(qdict_get_str(rsp_return, "status"));
+ g_assert(status);
+ g_assert_cmpstr(status, ==, "measured");
+
+ rates = qdict_get_qlist(rsp_return, "vcpu-dirty-rate");
+ g_assert(rates && !qlist_empty(rates));
+
+ entry = qlist_first(rates);
+ g_assert(entry);
+
+ rate = qobject_to(QDict, qlist_entry_obj(entry));
+ g_assert(rate);
+
+ dirtyrate = qdict_get_try_int(rate, "dirty-rate", -1);
+
+ qobject_unref(rsp_return);
+ return dirtyrate;
+}
+
+static int64_t get_limit_rate(QTestState *who)
+{
+ QDict *rsp_return;
+ QList *rates;
+ const QListEntry *entry;
+ QDict *rate;
+ int64_t dirtyrate;
+
+ rsp_return = query_vcpu_dirty_limit(who);
+ g_assert(rsp_return);
+
+ rates = qdict_get_qlist(rsp_return, "return");
+ g_assert(rates && !qlist_empty(rates));
+
+ entry = qlist_first(rates);
+ g_assert(entry);
+
+ rate = qobject_to(QDict, qlist_entry_obj(entry));
+ g_assert(rate);
+
+ dirtyrate = qdict_get_try_int(rate, "limit-rate", -1);
+
+ qobject_unref(rsp_return);
+ return dirtyrate;
+}
+
+static QTestState *dirtylimit_start_vm(void)
+{
+ QTestState *vm = NULL;
+ g_autofree gchar *cmd = NULL;
+ const char *arch = qtest_get_arch();
+ g_autofree char *bootpath = NULL;
+
+ assert((strcmp(arch, "x86_64") == 0));
+ bootpath = g_strdup_printf("%s/bootsect", tmpfs);
+ assert(sizeof(x86_bootsect) == 512);
+ init_bootfile(bootpath, x86_bootsect, sizeof(x86_bootsect));
+
+ cmd = g_strdup_printf("-accel kvm,dirty-ring-size=4096 "
+ "-name dirtylimit-test,debug-threads=on "
+ "-m 150M -smp 1 "
+ "-serial file:%s/vm_serial "
+ "-drive file=%s,format=raw ",
+ tmpfs, bootpath);
+
+ vm = qtest_init(cmd);
+ return vm;
+}
+
+static void dirtylimit_stop_vm(QTestState *vm)
+{
+ qtest_quit(vm);
+ cleanup("bootsect");
+ cleanup("vm_serial");
+}
+
+static void test_vcpu_dirty_limit(void)
+{
+ QTestState *vm;
+ int64_t origin_rate;
+ int64_t quota_rate;
+ int64_t rate;
+ int max_try_count = 20;
+ int hit = 0;
+
+ /* Start vm for vcpu dirtylimit test */
+ vm = dirtylimit_start_vm();
+
+ /* Wait for the first serial output from the vm */
+ wait_for_serial("vm_serial");
+
+ /* Do a dirty rate measurement with a calc time of 1 s */
+ calc_dirty_rate(vm, 1);
+
+ /* Sleep for the calc time and wait for the measurement to complete */
+ wait_for_calc_dirtyrate_complete(vm, 1);
+
+ /* Query original dirty page rate */
+ origin_rate = get_dirty_rate(vm);
+
+ /* VM booted from bootsect should dirty memory steadily */
+ assert(origin_rate != 0);
+
+ /* Set up the quota dirty page rate at half of the original rate */
+ quota_rate = origin_rate / 2;
+
+ /* Set dirtylimit */
+ dirtylimit_set_all(vm, quota_rate);
+
+ /*
+ * Check that set-vcpu-dirty-limit and query-vcpu-dirty-limit
+ * work as expected
+ */
+ g_assert_cmpint(quota_rate, ==, get_limit_rate(vm));
+
+ /* Sleep a bit to check if it takes effect */
+ usleep(2000000);
+
+ /*
+ * Check if the dirtylimit actually takes effect; set the
+ * timeout to 20 s (max_try_count * 1 s) and fail the test
+ * if it does not.
+ */
+ while (--max_try_count) {
+ calc_dirty_rate(vm, 1);
+ wait_for_calc_dirtyrate_complete(vm, 1);
+ rate = get_dirty_rate(vm);
+
+ /*
+ * Assume the limit is hit if the current rate is less
+ * than the quota rate (within the accepted error)
+ */
+ if (rate < (quota_rate + DIRTYLIMIT_TOLERANCE_RANGE)) {
+ hit = 1;
+ break;
+ }
+ }
+
+ g_assert_cmpint(hit, ==, 1);
+
+ hit = 0;
+ max_try_count = 20;
+
+ /* Check if dirtylimit cancellation takes effect */
+ cancel_vcpu_dirty_limit(vm);
+ while (--max_try_count) {
+ calc_dirty_rate(vm, 1);
+ wait_for_calc_dirtyrate_complete(vm, 1);
+ rate = get_dirty_rate(vm);
+
+ /*
+ * Assume the dirtylimit has been canceled if the current rate
+ * is greater than the quota rate (within the accepted error)
+ */
+ if (rate > (quota_rate + DIRTYLIMIT_TOLERANCE_RANGE)) {
+ hit = 1;
+ break;
+ }
+ }
+
+ g_assert_cmpint(hit, ==, 1);
+ dirtylimit_stop_vm(vm);
+}
+
static bool kvm_dirty_ring_supported(void)
{
#if defined(__linux__) && defined(HOST_X86_64)
@@ -1483,6 +1737,8 @@ int main(int argc, char **argv)
if (kvm_dirty_ring_supported()) {
qtest_add_func("/migration/dirty_ring",
test_precopy_unix_dirty_ring);
+ qtest_add_func("/migration/vcpu_dirty_limit",
+ test_vcpu_dirty_limit);
}
ret = g_test_run();
--
2.27.0
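Outside the qtest harness, the same commands can be driven by hand over a QMP monitor. A hypothetical session against a VM started with "-accel kvm,dirty-ring-size=4096" could look like this (responses abbreviated; the field names match what the test reads back):

{ "execute": "calc-dirty-rate", "arguments": { "calc-time": 1, "mode": "dirty-ring" } }
{ "return": {} }
{ "execute": "query-dirty-rate" }
{ "return": { "status": "measured", "vcpu-dirty-rate": [ { "id": 0, "dirty-rate": 400 } ] } }
{ "execute": "set-vcpu-dirty-limit", "arguments": { "dirty-rate": 200 } }
{ "return": {} }
{ "execute": "query-vcpu-dirty-limit" }
{ "return": [ { "cpu-index": 0, "limit-rate": 200, "current-rate": 195 } ] }
{ "execute": "cancel-vcpu-dirty-limit" }
{ "return": {} }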

View File

@ -0,0 +1,138 @@
From 42f96b9e73ff4a23fad56bc8fefea5e477ee95b9 Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@intel.com>
Date: Wed, 16 Feb 2022 22:04:31 -0800
Subject: [PATCH 06/10] x86: Add AMX CPUIDs enumeration
from mainline-v7.0.0-rc0
commit f21a48171cf3fa39532fc8553fd82e81b88b6474
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit f21a48171cf3 ("x86: Add AMX CPUIDs enumeration")
----------------------------------------------
x86: Add AMX CPUIDs enumeration
Add AMX primary feature bits XFD and AMX_TILE to
enumerate the CPU's AMX capability. Meanwhile, add the
AMX TILE and TMUL CPUID leaves and subleaves, which
exist when AMX TILE is present, to provide the maximum
capability of TILE and TMUL.
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 55 ++++++++++++++++++++++++++++++++++++++++---
target/i386/kvm/kvm.c | 4 +++-
2 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index da81e47dc3..1bc03d3eef 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -574,6 +574,18 @@ static CPUCacheInfo legacy_l3_cache = {
#define INTEL_PT_CYCLE_BITMAP 0x1fff /* Support 0,2^(0~11) */
#define INTEL_PT_PSB_BITMAP (0x003f << 16) /* Support 2K,4K,8K,16K,32K,64K */
+/* CPUID Leaf 0x1D constants: */
+#define INTEL_AMX_TILE_MAX_SUBLEAF 0x1
+#define INTEL_AMX_TOTAL_TILE_BYTES 0x2000
+#define INTEL_AMX_BYTES_PER_TILE 0x400
+#define INTEL_AMX_BYTES_PER_ROW 0x40
+#define INTEL_AMX_TILE_MAX_NAMES 0x8
+#define INTEL_AMX_TILE_MAX_ROWS 0x10
+
+/* CPUID Leaf 0x1E constants: */
+#define INTEL_AMX_TMUL_MAX_K 0x10
+#define INTEL_AMX_TMUL_MAX_N 0x40
+
void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
uint32_t vendor2, uint32_t vendor3)
{
@@ -843,8 +855,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"avx512-vp2intersect", NULL, "md-clear", NULL,
NULL, NULL, "serialize", NULL,
"tsx-ldtrk", NULL, NULL /* pconfig */, NULL,
- NULL, NULL, NULL, "avx512-fp16",
- NULL, NULL, "spec-ctrl", "stibp",
+ NULL, NULL, "amx-bf16", "avx512-fp16",
+ "amx-tile", "amx-int8", "spec-ctrl", "stibp",
NULL, "arch-capabilities", "core-capability", "ssbd",
},
.cpuid = {
@@ -909,7 +921,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
.type = CPUID_FEATURE_WORD,
.feat_names = {
"xsaveopt", "xsavec", "xgetbv1", "xsaves",
- NULL, NULL, NULL, NULL,
+ "xfd", NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
@@ -5605,6 +5617,43 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
}
break;
}
+ case 0x1D: {
+ /* AMX TILE */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) {
+ break;
+ }
+
+ if (count == 0) {
+ /* Highest numbered palette subleaf */
+ *eax = INTEL_AMX_TILE_MAX_SUBLEAF;
+ } else if (count == 1) {
+ *eax = INTEL_AMX_TOTAL_TILE_BYTES |
+ (INTEL_AMX_BYTES_PER_TILE << 16);
+ *ebx = INTEL_AMX_BYTES_PER_ROW | (INTEL_AMX_TILE_MAX_NAMES << 16);
+ *ecx = INTEL_AMX_TILE_MAX_ROWS;
+ }
+ break;
+ }
+ case 0x1E: {
+ /* AMX TMUL */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ if (!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE)) {
+ break;
+ }
+
+ if (count == 0) {
+ /* Highest numbered palette subleaf */
+ *ebx = INTEL_AMX_TMUL_MAX_K | (INTEL_AMX_TMUL_MAX_N << 8);
+ }
+ break;
+ }
case 0x40000000:
/*
* CPUID code in kvm_arch_init_vcpu() ignores stuff
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index e7f57d05a2..60ccdec5e8 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -1779,7 +1779,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
c = &cpuid_data.entries[cpuid_i++];
}
break;
- case 0x14: {
+ case 0x14:
+ case 0x1d:
+ case 0x1e: {
uint32_t times;
c->function = i;
--
2.27.0

View File

@ -0,0 +1,115 @@
From 98f5dbc3fd8390728401528786ac94b39f0581ee Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@intel.com>
Date: Wed, 16 Feb 2022 22:04:28 -0800
Subject: [PATCH 03/10] x86: Add AMX XTILECFG and XTILEDATA components
from mainline-v7.0.0-rc0
commit 1f16764f7d4515bfd5e4ae0aae814fa280a7d0c8
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 1f16764f7d45 ("x86: Add AMX XTILECFG and XTILEDATA components")
-------------------------------------------------------------
x86: Add AMX XTILECFG and XTILEDATA components
The AMX TILECFG register and the TMMx tile data registers are
saved/restored via XSAVE, respectively in state component 17
(64 bytes) and state component 18 (8192 bytes).
Add AMX feature bits to x86_ext_save_areas array to set
up AMX components. Add structs that define the layout of
AMX XSAVE areas and use QEMU_BUILD_BUG_ON to validate the
struct sizes.
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-3-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 8 ++++++++
target/i386/cpu.h | 18 +++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 532ca45015..31d63be081 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1401,6 +1401,14 @@ ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = {
[XSTATE_PKRU_BIT] =
{ .feature = FEAT_7_0_ECX, .bits = CPUID_7_0_ECX_PKU,
.size = sizeof(XSavePKRU) },
+ [XSTATE_XTILE_CFG_BIT] = {
+ .feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE,
+ .size = sizeof(XSaveXTILECFG),
+ },
+ [XSTATE_XTILE_DATA_BIT] = {
+ .feature = FEAT_7_0_EDX, .bits = CPUID_7_0_EDX_AMX_TILE,
+ .size = sizeof(XSaveXTILEDATA)
+ },
};
static uint32_t xsave_area_size(uint64_t mask)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 52330d1112..cc431b1d76 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -538,6 +538,8 @@ typedef enum X86Seg {
#define XSTATE_ZMM_Hi256_BIT 6
#define XSTATE_Hi16_ZMM_BIT 7
#define XSTATE_PKRU_BIT 9
+#define XSTATE_XTILE_CFG_BIT 17
+#define XSTATE_XTILE_DATA_BIT 18
#define XSTATE_FP_MASK (1ULL << XSTATE_FP_BIT)
#define XSTATE_SSE_MASK (1ULL << XSTATE_SSE_BIT)
@@ -846,6 +848,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define CPUID_7_0_EDX_TSX_LDTRK (1U << 16)
/* AVX512_FP16 instruction */
#define CPUID_7_0_EDX_AVX512_FP16 (1U << 23)
+/* AMX tile (two-dimensional register) */
+#define CPUID_7_0_EDX_AMX_TILE (1U << 24)
/* Speculation Control */
#define CPUID_7_0_EDX_SPEC_CTRL (1U << 26)
/* Single Thread Indirect Branch Predictors */
@@ -1349,6 +1353,16 @@ typedef struct XSavePKRU {
uint32_t padding;
} XSavePKRU;
+/* Ext. save area 17: AMX XTILECFG state */
+typedef struct XSaveXTILECFG {
+ uint8_t xtilecfg[64];
+} XSaveXTILECFG;
+
+/* Ext. save area 18: AMX XTILEDATA state */
+typedef struct XSaveXTILEDATA {
+ uint8_t xtiledata[8][1024];
+} XSaveXTILEDATA;
+
QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100);
QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40);
QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40);
@@ -1356,6 +1370,8 @@ QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40);
QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200);
QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400);
QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8);
+QEMU_BUILD_BUG_ON(sizeof(XSaveXTILECFG) != 0x40);
+QEMU_BUILD_BUG_ON(sizeof(XSaveXTILEDATA) != 0x2000);
typedef struct ExtSaveArea {
uint32_t feature, bits;
@@ -1363,7 +1379,7 @@ typedef struct ExtSaveArea {
uint32_t ecx;
} ExtSaveArea;
-#define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1)
+#define XSAVE_STATE_AREA_COUNT (XSTATE_XTILE_DATA_BIT + 1)
extern ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT];
--
2.27.0
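The component sizes asserted by the QEMU_BUILD_BUG_ON checks above can be cross-checked against the host enumeration. A minimal sketch, assuming <cpuid.h> on AMX-capable hardware, which should report 64 bytes for component 17 and 8192 bytes for component 18:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    for (unsigned i = 17; i <= 18; i++) {
        unsigned eax, ebx, ecx, edx;

        if (!__get_cpuid_count(0xD, i, &eax, &ebx, &ecx, &edx) || !eax) {
            printf("component %u: not enumerated\n", i);
            continue;
        }
        /* EAX = size in bytes, EBX = offset in the standard format */
        printf("component %u: size %u, offset %u\n", i, eax, ebx);
    }
    return 0;
}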

View File

@ -0,0 +1,66 @@
From 52eed626a2200da02e67aa93c2a8d59cb529737b Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@intel.com>
Date: Wed, 16 Feb 2022 22:04:30 -0800
Subject: [PATCH 05/10] x86: Add XFD faulting bit for state components
from mainline-v7.0.0-rc0
commit 0f17f6b30f3b051f0f96ccc98c9f7f395713699f
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 0f17f6b30f3b ("x86: Add XFD faulting bit for state
components")
-------------------------------------------------
x86: Add XFD faulting bit for state components
Intel introduces the XFD faulting mechanism for extended
XSAVE features to dynamically enable the features at
runtime. If CPUID (EAX=0Dh, ECX=n, n>1).ECX[2] is set
to 1, it indicates support for XFD faulting of this
state component.
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-5-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 3 ++-
target/i386/cpu.h | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index fb6b4c86de..da81e47dc3 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5515,7 +5515,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
const ExtSaveArea *esa = &x86_ext_save_areas[count];
*eax = esa->size;
*ebx = esa->offset;
- *ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK;
+ *ecx = esa->ecx &
+ (ESA_FEATURE_ALIGN64_MASK | ESA_FEATURE_XFD_MASK);
}
}
break;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 93d1c60ac1..09c725ee13 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -556,8 +556,10 @@ typedef enum X86Seg {
#define XSTATE_DYNAMIC_MASK (XSTATE_XTILE_DATA_MASK)
#define ESA_FEATURE_ALIGN64_BIT 1
+#define ESA_FEATURE_XFD_BIT 2
#define ESA_FEATURE_ALIGN64_MASK (1U << ESA_FEATURE_ALIGN64_BIT)
+#define ESA_FEATURE_XFD_MASK (1U << ESA_FEATURE_XFD_BIT)
/* CPUID feature words */
--
2.27.0
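The two ECX flags that QEMU now passes through (align64 in bit 1, XFD in bit 2) can be read the same way. A minimal sketch for the XTILEDATA subleaf (ECX=18), again assuming <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned eax, ebx, ecx, edx;

    if (!__get_cpuid_count(0xD, 18, &eax, &ebx, &ecx, &edx) || !eax) {
        printf("XTILEDATA not enumerated\n");
        return 1;
    }
    /* bit 1: 64-byte alignment in compacted format; bit 2: XFD faulting */
    printf("align64: %u, xfd: %u\n", (ecx >> 1) & 1, (ecx >> 2) & 1);
    return 0;
}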

View File

@ -0,0 +1,91 @@
From ab183c656a2bee466e7c609224cddb75b80d9d6f Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@intel.com>
Date: Wed, 16 Feb 2022 22:04:27 -0800
Subject: [PATCH 02/10] x86: Fix the 64-byte boundary enumeration for extended
state
from mainline-v7.0.0-rc0
commit 131266b7565bd437127bd231563572696bb27235
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 131266b7565b ("x86: Fix the 64-byte boundary enumeration for extended state")
-----------------------------------------------------------
x86: Fix the 64-byte boundary enumeration for extended state
The extended state subleaves (EAX=0Dh, ECX=n, n>1).ECX[1]
indicate whether the extended state component is located
on the next 64-byte boundary following the preceding state
component when the compacted format of an XSAVE area is
used.
Right now, they are all zero because no supported component
needed the bit to be set, but the upcoming AMX feature will
use it. Fix the subleaf values according to KVM's supported
CPUID.
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-2-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 1 +
target/i386/cpu.h | 6 ++++++
target/i386/kvm/kvm-cpu.c | 1 +
3 files changed, 8 insertions(+)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index d9dca1dafb..532ca45015 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5507,6 +5507,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
const ExtSaveArea *esa = &x86_ext_save_areas[count];
*eax = esa->size;
*ebx = esa->offset;
+ *ecx = esa->ecx & ESA_FEATURE_ALIGN64_MASK;
}
}
break;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index d9296a9abc..52330d1112 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -549,6 +549,11 @@ typedef enum X86Seg {
#define XSTATE_Hi16_ZMM_MASK (1ULL << XSTATE_Hi16_ZMM_BIT)
#define XSTATE_PKRU_MASK (1ULL << XSTATE_PKRU_BIT)
+#define ESA_FEATURE_ALIGN64_BIT 1
+
+#define ESA_FEATURE_ALIGN64_MASK (1U << ESA_FEATURE_ALIGN64_BIT)
+
+
/* CPUID feature words */
typedef enum FeatureWord {
FEAT_1_EDX, /* CPUID[1].EDX */
@@ -1355,6 +1360,7 @@ QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8);
typedef struct ExtSaveArea {
uint32_t feature, bits;
uint32_t offset, size;
+ uint32_t ecx;
} ExtSaveArea;
#define XSAVE_STATE_AREA_COUNT (XSTATE_PKRU_BIT + 1)
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index d95028018e..ce27d3b1df 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -104,6 +104,7 @@ static void kvm_cpu_xsave_init(void)
if (sz != 0) {
assert(esa->size == sz);
esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX);
+ esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX);
}
}
}
--
2.27.0
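A minimal sketch of what the align64 bit means for the compacted XSAVE format, assuming (as on AMX hardware) that only XTILEDATA sets it; the start offset is illustrative and skips components 2..16:

#include <stdio.h>
#include <stdint.h>

struct comp { const char *name; uint32_t size; int align64; };

int main(void)
{
    struct comp comps[] = {
        { "xtilecfg",  64,   0 },
        { "xtiledata", 8192, 1 },
    };
    /* 512-byte legacy region + 64-byte header precede the components */
    uint32_t off = 576;

    for (int i = 0; i < 2; i++) {
        if (comps[i].align64) {
            off = (off + 63) & ~63u; /* next 64-byte boundary */
        }
        printf("%-9s offset %u size %u\n", comps[i].name, off, comps[i].size);
        off += comps[i].size;
    }
    return 0;
}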

View File

@ -0,0 +1,218 @@
From b7e588a4506ce61c13e78175c2da5b69b60af128 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Wed, 16 Feb 2022 22:04:29 -0800
Subject: [PATCH 04/10] x86: Grant AMX permission for guest
from mainline-v7.0.0-rc0
commit 19db68ca68a78fa033a21d419036b6e416554564
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit 19db68ca68a7 ("x86: Grant AMX permission for guest")
--------------------------------------------------------
x86: Grant AMX permission for guest
The kernel allocates a 4K xstate buffer by default. For XSAVE features
which require a large state component (e.g. AMX), the Linux kernel
dynamically expands the xstate buffer only after the process has
acquired the necessary permissions. Those are called dynamically-
enabled XSAVE features (or dynamic xfeatures).
There are separate permissions for native tasks and guests.
Qemu should request the guest permissions for dynamic xfeatures
which will be exposed to the guest. This only needs to be done
once before the first vcpu is created.
KVM implemented a new ARCH_GET_XCOMP_SUPP system attribute API to
get the host-side supported_xcr0, so Qemu can decide whether it can
request permission for dynamically-enabled XSAVE features.
https://lore.kernel.org/all/20220126152210.3044876-1-pbonzini@redhat.com/
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Message-Id: <20220217060434.52460-4-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.c | 7 +++++
target/i386/cpu.h | 4 +++
target/i386/kvm/kvm-cpu.c | 12 ++++----
target/i386/kvm/kvm.c | 57 ++++++++++++++++++++++++++++++++++++++
target/i386/kvm/kvm_i386.h | 1 +
5 files changed, 75 insertions(+), 6 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 31d63be081..fb6b4c86de 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6048,6 +6048,7 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
CPUX86State *env = &cpu->env;
int i;
uint64_t mask;
+ static bool request_perm;
if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) {
env->features[FEAT_XSAVE_COMP_LO] = 0;
@@ -6063,6 +6064,12 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
}
}
+ /* Only request permission for first vcpu */
+ if (kvm_enabled() && !request_perm) {
+ kvm_request_xsave_components(cpu, mask);
+ request_perm = true;
+ }
+
env->features[FEAT_XSAVE_COMP_LO] = mask;
env->features[FEAT_XSAVE_COMP_HI] = mask >> 32;
}
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index cc431b1d76..93d1c60ac1 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -550,6 +550,10 @@ typedef enum X86Seg {
#define XSTATE_ZMM_Hi256_MASK (1ULL << XSTATE_ZMM_Hi256_BIT)
#define XSTATE_Hi16_ZMM_MASK (1ULL << XSTATE_Hi16_ZMM_BIT)
#define XSTATE_PKRU_MASK (1ULL << XSTATE_PKRU_BIT)
+#define XSTATE_XTILE_CFG_MASK (1ULL << XSTATE_XTILE_CFG_BIT)
+#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT)
+
+#define XSTATE_DYNAMIC_MASK (XSTATE_XTILE_DATA_MASK)
#define ESA_FEATURE_ALIGN64_BIT 1
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index ce27d3b1df..a35a1bf9fe 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu)
static void kvm_cpu_xsave_init(void)
{
static bool first = true;
- KVMState *s = kvm_state;
+ uint32_t eax, ebx, ecx, edx;
int i;
if (!first) {
@@ -100,11 +100,11 @@ static void kvm_cpu_xsave_init(void)
ExtSaveArea *esa = &x86_ext_save_areas[i];
if (esa->size) {
- int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX);
- if (sz != 0) {
- assert(esa->size == sz);
- esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX);
- esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX);
+ host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx);
+ if (eax != 0) {
+ assert(esa->size == eax);
+ esa->offset = ebx;
+ esa->ecx = ecx;
}
}
}
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 5a698bde19..e7f57d05a2 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -17,6 +17,7 @@
#include "qapi/error.h"
#include <sys/ioctl.h>
#include <sys/utsname.h>
+#include <sys/syscall.h>
#include <linux/kvm.h>
#include "standard-headers/asm-x86/kvm_para.h"
@@ -347,6 +348,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
struct kvm_cpuid2 *cpuid;
uint32_t ret = 0;
uint32_t cpuid_1_edx;
+ uint64_t bitmask;
cpuid = get_supported_cpuid(s);
@@ -404,6 +406,25 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
if (!has_msr_arch_capabs) {
ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
}
+ } else if (function == 0xd && index == 0 &&
+ (reg == R_EAX || reg == R_EDX)) {
+ struct kvm_device_attr attr = {
+ .group = 0,
+ .attr = KVM_X86_XCOMP_GUEST_SUPP,
+ .addr = (unsigned long) &bitmask
+ };
+
+ bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
+ if (!sys_attr) {
+ warn_report("cannot get sys attribute capabilities %d", sys_attr);
+ }
+
+ int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
+ if (rc == -1 && (errno == ENXIO || errno == EINVAL)) {
+ warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
+ "error: %d", rc);
+ }
+ ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
} else if (function == 0x80000001 && reg == R_ECX) {
/*
* It's safe to enable TOPOEXT even if it's not returned by
@@ -5050,3 +5071,39 @@ bool kvm_arch_cpu_check_are_resettable(void)
{
return !sev_es_enabled();
}
+
+#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+
+void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
+{
+ KVMState *s = kvm_state;
+ uint64_t supported;
+
+ mask &= XSTATE_DYNAMIC_MASK;
+ if (!mask) {
+ return;
+ }
+ /*
+ * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
+ * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
+ * about them already because they are not supported features.
+ */
+ supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
+ supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
+ mask &= supported;
+
+ while (mask) {
+ int bit = ctz64(mask);
+ int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
+ if (rc) {
+ /*
+ * Older kernel version (<5.17) do not support
+ * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
+ * any dynamic feature from kvm_arch_get_supported_cpuid.
+ */
+ warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
+ "for feature bit %d", bit);
+ }
+ mask &= ~BIT_ULL(bit);
+ }
+}
diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
index a978509d50..4124912c20 100644
--- a/target/i386/kvm/kvm_i386.h
+++ b/target/i386/kvm/kvm_i386.h
@@ -52,5 +52,6 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp);
uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address);
bool kvm_enable_sgx_provisioning(KVMState *s);
+void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask);
#endif
--
2.27.0
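What the kvm_request_xsave_components() loop boils down to for AMX is a single arch_prctl call. A standalone sketch, with the request code copied from the patch and the component bit from XSTATE_XTILE_DATA_BIT; it needs Linux 5.17 or newer and must run before any vcpu is created:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
#define XSTATE_XTILE_DATA_BIT     18

int main(void)
{
    /* Ask the kernel for guest permission to use XTILEDATA */
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
                XSTATE_XTILE_DATA_BIT)) {
        fprintf(stderr, "ARCH_REQ_XCOMP_GUEST_PERM failed: %s\n",
                strerror(errno));
        return 1;
    }
    printf("guest XTILEDATA permission granted\n");
    return 0;
}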

View File

@ -0,0 +1,182 @@
From bb1b53e5d0b67d97042ea3c33b5c4c80e33809f2 Mon Sep 17 00:00:00 2001
From: Zeng Guang <guang.zeng@intel.com>
Date: Wed, 16 Feb 2022 22:04:33 -0800
Subject: [PATCH 08/10] x86: Support XFD and AMX xsave data migration
from mainline-v7.0.0-rc0
commit cdec2b753b487d9e8aab028231c35d87789ea083
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit cdec2b753b48 ("x86: Support XFD and AMX xsave data
migration")
------------------------------------------------
x86: Support XFD and AMX xsave data migration
XFD (eXtended Feature Disable) allows enabling a
feature on xsave state while preventing specific
user threads from using the feature.
Support saving and restoring the XFD MSRs if CPUID.D.1.EAX[4]
enumerates as valid. Likewise migrate the MSRs and the
related xsave state as necessary.
Signed-off-by: Zeng Guang <guang.zeng@intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-8-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.h | 9 +++++++++
target/i386/kvm/kvm.c | 18 +++++++++++++++++
target/i386/machine.c | 46 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 73 insertions(+)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 74e66c352c..eaa99c302f 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -506,6 +506,9 @@ typedef enum X86Seg {
#define MSR_VM_HSAVE_PA 0xc0010117
+#define MSR_IA32_XFD 0x000001c4
+#define MSR_IA32_XFD_ERR 0x000001c5
+
#define MSR_IA32_BNDCFGS 0x00000d90
#define MSR_IA32_XSS 0x00000da0
#define MSR_IA32_UMWAIT_CONTROL 0xe1
@@ -871,6 +874,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define CPUID_7_1_EAX_AVX_VNNI (1U << 4)
/* AVX512 BFloat16 Instruction */
#define CPUID_7_1_EAX_AVX512_BF16 (1U << 5)
+/* XFD Extend Feature Disabled */
+#define CPUID_D_1_EAX_XFD (1U << 4)
/* Packets which contain IP payload have LIP values */
#define CPUID_14_0_ECX_LIP (1U << 31)
@@ -1612,6 +1617,10 @@ typedef struct CPUX86State {
uint64_t msr_rtit_cr3_match;
uint64_t msr_rtit_addrs[MAX_RTIT_ADDRS];
+ /* Per-VCPU XFD MSRs */
+ uint64_t msr_xfd;
+ uint64_t msr_xfd_err;
+
/* exception/interrupt handling */
int error_code;
int exception_is_int;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index b0b22dcf7c..49fca5ea88 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -3219,6 +3219,13 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
env->msr_ia32_sgxlepubkeyhash[3]);
}
+ if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
+ kvm_msr_entry_add(cpu, MSR_IA32_XFD,
+ env->msr_xfd);
+ kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
+ env->msr_xfd_err);
+ }
+
/* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
* kvm_put_msr_feature_control. */
}
@@ -3570,6 +3577,11 @@ static int kvm_get_msrs(X86CPU *cpu)
kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
}
+ if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
+ kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
+ kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
+ }
+
ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
if (ret < 0) {
return ret;
@@ -3866,6 +3878,12 @@ static int kvm_get_msrs(X86CPU *cpu)
env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
msrs[i].data;
break;
+ case MSR_IA32_XFD:
+ env->msr_xfd = msrs[i].data;
+ break;
+ case MSR_IA32_XFD_ERR:
+ env->msr_xfd_err = msrs[i].data;
+ break;
}
}
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 83c2b91529..3977e9d8f8 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1455,6 +1455,48 @@ static const VMStateDescription vmstate_msr_intel_sgx = {
}
};
+static bool xfd_msrs_needed(void *opaque)
+{
+ X86CPU *cpu = opaque;
+ CPUX86State *env = &cpu->env;
+
+ return !!(env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD);
+}
+
+static const VMStateDescription vmstate_msr_xfd = {
+ .name = "cpu/msr_xfd",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = xfd_msrs_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(env.msr_xfd, X86CPU),
+ VMSTATE_UINT64(env.msr_xfd_err, X86CPU),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+#ifdef TARGET_X86_64
+static bool amx_xtile_needed(void *opaque)
+{
+ X86CPU *cpu = opaque;
+ CPUX86State *env = &cpu->env;
+
+ return !!(env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_AMX_TILE);
+}
+
+static const VMStateDescription vmstate_amx_xtile = {
+ .name = "cpu/intel_amx_xtile",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = amx_xtile_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8_ARRAY(env.xtilecfg, X86CPU, 64),
+ VMSTATE_UINT8_ARRAY(env.xtiledata, X86CPU, 8192),
+ VMSTATE_END_OF_LIST()
+ }
+};
+#endif
+
const VMStateDescription vmstate_x86_cpu = {
.name = "cpu",
.version_id = 12,
@@ -1593,6 +1635,10 @@ const VMStateDescription vmstate_x86_cpu = {
#endif
&vmstate_msr_tsx_ctrl,
&vmstate_msr_intel_sgx,
+ &vmstate_msr_xfd,
+#ifdef TARGET_X86_64
+ &vmstate_amx_xtile,
+#endif
NULL
}
};
--
2.27.0
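The MSR the patch migrates can be probed on the host for experimentation. A sketch assuming the Linux msr driver is loaded and the program runs as root; this is not part of QEMU:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_IA32_XFD 0x1c4

int main(void)
{
    uint64_t val;
    int fd = open("/dev/cpu/0/msr", O_RDONLY);

    /* pread() at the MSR index reads 8 bytes of register content */
    if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_XFD) != sizeof(val)) {
        perror("rdmsr MSR_IA32_XFD");
        return 1;
    }
    printf("MSR_IA32_XFD = 0x%016llx\n", (unsigned long long)val);
    close(fd);
    return 0;
}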

View File

@ -0,0 +1,186 @@
From e98958c23ea5b15a8e84642c373336a8898cd63f Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@intel.com>
Date: Wed, 16 Feb 2022 22:04:32 -0800
Subject: [PATCH 07/10] x86: add support for KVM_CAP_XSAVE2 and AMX state
migration
from mainline-v7.0.0-rc0
commit e56dd3c70abb31893c61ac834109fa7a38841330
category: feature
feature: SPR AMX support for Qemu
bugzilla: https://gitee.com/openeuler/intel-qemu/issues/I5VHOB
Intel-SIG: commit e56dd3c70abb ("x86: add support for KVM_CAP_XSAVE2 and
AMX state migration")
-------------------------------------------------------
x86: add support for KVM_CAP_XSAVE2 and AMX state migration
When dynamic xfeatures (e.g. AMX) are used by the guest, the xsave
area can be larger than 4KB. KVM_GET_XSAVE2 and KVM_SET_XSAVE
under KVM_CAP_XSAVE2 work with an xsave buffer larger than 4KB.
Always use the new ioctls under KVM_CAP_XSAVE2 when KVM supports it.
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Zeng Guang <guang.zeng@intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220217060434.52460-7-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jason Zeng <jason.zeng@intel.com>
---
target/i386/cpu.h | 4 ++++
target/i386/kvm/kvm.c | 42 ++++++++++++++++++++++++--------------
target/i386/xsave_helper.c | 28 +++++++++++++++++++++++++
3 files changed, 59 insertions(+), 15 deletions(-)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 09c725ee13..74e66c352c 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1523,6 +1523,10 @@ typedef struct CPUX86State {
uint64_t opmask_regs[NB_OPMASK_REGS];
YMMReg zmmh_regs[CPU_NB_REGS];
ZMMReg hi16_zmm_regs[CPU_NB_REGS];
+#ifdef TARGET_X86_64
+ uint8_t xtilecfg[64];
+ uint8_t xtiledata[8192];
+#endif
/* sysenter registers */
uint32_t sysenter_cs;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 60ccdec5e8..b0b22dcf7c 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -123,6 +123,7 @@ static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;
static int has_xsave;
+static int has_xsave2;
static int has_xcrs;
static int has_pit_state2;
static int has_exception_payload;
@@ -1585,6 +1586,26 @@ static Error *invtsc_mig_blocker;
#define KVM_MAX_CPUID_ENTRIES 100
+static void kvm_init_xsave(CPUX86State *env)
+{
+ if (has_xsave2) {
+ env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
+ } else if (has_xsave) {
+ env->xsave_buf_len = sizeof(struct kvm_xsave);
+ } else {
+ return;
+ }
+
+ env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
+ memset(env->xsave_buf, 0, env->xsave_buf_len);
+ /*
+ * The allocated storage must be large enough for all of the
+ * possible XSAVE state components.
+ */
+ assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
+ env->xsave_buf_len);
+}
+
int kvm_arch_init_vcpu(CPUState *cs)
{
struct {
@@ -1614,6 +1635,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
cpuid_i = 0;
+ has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
+
r = kvm_arch_set_tsc_khz(cs);
if (r < 0) {
return r;
@@ -2003,19 +2026,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
if (r) {
goto fail;
}
-
- if (has_xsave) {
- env->xsave_buf_len = sizeof(struct kvm_xsave);
- env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
- memset(env->xsave_buf, 0, env->xsave_buf_len);
-
- /*
- * The allocated storage must be large enough for all of the
- * possible XSAVE state components.
- */
- assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX)
- <= env->xsave_buf_len);
- }
+ kvm_init_xsave(env);
max_nested_state_len = kvm_max_nested_state_length();
if (max_nested_state_len > 0) {
@@ -3263,13 +3274,14 @@ static int kvm_get_xsave(X86CPU *cpu)
{
CPUX86State *env = &cpu->env;
void *xsave = env->xsave_buf;
- int ret;
+ int type, ret;
if (!has_xsave) {
return kvm_get_fpu(cpu);
}
- ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
+ type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
+ ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
if (ret < 0) {
return ret;
}
diff --git a/target/i386/xsave_helper.c b/target/i386/xsave_helper.c
index ac61a96344..996e9f3bfe 100644
--- a/target/i386/xsave_helper.c
+++ b/target/i386/xsave_helper.c
@@ -126,6 +126,20 @@ void x86_cpu_xsave_all_areas(X86CPU *cpu, void *buf, uint32_t buflen)
memcpy(pkru, &env->pkru, sizeof(env->pkru));
}
+
+ e = &x86_ext_save_areas[XSTATE_XTILE_CFG_BIT];
+ if (e->size && e->offset) {
+ XSaveXTILECFG *tilecfg = buf + e->offset;
+
+ memcpy(tilecfg, &env->xtilecfg, sizeof(env->xtilecfg));
+ }
+
+ e = &x86_ext_save_areas[XSTATE_XTILE_DATA_BIT];
+ if (e->size && e->offset && buflen >= e->size + e->offset) {
+ XSaveXTILEDATA *tiledata = buf + e->offset;
+
+ memcpy(tiledata, &env->xtiledata, sizeof(env->xtiledata));
+ }
#endif
}
@@ -247,5 +261,19 @@ void x86_cpu_xrstor_all_areas(X86CPU *cpu, const void *buf, uint32_t buflen)
pkru = buf + e->offset;
memcpy(&env->pkru, pkru, sizeof(env->pkru));
}
+
+ e = &x86_ext_save_areas[XSTATE_XTILE_CFG_BIT];
+ if (e->size && e->offset) {
+ const XSaveXTILECFG *tilecfg = buf + e->offset;
+
+ memcpy(&env->xtilecfg, tilecfg, sizeof(env->xtilecfg));
+ }
+
+ e = &x86_ext_save_areas[XSTATE_XTILE_DATA_BIT];
+ if (e->size && e->offset && buflen >= e->size + e->offset) {
+ const XSaveXTILEDATA *tiledata = buf + e->offset;
+
+ memcpy(&env->xtiledata, tiledata, sizeof(env->xtiledata));
+ }
#endif
}
--
2.27.0
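The capability check that sizes the buffer in kvm_init_xsave() can be reproduced directly against /dev/kvm. A minimal sketch, assuming kernel headers of 5.17 or newer (which define KVM_CAP_XSAVE2); a positive return value is the buffer size KVM_GET_XSAVE2 needs, and zero means only the legacy 4K KVM_GET_XSAVE is available:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);

    if (kvm < 0) {
        perror("/dev/kvm");
        return 1;
    }
    /* For KVM_CAP_XSAVE2 the extension check doubles as a size query */
    printf("KVM_CAP_XSAVE2: %d\n", ioctl(kvm, KVM_CHECK_EXTENSION,
                                         KVM_CAP_XSAVE2));
    close(kvm);
    return 0;
}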