qos: Some bugfixes for power_qos/cachembw_qos/cpu_qos

cpu_qos: Register reset_domain_bandwidth as exit func
after adding power_qos job
power_qos/cachembw_qos: Add type check for environment
variables

Signed-off-by: sundongxu <sundongxu3@huawei.com>
This commit is contained in:
sundongxu 2022-09-03 17:37:19 +08:00
parent bd4c9df0b7
commit e11c92af46
3 changed files with 202 additions and 1 deletions

View File

@ -0,0 +1,113 @@
From a165c7131e09749401b01b3a7d568e96a9ca8b3a Mon Sep 17 00:00:00 2001
From: Dongxu Sun <sundongxu3@huawei.com>
Date: Sat, 3 Sep 2022 15:02:47 +0800
Subject: [PATCH 1/2] cpu_qos: Register reset_domain_bandwidth as exit func
after adding power_qos job
Currently, the domain bandwidth can be changed by
skylark only in power_qos job, so reset_domain_bandwidth
should be registered after adding power_qos job.
Besides, there is no need to reset domain bandwidth
when the domain cgroup path does not exist, since the
domain may have been stopped.
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
---
qos_controller/cpucontroller.py | 21 ++++++++++-----------
skylark.py | 3 ++-
util.py | 5 +++--
3 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/qos_controller/cpucontroller.py b/qos_controller/cpucontroller.py
index f2a67e0..26b1240 100644
--- a/qos_controller/cpucontroller.py
+++ b/qos_controller/cpucontroller.py
@@ -63,12 +63,12 @@ class CpuController:
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
try:
- util.file_write(quota_path, str(domain_quota_us))
+ util.file_write(quota_path, str(domain_quota_us), log=False)
except IOError as error:
- LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s"
- % (domain.domain_name, domain.domain_id, str(error)))
# If VM doesn't stop, raise exception.
if os.access(quota_path, os.F_OK):
+ LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s"
+ % (domain.domain_name, domain.domain_id, str(error)))
raise
else:
LOGGER.info("Domain %s(%d) cpu bandwidth was limitted to %s"
@@ -83,12 +83,12 @@ class CpuController:
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
try:
- util.file_write(quota_path, str(initial_bandwidth))
+ util.file_write(quota_path, str(initial_bandwidth), log=False)
except IOError as error:
- LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!"
- % (domain.domain_name, domain.domain_id, str(error)))
# If VM doesn't stop, raise exception.
if os.access(quota_path, os.F_OK):
+ LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!"
+ % (domain.domain_name, domain.domain_id, str(error)))
raise
else:
LOGGER.info("Domain %s(%d) cpu bandwidth was recoveried to %s"
@@ -101,13 +101,12 @@ class CpuController:
domain = guest_info.low_prio_vm_dict.get(domain_id)
initial_bandwidth = domain.global_quota_config
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
-
try:
- util.file_write(quota_path, str(initial_bandwidth))
+ util.file_write(quota_path, str(initial_bandwidth), log=False)
except IOError:
- LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!"
- % (domain.domain_name, domain.domain_id, initial_bandwidth))
- # This is on exiting path, make no sense to raise exception.
+ if os.access(quota_path, os.F_OK):
+ LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!"
+ % (domain.domain_name, domain.domain_id, initial_bandwidth))
else:
LOGGER.info("Domain %s(%d) cpu bandwidth was reset to %s"
% (domain.domain_name, domain.domain_id, initial_bandwidth))
diff --git a/skylark.py b/skylark.py
index 6224f9b..2ec9862 100644
--- a/skylark.py
+++ b/skylark.py
@@ -84,8 +84,9 @@ class QosManager:
def init_qos_controller(self):
self.cpu_controller.set_low_priority_cgroup()
+ if os.getenv("POWER_QOS_MANAGEMENT", "false").lower() == "true":
+ atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info)
self.cachembw_controller.init_cachembw_controller(self.data_collector.host_info.resctrl_info)
- atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info)
self.net_controller.init_net_controller()
def start_scheduler(self):
diff --git a/util.py b/util.py
index 70f6f5a..2b8c3db 100644
--- a/util.py
+++ b/util.py
@@ -31,13 +31,14 @@ def file_read(file_path):
raise
-def file_write(file_path, value):
+def file_write(file_path, value, log=True):
try:
with open(file_path, 'wb') as file:
file.truncate()
file.write(str.encode(value))
except FileNotFoundError as error:
- LOGGER.error(str(error))
+ if log:
+ LOGGER.error(str(error))
raise
--
2.17.1

View File

@ -0,0 +1,82 @@
From 931b1d3767f6c62639d46cc51f9a831cba112de3 Mon Sep 17 00:00:00 2001
From: Dongxu Sun <sundongxu3@huawei.com>
Date: Sat, 3 Sep 2022 16:39:51 +0800
Subject: [PATCH 2/2] power_qos/cachembw_qos: Add type check for environment
variables
Add type check for environment variables.
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
---
qos_analyzer/poweranalyzer.py | 16 ++++++++++------
qos_controller/cachembwcontroller.py | 9 +++++++--
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/qos_analyzer/poweranalyzer.py b/qos_analyzer/poweranalyzer.py
index 23f6369..04fe51c 100644
--- a/qos_analyzer/poweranalyzer.py
+++ b/qos_analyzer/poweranalyzer.py
@@ -17,6 +17,7 @@ Description: This file is used for providing a power analyzer
# @code
import os
+import sys
from logger import LOGGER
from qos_controller import cpucontroller
@@ -34,13 +35,16 @@ class PowerAnalyzer:
self.qos_controller = cpucontroller.CpuController()
def set_hotspot_threshold(self, data_collector):
- self.tdp_threshold = float(os.getenv("TDP_THRESHOLD"))
- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD"))
- self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD"))
- self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD"))
+ try:
+ self.tdp_threshold = float(os.getenv("TDP_THRESHOLD", "0.98"))
+ self.freq_threshold = float(os.getenv("FREQ_THRESHOLD", "0.98"))
+ self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD", "3"))
+ self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD", "0.9"))
+ except ValueError:
+ LOGGER.error("Threshold parameter type is incorrect, please check.")
+ sys.exit(1)
self.__check_threshold_validity()
-
- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD")) * data_collector.host_info.cpu_turbofreq_mhz
+ self.freq_threshold = self.freq_threshold * data_collector.host_info.cpu_turbofreq_mhz
LOGGER.info("Frequency threshold is %.2f, abnormal times threshold is %d, bandwidth threshold is %.2f"
% (self.freq_threshold, self.abnormal_threshold, self.quota_threshold))
diff --git a/qos_controller/cachembwcontroller.py b/qos_controller/cachembwcontroller.py
index bbfe08f..a56ca59 100644
--- a/qos_controller/cachembwcontroller.py
+++ b/qos_controller/cachembwcontroller.py
@@ -17,6 +17,7 @@ Description: This file is used for control CACHE/MBW of low priority vms
# @code
import os
+import sys
import errno
import util
@@ -57,11 +58,15 @@ class CacheMBWController:
self.set_low_init_alloc(resctrl_info)
def __get_low_init_alloc(self, resctrl_info: ResctrlInfo):
- low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS"))
+ try:
+ low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS", "0.1"))
+ low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS", "2"))
+ except ValueError:
+ LOGGER.error("MIN_MBW_LOW_VMS or MIN_LLC_WAYS_LOW_VMS parameter type is invalid.")
+ sys.exit(1)
if not LOW_MBW_INIT_FLOOR <= low_vms_mbw_init <= LOW_MBW_INIT_CEIL:
LOGGER.error("Invalid environment variables: MIN_MBW_LOW_VMS")
raise Exception
- low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS"))
if not LOW_CACHE_INIT_FLOOR <= low_vms_cache_init <= LOW_CACHE_INIT_CEIL:
LOGGER.error("Invalid environment variables: MIN_LLC_WAYS_LOW_VMS")
raise Exception
--
2.17.1

View File

@ -1,6 +1,6 @@
Name: skylark
Version: 1.0.0
Release: 5
Release: 6
Summary: Skylark is a next-generation QoS-aware scheduler.
License: Mulan PSL v2
@ -11,6 +11,8 @@ Patch0001: guestinfo-Take-another-VM-stop-reason-to-account.patch
Patch0002: cpu_qos-Add-aditional-setting-for-cpu-QOS.patch
Patch0003: cachembw_qos-Add-a-job-to-sync-VM-pids-to-resctrl.patch
Patch0004: framework-create-pidfile-after-os.fork-in-child-proc.patch
Patch0005: cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch
Patch0006: power_qos-cachembw_qos-Add-type-check-for-environmen.patch
BuildRequires: python3-devel make gcc coreutils systemd-units
Requires: python3 python3-APScheduler python3-libvirt
@ -60,6 +62,10 @@ make install DESTDIR=%{buildroot}
%changelog
* Sat Sep 03 2022 Dongxu Sun <sundongxu3@huawei.com> - 1.0.0-6
- cpu_qos: Register reset_domain_bandwidth as exit func after adding power_qos job
- power_qos/cachembw_qos: Add type check for environment variables
* Thu Aug 25 2022 Dongxu Sun <sundongxu3@huawei.com> - 1.0.0-5
- framework: create pidfile after os.fork in child process