qos: Some bugfixes for power_qos/cachembw_qos/cpu_qos
cpu_qos: Register reset_domain_bandwidth as exit func after adding power_qos job; power_qos/cachembw_qos: Add type check for environment variables. Signed-off-by: sundongxu <sundongxu3@huawei.com>
This commit is contained in:
parent
bd4c9df0b7
commit
e11c92af46
113
cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch
Normal file
113
cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
From a165c7131e09749401b01b3a7d568e96a9ca8b3a Mon Sep 17 00:00:00 2001
|
||||||
|
From: Dongxu Sun <sundongxu3@huawei.com>
|
||||||
|
Date: Sat, 3 Sep 2022 15:02:47 +0800
|
||||||
|
Subject: [PATCH 1/2] cpu_qos: Register reset_domain_bandwidth as exit func
|
||||||
|
after adding power_qos job
|
||||||
|
|
||||||
|
Currently, the domain bandwidth can be changed by
|
||||||
|
skylark only in power_qos job, so reset_domain_bandwidth
|
||||||
|
should be registered after adding power_qos job.
|
||||||
|
Besides, there is no need to reset domain bandwidth
|
||||||
|
when the domain cgroup path does not exist, since the
|
||||||
|
domain may have been stopped.
|
||||||
|
|
||||||
|
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
|
||||||
|
---
|
||||||
|
qos_controller/cpucontroller.py | 21 ++++++++++-----------
|
||||||
|
skylark.py | 3 ++-
|
||||||
|
util.py | 5 +++--
|
||||||
|
3 files changed, 15 insertions(+), 14 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/qos_controller/cpucontroller.py b/qos_controller/cpucontroller.py
|
||||||
|
index f2a67e0..26b1240 100644
|
||||||
|
--- a/qos_controller/cpucontroller.py
|
||||||
|
+++ b/qos_controller/cpucontroller.py
|
||||||
|
@@ -63,12 +63,12 @@ class CpuController:
|
||||||
|
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
|
||||||
|
|
||||||
|
try:
|
||||||
|
- util.file_write(quota_path, str(domain_quota_us))
|
||||||
|
+ util.file_write(quota_path, str(domain_quota_us), log=False)
|
||||||
|
except IOError as error:
|
||||||
|
- LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s"
|
||||||
|
- % (domain.domain_name, domain.domain_id, str(error)))
|
||||||
|
# If VM doesn't stop, raise exception.
|
||||||
|
if os.access(quota_path, os.F_OK):
|
||||||
|
+ LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s"
|
||||||
|
+ % (domain.domain_name, domain.domain_id, str(error)))
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
LOGGER.info("Domain %s(%d) cpu bandwidth was limitted to %s"
|
||||||
|
@@ -83,12 +83,12 @@ class CpuController:
|
||||||
|
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
|
||||||
|
|
||||||
|
try:
|
||||||
|
- util.file_write(quota_path, str(initial_bandwidth))
|
||||||
|
+ util.file_write(quota_path, str(initial_bandwidth), log=False)
|
||||||
|
except IOError as error:
|
||||||
|
- LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!"
|
||||||
|
- % (domain.domain_name, domain.domain_id, str(error)))
|
||||||
|
# If VM doesn't stop, raise exception.
|
||||||
|
if os.access(quota_path, os.F_OK):
|
||||||
|
+ LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!"
|
||||||
|
+ % (domain.domain_name, domain.domain_id, str(error)))
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
LOGGER.info("Domain %s(%d) cpu bandwidth was recoveried to %s"
|
||||||
|
@@ -101,13 +101,12 @@ class CpuController:
|
||||||
|
domain = guest_info.low_prio_vm_dict.get(domain_id)
|
||||||
|
initial_bandwidth = domain.global_quota_config
|
||||||
|
quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us")
|
||||||
|
-
|
||||||
|
try:
|
||||||
|
- util.file_write(quota_path, str(initial_bandwidth))
|
||||||
|
+ util.file_write(quota_path, str(initial_bandwidth), log=False)
|
||||||
|
except IOError:
|
||||||
|
- LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!"
|
||||||
|
- % (domain.domain_name, domain.domain_id, initial_bandwidth))
|
||||||
|
- # This is on exiting path, make no sense to raise exception.
|
||||||
|
+ if os.access(quota_path, os.F_OK):
|
||||||
|
+ LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!"
|
||||||
|
+ % (domain.domain_name, domain.domain_id, initial_bandwidth))
|
||||||
|
else:
|
||||||
|
LOGGER.info("Domain %s(%d) cpu bandwidth was reset to %s"
|
||||||
|
% (domain.domain_name, domain.domain_id, initial_bandwidth))
|
||||||
|
diff --git a/skylark.py b/skylark.py
|
||||||
|
index 6224f9b..2ec9862 100644
|
||||||
|
--- a/skylark.py
|
||||||
|
+++ b/skylark.py
|
||||||
|
@@ -84,8 +84,9 @@ class QosManager:
|
||||||
|
|
||||||
|
def init_qos_controller(self):
|
||||||
|
self.cpu_controller.set_low_priority_cgroup()
|
||||||
|
+ if os.getenv("POWER_QOS_MANAGEMENT", "false").lower() == "true":
|
||||||
|
+ atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info)
|
||||||
|
self.cachembw_controller.init_cachembw_controller(self.data_collector.host_info.resctrl_info)
|
||||||
|
- atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info)
|
||||||
|
self.net_controller.init_net_controller()
|
||||||
|
|
||||||
|
def start_scheduler(self):
|
||||||
|
diff --git a/util.py b/util.py
|
||||||
|
index 70f6f5a..2b8c3db 100644
|
||||||
|
--- a/util.py
|
||||||
|
+++ b/util.py
|
||||||
|
@@ -31,13 +31,14 @@ def file_read(file_path):
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
-def file_write(file_path, value):
|
||||||
|
+def file_write(file_path, value, log=True):
|
||||||
|
try:
|
||||||
|
with open(file_path, 'wb') as file:
|
||||||
|
file.truncate()
|
||||||
|
file.write(str.encode(value))
|
||||||
|
except FileNotFoundError as error:
|
||||||
|
- LOGGER.error(str(error))
|
||||||
|
+ if log:
|
||||||
|
+ LOGGER.error(str(error))
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
82
power_qos-cachembw_qos-Add-type-check-for-environmen.patch
Normal file
82
power_qos-cachembw_qos-Add-type-check-for-environmen.patch
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
From 931b1d3767f6c62639d46cc51f9a831cba112de3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Dongxu Sun <sundongxu3@huawei.com>
|
||||||
|
Date: Sat, 3 Sep 2022 16:39:51 +0800
|
||||||
|
Subject: [PATCH 2/2] power_qos/cachembw_qos: Add type check for environment
|
||||||
|
variables
|
||||||
|
|
||||||
|
Add type check for environment variables.
|
||||||
|
|
||||||
|
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
|
||||||
|
---
|
||||||
|
qos_analyzer/poweranalyzer.py | 16 ++++++++++------
|
||||||
|
qos_controller/cachembwcontroller.py | 9 +++++++--
|
||||||
|
2 files changed, 17 insertions(+), 8 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/qos_analyzer/poweranalyzer.py b/qos_analyzer/poweranalyzer.py
|
||||||
|
index 23f6369..04fe51c 100644
|
||||||
|
--- a/qos_analyzer/poweranalyzer.py
|
||||||
|
+++ b/qos_analyzer/poweranalyzer.py
|
||||||
|
@@ -17,6 +17,7 @@ Description: This file is used for providing a power analyzer
|
||||||
|
# @code
|
||||||
|
|
||||||
|
import os
|
||||||
|
+import sys
|
||||||
|
|
||||||
|
from logger import LOGGER
|
||||||
|
from qos_controller import cpucontroller
|
||||||
|
@@ -34,13 +35,16 @@ class PowerAnalyzer:
|
||||||
|
self.qos_controller = cpucontroller.CpuController()
|
||||||
|
|
||||||
|
def set_hotspot_threshold(self, data_collector):
|
||||||
|
- self.tdp_threshold = float(os.getenv("TDP_THRESHOLD"))
|
||||||
|
- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD"))
|
||||||
|
- self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD"))
|
||||||
|
- self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD"))
|
||||||
|
+ try:
|
||||||
|
+ self.tdp_threshold = float(os.getenv("TDP_THRESHOLD", "0.98"))
|
||||||
|
+ self.freq_threshold = float(os.getenv("FREQ_THRESHOLD", "0.98"))
|
||||||
|
+ self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD", "3"))
|
||||||
|
+ self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD", "0.9"))
|
||||||
|
+ except ValueError:
|
||||||
|
+ LOGGER.error("Threshold parameter type is incorrect, please check.")
|
||||||
|
+ sys.exit(1)
|
||||||
|
self.__check_threshold_validity()
|
||||||
|
-
|
||||||
|
- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD")) * data_collector.host_info.cpu_turbofreq_mhz
|
||||||
|
+ self.freq_threshold = self.freq_threshold * data_collector.host_info.cpu_turbofreq_mhz
|
||||||
|
LOGGER.info("Frequency threshold is %.2f, abnormal times threshold is %d, bandwidth threshold is %.2f"
|
||||||
|
% (self.freq_threshold, self.abnormal_threshold, self.quota_threshold))
|
||||||
|
|
||||||
|
diff --git a/qos_controller/cachembwcontroller.py b/qos_controller/cachembwcontroller.py
|
||||||
|
index bbfe08f..a56ca59 100644
|
||||||
|
--- a/qos_controller/cachembwcontroller.py
|
||||||
|
+++ b/qos_controller/cachembwcontroller.py
|
||||||
|
@@ -17,6 +17,7 @@ Description: This file is used for control CACHE/MBW of low priority vms
|
||||||
|
# @code
|
||||||
|
|
||||||
|
import os
|
||||||
|
+import sys
|
||||||
|
import errno
|
||||||
|
|
||||||
|
import util
|
||||||
|
@@ -57,11 +58,15 @@ class CacheMBWController:
|
||||||
|
self.set_low_init_alloc(resctrl_info)
|
||||||
|
|
||||||
|
def __get_low_init_alloc(self, resctrl_info: ResctrlInfo):
|
||||||
|
- low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS"))
|
||||||
|
+ try:
|
||||||
|
+ low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS", "0.1"))
|
||||||
|
+ low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS", "2"))
|
||||||
|
+ except ValueError:
|
||||||
|
+ LOGGER.error("MIN_MBW_LOW_VMS or MIN_LLC_WAYS_LOW_VMS parameter type is invalid.")
|
||||||
|
+ sys.exit(1)
|
||||||
|
if not LOW_MBW_INIT_FLOOR <= low_vms_mbw_init <= LOW_MBW_INIT_CEIL:
|
||||||
|
LOGGER.error("Invalid environment variables: MIN_MBW_LOW_VMS")
|
||||||
|
raise Exception
|
||||||
|
- low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS"))
|
||||||
|
if not LOW_CACHE_INIT_FLOOR <= low_vms_cache_init <= LOW_CACHE_INIT_CEIL:
|
||||||
|
LOGGER.error("Invalid environment variables: MIN_LLC_WAYS_LOW_VMS")
|
||||||
|
raise Exception
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -1,6 +1,6 @@
|
|||||||
Name: skylark
|
Name: skylark
|
||||||
Version: 1.0.0
|
Version: 1.0.0
|
||||||
Release: 5
|
Release: 6
|
||||||
Summary: Skylark is a next-generation QoS-aware scheduler.
|
Summary: Skylark is a next-generation QoS-aware scheduler.
|
||||||
|
|
||||||
License: Mulan PSL v2
|
License: Mulan PSL v2
|
||||||
@ -11,6 +11,8 @@ Patch0001: guestinfo-Take-another-VM-stop-reason-to-account.patch
|
|||||||
Patch0002: cpu_qos-Add-aditional-setting-for-cpu-QOS.patch
|
Patch0002: cpu_qos-Add-aditional-setting-for-cpu-QOS.patch
|
||||||
Patch0003: cachembw_qos-Add-a-job-to-sync-VM-pids-to-resctrl.patch
|
Patch0003: cachembw_qos-Add-a-job-to-sync-VM-pids-to-resctrl.patch
|
||||||
Patch0004: framework-create-pidfile-after-os.fork-in-child-proc.patch
|
Patch0004: framework-create-pidfile-after-os.fork-in-child-proc.patch
|
||||||
|
Patch0005: cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch
|
||||||
|
Patch0006: power_qos-cachembw_qos-Add-type-check-for-environmen.patch
|
||||||
|
|
||||||
BuildRequires: python3-devel make gcc coreutils systemd-units
|
BuildRequires: python3-devel make gcc coreutils systemd-units
|
||||||
Requires: python3 python3-APScheduler python3-libvirt
|
Requires: python3 python3-APScheduler python3-libvirt
|
||||||
@ -60,6 +62,10 @@ make install DESTDIR=%{buildroot}
|
|||||||
|
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Sat Sep 03 2022 Dongxu Sun <sundongxu3@huawei.com> - 1.0.0-6
|
||||||
|
- cpu_qos: Register reset_domain_bandwidth as exit func after adding power_qos job
|
||||||
|
- power_qos/cachembw_qos: Add type check for environment variables
|
||||||
|
|
||||||
* Thu Aug 25 2022 Dongxu Sun <sundongxu3@huawei.com> - 1.0.0-5
|
* Thu Aug 25 2022 Dongxu Sun <sundongxu3@huawei.com> - 1.0.0-5
|
||||||
- framework: create pidfile after os.fork in child process
|
- framework: create pidfile after os.fork in child process
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user