From e11c92af468b96b00324060bfd88c4de7aeb6607 Mon Sep 17 00:00:00 2001 From: sundongxu Date: Sat, 3 Sep 2022 17:37:19 +0800 Subject: [PATCH] qos: Some bugfixes for power_qos/cachembw_qos/cpu_qos cpu_qos: Register reset_domain_bandwidth as exit func after adding power_qos job power_qos/cachembw_qos: Add type check for environment variables Signed-off-by: sundongxu --- ...-reset_domain_bandwidth-as-exit-func.patch | 113 ++++++++++++++++++ ...bw_qos-Add-type-check-for-environmen.patch | 82 +++++++++++++ skylark.spec | 8 +- 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch create mode 100644 power_qos-cachembw_qos-Add-type-check-for-environmen.patch diff --git a/cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch b/cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch new file mode 100644 index 0000000..bd3a810 --- /dev/null +++ b/cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch @@ -0,0 +1,113 @@ +From a165c7131e09749401b01b3a7d568e96a9ca8b3a Mon Sep 17 00:00:00 2001 +From: Dongxu Sun +Date: Sat, 3 Sep 2022 15:02:47 +0800 +Subject: [PATCH 1/2] cpu_qos: Register reset_domain_bandwidth as exit func + after adding power_qos job + +Currently, the domain bandwidth can be changed by +skylark only in power_qos job, so reset_domain_bandwidth +should be registered after adding power_qos job. +Besides, there is no need to reset domain bandwidth +when the domain cgroup path does not exist, since the +domain may have been stopped. 
+ +Signed-off-by: Dongxu Sun +--- + qos_controller/cpucontroller.py | 21 ++++++++++----------- + skylark.py | 3 ++- + util.py | 5 +++-- + 3 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/qos_controller/cpucontroller.py b/qos_controller/cpucontroller.py +index f2a67e0..26b1240 100644 +--- a/qos_controller/cpucontroller.py ++++ b/qos_controller/cpucontroller.py +@@ -63,12 +63,12 @@ class CpuController: + quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us") + + try: +- util.file_write(quota_path, str(domain_quota_us)) ++ util.file_write(quota_path, str(domain_quota_us), log=False) + except IOError as error: +- LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s" +- % (domain.domain_name, domain.domain_id, str(error))) + # If VM doesn't stop, raise exception. + if os.access(quota_path, os.F_OK): ++ LOGGER.error("Failed to limit domain %s(%d) cpu bandwidth: %s" ++ % (domain.domain_name, domain.domain_id, str(error))) + raise + else: + LOGGER.info("Domain %s(%d) cpu bandwidth was limitted to %s" +@@ -83,12 +83,12 @@ class CpuController: + quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us") + + try: +- util.file_write(quota_path, str(initial_bandwidth)) ++ util.file_write(quota_path, str(initial_bandwidth), log=False) + except IOError as error: +- LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!" +- % (domain.domain_name, domain.domain_id, str(error))) + # If VM doesn't stop, raise exception. + if os.access(quota_path, os.F_OK): ++ LOGGER.error("Failed to recovery domain %s(%d) cpu bandwidth: %s!" 
++ % (domain.domain_name, domain.domain_id, str(error))) + raise + else: + LOGGER.info("Domain %s(%d) cpu bandwidth was recoveried to %s" +@@ -101,13 +101,12 @@ class CpuController: + domain = guest_info.low_prio_vm_dict.get(domain_id) + initial_bandwidth = domain.global_quota_config + quota_path = os.path.join(vm_slices_path, domain.cgroup_name, "cpu.cfs_quota_us") +- + try: +- util.file_write(quota_path, str(initial_bandwidth)) ++ util.file_write(quota_path, str(initial_bandwidth), log=False) + except IOError: +- LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!" +- % (domain.domain_name, domain.domain_id, initial_bandwidth)) +- # This is on exiting path, make no sense to raise exception. ++ if os.access(quota_path, os.F_OK): ++ LOGGER.error("Failed to reset domain %s(%d) cpu bandwidth to its initial bandwidth %s!" ++ % (domain.domain_name, domain.domain_id, initial_bandwidth)) + else: + LOGGER.info("Domain %s(%d) cpu bandwidth was reset to %s" + % (domain.domain_name, domain.domain_id, initial_bandwidth)) +diff --git a/skylark.py b/skylark.py +index 6224f9b..2ec9862 100644 +--- a/skylark.py ++++ b/skylark.py +@@ -84,8 +84,9 @@ class QosManager: + + def init_qos_controller(self): + self.cpu_controller.set_low_priority_cgroup() ++ if os.getenv("POWER_QOS_MANAGEMENT", "false").lower() == "true": ++ atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info) + self.cachembw_controller.init_cachembw_controller(self.data_collector.host_info.resctrl_info) +- atexit.register(self.cpu_controller.reset_domain_bandwidth, self.data_collector.guest_info) + self.net_controller.init_net_controller() + + def start_scheduler(self): +diff --git a/util.py b/util.py +index 70f6f5a..2b8c3db 100644 +--- a/util.py ++++ b/util.py +@@ -31,13 +31,14 @@ def file_read(file_path): + raise + + +-def file_write(file_path, value): ++def file_write(file_path, value, log=True): + try: + with open(file_path, 'wb') as file: + 
file.truncate() + file.write(str.encode(value)) + except FileNotFoundError as error: +- LOGGER.error(str(error)) ++ if log: ++ LOGGER.error(str(error)) + raise + + +-- +2.17.1 + diff --git a/power_qos-cachembw_qos-Add-type-check-for-environmen.patch b/power_qos-cachembw_qos-Add-type-check-for-environmen.patch new file mode 100644 index 0000000..60b8191 --- /dev/null +++ b/power_qos-cachembw_qos-Add-type-check-for-environmen.patch @@ -0,0 +1,82 @@ +From 931b1d3767f6c62639d46cc51f9a831cba112de3 Mon Sep 17 00:00:00 2001 +From: Dongxu Sun +Date: Sat, 3 Sep 2022 16:39:51 +0800 +Subject: [PATCH 2/2] power_qos/cachembw_qos: Add type check for environment + variables + +Add type check for environment variables. + +Signed-off-by: Dongxu Sun +--- + qos_analyzer/poweranalyzer.py | 16 ++++++++++------ + qos_controller/cachembwcontroller.py | 9 +++++++-- + 2 files changed, 17 insertions(+), 8 deletions(-) + +diff --git a/qos_analyzer/poweranalyzer.py b/qos_analyzer/poweranalyzer.py +index 23f6369..04fe51c 100644 +--- a/qos_analyzer/poweranalyzer.py ++++ b/qos_analyzer/poweranalyzer.py +@@ -17,6 +17,7 @@ Description: This file is used for providing a power analyzer + # @code + + import os ++import sys + + from logger import LOGGER + from qos_controller import cpucontroller +@@ -34,13 +35,16 @@ class PowerAnalyzer: + self.qos_controller = cpucontroller.CpuController() + + def set_hotspot_threshold(self, data_collector): +- self.tdp_threshold = float(os.getenv("TDP_THRESHOLD")) +- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD")) +- self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD")) +- self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD")) ++ try: ++ self.tdp_threshold = float(os.getenv("TDP_THRESHOLD", "0.98")) ++ self.freq_threshold = float(os.getenv("FREQ_THRESHOLD", "0.98")) ++ self.abnormal_threshold = int(os.getenv("ABNORMAL_THRESHOLD", "3")) ++ self.quota_threshold = float(os.getenv("QUOTA_THRESHOLD", "0.9")) ++ except ValueError: ++ 
LOGGER.error("Threshold parameter type is incorrect, please check.") ++ sys.exit(1) + self.__check_threshold_validity() +- +- self.freq_threshold = float(os.getenv("FREQ_THRESHOLD")) * data_collector.host_info.cpu_turbofreq_mhz ++ self.freq_threshold = self.freq_threshold * data_collector.host_info.cpu_turbofreq_mhz + LOGGER.info("Frequency threshold is %.2f, abnormal times threshold is %d, bandwidth threshold is %.2f" + % (self.freq_threshold, self.abnormal_threshold, self.quota_threshold)) + +diff --git a/qos_controller/cachembwcontroller.py b/qos_controller/cachembwcontroller.py +index bbfe08f..a56ca59 100644 +--- a/qos_controller/cachembwcontroller.py ++++ b/qos_controller/cachembwcontroller.py +@@ -17,6 +17,7 @@ Description: This file is used for control CACHE/MBW of low priority vms + # @code + + import os ++import sys + import errno + + import util +@@ -57,11 +58,15 @@ class CacheMBWController: + self.set_low_init_alloc(resctrl_info) + + def __get_low_init_alloc(self, resctrl_info: ResctrlInfo): +- low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS")) ++ try: ++ low_vms_mbw_init = float(os.getenv("MIN_MBW_LOW_VMS", "0.1")) ++ low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS", "2")) ++ except ValueError: ++ LOGGER.error("MIN_MBW_LOW_VMS or MIN_LLC_WAYS_LOW_VMS parameter type is invalid.") ++ sys.exit(1) + if not LOW_MBW_INIT_FLOOR <= low_vms_mbw_init <= LOW_MBW_INIT_CEIL: + LOGGER.error("Invalid environment variables: MIN_MBW_LOW_VMS") + raise Exception +- low_vms_cache_init = int(os.getenv("MIN_LLC_WAYS_LOW_VMS")) + if not LOW_CACHE_INIT_FLOOR <= low_vms_cache_init <= LOW_CACHE_INIT_CEIL: + LOGGER.error("Invalid environment variables: MIN_LLC_WAYS_LOW_VMS") + raise Exception +-- +2.17.1 + diff --git a/skylark.spec b/skylark.spec index 079e467..478ba2a 100644 --- a/skylark.spec +++ b/skylark.spec @@ -1,6 +1,6 @@ Name: skylark Version: 1.0.0 -Release: 5 +Release: 6 Summary: Skylark is a next-generation QoS-aware scheduler. 
License: Mulan PSL v2 @@ -11,6 +11,8 @@ Patch0001: guestinfo-Take-another-VM-stop-reason-to-account.patch Patch0002: cpu_qos-Add-aditional-setting-for-cpu-QOS.patch Patch0003: cachembw_qos-Add-a-job-to-sync-VM-pids-to-resctrl.patch Patch0004: framework-create-pidfile-after-os.fork-in-child-proc.patch +Patch0005: cpu_qos-register-reset_domain_bandwidth-as-exit-func.patch +Patch0006: power_qos-cachembw_qos-Add-type-check-for-environmen.patch BuildRequires: python3-devel make gcc coreutils systemd-units Requires: python3 python3-APScheduler python3-libvirt @@ -60,6 +62,10 @@ make install DESTDIR=%{buildroot} %changelog +* Sat Sep 03 2022 Dongxu Sun - 1.0.0-6 +- cpu_qos: Register reset_domain_bandwidth as exit func after adding power_qos job +- power_qos/cachembw_qos: Add type check for environment variables + * Thu Aug 25 2022 Dongxu Sun - 1.0.0-5 - framework: create pidfile after os.fork in child process