diff --git a/0001-optimize-cause-location-with-time-delay.patch b/0001-optimize-cause-location-with-time-delay.patch new file mode 100644 index 0000000..2030464 --- /dev/null +++ b/0001-optimize-cause-location-with-time-delay.patch @@ -0,0 +1,88 @@ +From be8f48eed633d99aaf9eadd25d7562391d0807b9 Mon Sep 17 00:00:00 2001 +From: algorithmofdish +Date: Wed, 14 Dec 2022 15:30:06 +0800 +Subject: [PATCH] perf(infer): optimize cause location with time delay + +--- + cause_inference/abnormal_event.py | 8 ++++++++ + cause_inference/cause_infer.py | 11 ++++++++--- + 2 files changed, 16 insertions(+), 3 deletions(-) + +diff --git a/cause_inference/abnormal_event.py b/cause_inference/abnormal_event.py +index f55c3d0..599d72d 100644 +--- a/cause_inference/abnormal_event.py ++++ b/cause_inference/abnormal_event.py +@@ -2,6 +2,7 @@ import json + from enum import Enum + from queue import Queue, Empty + from typing import List ++import time + + from kafka import KafkaConsumer + +@@ -37,6 +38,7 @@ class AbnEvtMgt: + except Empty as ex: + raise NoKpiEventException from ex + ++ self.wait_future_evts(abn_kpi.timestamp) + self.consume_kpi_evts_with_deadline(abn_kpi.timestamp) + self.consume_metric_evts_with_deadline(abn_kpi.timestamp) + self.clear_aging_evts(abn_kpi.timestamp) +@@ -145,6 +147,12 @@ class AbnEvtMgt: + def is_future(self, evt_ts, cur_ts): + return evt_ts > cur_ts + self.future_duration + ++ def wait_future_evts(self, evt_ts): ++ cur_ts = int(time.time()) * 1000 ++ if evt_ts <= cur_ts < evt_ts + self.future_duration: ++ wait_sec = (evt_ts + self.future_duration - cur_ts) // 1000 ++ time.sleep(wait_sec) ++ + + def preprocess_abn_score(score): + return max(0, score) +diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py +index 82a83e1..b22768f 100644 +--- a/cause_inference/cause_infer.py ++++ b/cause_inference/cause_infer.py +@@ -58,10 +58,14 @@ class CauseLocator: + @staticmethod + def filter_causes(causes: List[Cause]) -> List[Cause]: + res = [] ++ dup = set() + for cause in causes: + filtered_cause = CauseLocator.clear_virtual_cause(cause) + if filtered_cause is not None: +- res.append(filtered_cause) ++ key = (filtered_cause.metric_id, filtered_cause.entity_id) ++ if key not in dup: ++ dup.add(key) ++ res.append(filtered_cause) + return res + + def construct_causal_graph(self, entity_causal_relations: List[tuple], abn_metrics: List[AbnormalEvent], +@@ -106,10 +110,11 @@ class CauseLocator: + self.topo_ts = self.topo_db_mgt.query_recent_topo_ts(self.abn_kpi.timestamp // 1000) + + def calc_corr_score(self, causal_graph: CausalGraph): ++ end_ts = self.abn_kpi.timestamp // 1000 + infer_config.infer_conf.get('evt_future_duration') + if not self.abn_kpi.hist_data: + hist_data = self.metric_db_mgt.query_metric_hist_data(self.abn_kpi.abnormal_metric_id, + self.abn_kpi.metric_labels, +- self.topo_ts) ++ end_ts) + self.abn_kpi.set_hist_data(hist_data) + + for node_id, node_attrs in causal_graph.entity_cause_graph.nodes.items(): +@@ -120,7 +125,7 @@ class CauseLocator: + + abn_metrics = causal_graph.get_abnormal_metrics(node_id) + for metric_id, metric_attrs in abn_metrics.items(): +- metric_hist_data = self.metric_db_mgt.query_metric_hist_data(metric_id, metric_labels, self.topo_ts) ++ metric_hist_data = self.metric_db_mgt.query_metric_hist_data(metric_id, metric_labels, end_ts) + + data_trend = trend(metric_hist_data) + metric_attrs.setdefault('real_trend', data_trend) +-- +2.21.0.windows.1 + diff --git a/0002-optimize-infer-rule-config.patch b/0002-optimize-infer-rule-config.patch new file mode 100644 index 0000000..ff9e8d4 --- /dev/null +++ b/0002-optimize-infer-rule-config.patch @@ -0,0 +1,82 @@ +From aeec448c24f3d724a8b79a9732091d8a833cedc5 Mon Sep 17 00:00:00 2001 +From: algorithmofdish +Date: Fri, 16 Dec 2022 17:04:51 +0800 +Subject: [PATCH] refactor(infer): optimize infer rule config + +--- + README.md | 2 +- + cause_inference/causal_graph.py | 7 +++++++ + cause_inference/output.py | 2 +- + config/infer-rule.yaml | 5 +---- + 4 files changed, 10 insertions(+), 6 deletions(-) + +diff --git a/README.md b/README.md +index 91d8afe..422ae86 100644 +--- a/README.md ++++ b/README.md +@@ -198,7 +198,7 @@ gala-spider 项目提供了两个功能模块,它们分别是: + - prometheus_server :指定 Prometheus 服务器地址 + - arangodb_server :指定 arangodb 服务器地址 + - kafka_server :指定 kafka 服务器地址 +- - log_level :指定 gala-spider 日志打印级别 ++ - log_level :指定 gala-inference 日志打印级别 + + 此外,如果需要从宿主机的配置文件中启动容器,可通过挂载卷的方式执行: + +diff --git a/cause_inference/causal_graph.py b/cause_inference/causal_graph.py +index f429bd6..10ad3ba 100644 +--- a/cause_inference/causal_graph.py ++++ b/cause_inference/causal_graph.py +@@ -67,6 +67,13 @@ class CausalGraph: + else: + abn_metrics[abn_metric.abnormal_metric_id] = abn_metric.to_dict() + ++ metric_labels = abn_metrics[abn_metric.abnormal_metric_id].get('metric_labels') ++ if not metric_labels: ++ metric_labels = dict(node_attrs.get('raw_data', {})) ++ if 'metrics' in metric_labels: ++ metric_labels.pop('metrics') ++ abn_metrics[abn_metric.abnormal_metric_id].update({'metric_labels': metric_labels}) ++ + def get_abnormal_metrics(self, node_id) -> dict: + return self.entity_cause_graph.nodes[node_id].get('abnormal_metrics', {}) + +diff --git a/cause_inference/output.py b/cause_inference/output.py +index 983b10c..51b9a54 100644 +--- a/cause_inference/output.py ++++ b/cause_inference/output.py +@@ -39,7 +39,7 @@ def format_cause_metrics(causes: List[Cause]): + 'metric_labels': node_attrs.get('metric_labels', {}), + 'timestamp': node_attrs.get('timestamp'), + 'desc': node_attrs.get('desc'), +- 'score': node_attrs.get('corr_score', 0.0), ++ 'score': cause.cause_score, + 'keyword': cause_keyword_mgt.get_keyword_of_entity(node_attrs.get('entity_type')), + } + path = [] +diff --git a/config/infer-rule.yaml b/config/infer-rule.yaml +index e88db7b..d5d1b51 100644 +--- a/config/infer-rule.yaml ++++ b/config/infer-rule.yaml +@@ -10,10 +10,6 @@ metric_categories: + metrics: + - gala_gopher_proc_read_bytes + - gala_gopher_proc_write_bytes +- - gala_gopher_proc_less_4k_io_read +- - gala_gopher_proc_less_4k_io_write +- - gala_gopher_proc_greater_4k_io_read +- - gala_gopher_proc_greater_4k_io_write + trend: rise + - + category: PROC_IO_DELAY +@@ -49,6 +45,7 @@ metric_categories: + - gala_gopher_block_latency_req_last + - gala_gopher_block_latency_req_sum + - gala_gopher_block_latency_req_jitter ++ - gala_gopher_block_count_latency_req + nic: + - + category: NIC_DROP +-- +2.21.0.windows.1 + diff --git a/gala-spider-1.0.0.tar.gz b/gala-spider-1.0.0.tar.gz deleted file mode 100644 index d4c266a..0000000 Binary files a/gala-spider-1.0.0.tar.gz and /dev/null differ diff --git a/gala-spider-1.0.1.tar.gz b/gala-spider-1.0.1.tar.gz new file mode 100644 index 0000000..88eefd1 Binary files /dev/null and b/gala-spider-1.0.1.tar.gz differ diff --git a/gala-spider.spec b/gala-spider.spec index bec76e4..b7d5379 100644 --- a/gala-spider.spec +++ b/gala-spider.spec @@ -1,8 +1,8 @@ %define debug_package %{nil} Name: gala-spider -Version: 1.0.0 -Release: 2 +Version: 1.0.1 +Release: 3 Summary: OS topological graph storage service and cause inference service for gala-ops project License: MulanPSL2 URL: https://gitee.com/openeuler/gala-spider @@ -11,10 +11,19 @@ Source0: %{name}-%{version}.tar.gz BuildRequires: python3-setuptools systemd Requires: python3-%{name} = %{version}-%{release} +patch0: 0001-optimize-cause-location-with-time-delay.patch +patch1: 0002-optimize-infer-rule-config.patch + %description OS topological graph storage service for gala-ops project +%package -n gala-ops +Summary: gala-anteater/spider/inference installation package +Requires: gala-anteater gala-inference gala-spider + +%description -n gala-ops +This package requires gala-anteater/spider/inference, allowing users to install them all at once %package -n python3-%{name} Summary: Python3 package of gala-spider @@ -41,7 +50,7 @@ Python3 package of gala-inference %prep -%autosetup +%autosetup -p1 %build @@ -64,7 +73,7 @@ fi %systemd_preun gala-spider.service %postun -%systemd_postun gala-spider.service +%systemd_postun_with_restart gala-spider.service %pre -n gala-inference @@ -79,7 +88,7 @@ fi %systemd_preun gala-inference.service %postun -n gala-inference -%systemd_postun gala-inference.service +%systemd_postun_with_restart gala-inference.service %files @@ -91,6 +100,8 @@ fi %{_bindir}/spider-storage %{_unitdir}/gala-spider.service +%files -n gala-ops +%defattr(-,root,root) %files -n python3-%{name} %{python3_sitelib}/spider/* @@ -103,6 +114,7 @@ fi %config(noreplace) %{_sysconfdir}/gala-inference/gala-inference.yaml %config(noreplace) %{_sysconfdir}/gala-inference/ext-observe-meta.yaml %config(noreplace) %{_sysconfdir}/gala-inference/infer-rule.yaml +%config(noreplace) %{_sysconfdir}/gala-inference/cause-keyword.yaml %{_bindir}/gala-inference %{_unitdir}/gala-inference.service @@ -113,6 +125,27 @@ fi %changelog +* Wed Dec 21 2022 algorithmofdish - 1.0.1-3 +- Optimize infer rule config + +* Thu Dec 15 2022 algorithmofdish - 1.0.1-2 +- Optimize cause location with time delay + +* Wed Dec 14 2022 algorithmofdish - 1.0.1-1 +- Update to 1.0.1: support cross host cause location + +* Sat Dec 10 2022 algorithmofdish - 1.0.0-6 +- Adaptation for abnormal event output change + +* Tue Dec 6 2022 Zhen Chen - 1.0.0-5 +- Provide gala-ops package to install anteater/spider/inference at once + +* Mon Dec 5 2022 tangxin xie - 1.0.0-4 +- add restart service when update software + +* Tue Nov 22 2022 algorithmofdish - 1.0.0-3 +- Cause inference optimization + * Mon Nov 14 2022 Zhen Chen - 1.0.0-2 - Update 1.0.0 tarball