#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (c) 2019 Huawei Technologies Co., Ltd.
# A-Tune is licensed under the Mulan PSL v1.
# You can use this software according to the terms and conditions of the Mulan PSL v1.
# You may obtain a copy of Mulan PSL v1 at:
#     http://license.coscl.org.cn/MulanPSL
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v1 for more details.
# Create: 2019-10-29

"""
This class is used to train models and characterize the system workload.
"""

import os
import glob
import hashlib
from collections import Counter

# sklearn.externals.joblib was removed in scikit-learn 0.23; the standalone
# joblib package provides the same dump/load API.
import joblib
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier


class WorkloadCharacterization:
    """Train clustering and classification models and identify the system workload type."""

    def __init__(self, model_path):
        self.model_path = model_path
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()
        self.dataset = None
        self.X = None
        self.y = None

    def parsing(self, data_path):
        """Load and concatenate every CSV file matched by the data_path pattern."""
        frames = []
        print(data_path)
        csvfiles = glob.glob(data_path)
        for csv in csvfiles:
            print("file:", csv)
            data = pd.read_csv(csv, index_col=None, header=None)
            frames.append(data)
        dataset = pd.concat(frames)
        self.dataset = dataset
        return dataset

    def clustering(self, data_path):
        """Cluster cpu/io/net/mem utilization into binary labels and persist the KMeans models."""
        complete_path = os.path.join(data_path, "*.csv")
        self.parsing(complete_path)
        dataset = self.dataset.drop([28, 41, 42, 43, 44, 57, 58], axis=1)
        cpu_util = pd.concat([dataset.iloc[:, 8], dataset.iloc[:, 45]], axis=1, join='inner')
        io_util = pd.concat([dataset.iloc[:, 3], dataset.iloc[:, 19]], axis=1, join='inner')
        net_util = dataset.iloc[:, 25]
        mem_total = dataset.iloc[:, 29]

        cpu_path = os.path.join(self.model_path, 'cpu_clu.m')
        io_path = os.path.join(self.model_path, 'io_clu.m')
        net_path = os.path.join(self.model_path, 'net_clu.m')
        mem_path = os.path.join(self.model_path, 'mem_clu.m')

        cpu = KMeans(n_clusters=2)
        cpu.fit(cpu_util)
        joblib.dump(cpu, cpu_path)
        c_pred = cpu.predict(cpu_util)

        io = KMeans(n_clusters=2)
        io.fit(io_util)
        joblib.dump(io, io_path)
        i_pred = io.predict(io_util)

        net = KMeans(n_clusters=2)
        net_samples = [[value] for value in net_util]
        net.fit(net_samples)
        joblib.dump(net, net_path)
        n_pred = net.predict(net_samples)

        mem = KMeans(n_clusters=2)
        mem_samples = [[value] for value in mem_total]
        mem.fit(mem_samples)
        joblib.dump(mem, mem_path)
        m_pred = mem.predict(mem_samples)

        result = pd.concat([pd.DataFrame(c_pred), pd.DataFrame(i_pred),
                            pd.DataFrame(n_pred), pd.DataFrame(m_pred)], axis=1)
        cpu_result = result.iloc[:, 0].map(str)
        io_result = result.iloc[:, 1].map(str)
        net_result = result.iloc[:, 2].map(str)
        mem_result = result.iloc[:, 3].map(str)
        # The label is a four-character string such as "1010", ordered cpu, io, net, mem.
        label = cpu_result.str.cat(io_result)
        label = label.str.cat(net_result)
        label = label.str.cat(mem_result)
        dataset.insert(0, 'label', label)
        dataset.to_csv('clusteringresult.csv')

    def labelling(self, data):
        """Predict the four binary resource labels for data with the saved KMeans models."""
        cpu_path = os.path.join(self.model_path, 'cpu_clu.m')
        io_path = os.path.join(self.model_path, 'io_clu.m')
        net_path = os.path.join(self.model_path, 'net_clu.m')
        mem_path = os.path.join(self.model_path, 'mem_clu.m')

        cpu = joblib.load(cpu_path)
        io = joblib.load(io_path)
        net = joblib.load(net_path)
        mem = joblib.load(mem_path)

        X = pd.DataFrame(data)

        cpu_util = pd.concat([X.iloc[:, 8], X.iloc[:, 45]], axis=1, join='inner')
        io_util = pd.concat([X.iloc[:, 3], X.iloc[:, 19]], axis=1, join='inner')
        net_util = [[x] for x in X.iloc[:, 25]]
        mem_total = [[x] for x in X.iloc[:, 29]]

        ctype = cpu.predict(cpu_util)
        itype = io.predict(io_util)
        ntype = net.predict(net_util)
        mtype = mem.predict(mem_total)
        # most_common(1)[0] is a (label, count) pair for the dominant cluster.
        cpred = Counter(ctype).most_common(1)[0]
        ipred = Counter(itype).most_common(1)[0]
        npred = Counter(ntype).most_common(1)[0]
        mpred = Counter(mtype).most_common(1)[0]

        print("The clustering result is", cpred[0], ipred[0], npred[0], mpred[0])
        print("The proportions of the dominant labels are", cpred[1] / len(data),
              ipred[1] / len(data), npred[1] / len(data), mpred[1] / len(data))
        return [cpred[0], ipred[0], npred[0], mpred[0]]

    def svm_clf(self, clfpath, kernel):
        """Train an SVM classifier with the given kernel and save it to clfpath."""
        X_train, X_test, y_train, y_test = tts(self.X, self.y, test_size=0.3)

        clf = svm.SVC(kernel=kernel, C=200)
        clf.fit(X_train, y_train)
        clf_score = clf.score(X_test, y_test)
        print("the accuracy of the %s-kernel SVM classification is %f" % (kernel, clf_score))
        joblib.dump(clf, clfpath)

    def ada_clf(self, clfpath):
        """Train an AdaBoost classifier and save it to clfpath."""
        X_train, X_test, y_train, y_test = tts(self.X, self.y, test_size=0.3)
        adaboost = AdaBoostClassifier(n_estimators=1000, learning_rate=0.05)
        adaboost.fit(X_train, y_train)
        adaboost_score = adaboost.score(X_test, y_test)
        print("the accuracy of adaboost is %f" % adaboost_score)
        print("the feature importances of this classifier are", adaboost.feature_importances_)
        joblib.dump(adaboost, clfpath)

    def xgb_clf(self, clfpath):
        """Train an XGBoost classifier and save it to clfpath."""
        X_train, X_test, y_train, y_test = tts(self.X, self.y, test_size=0.3)
        # 'colsample_btree' and 'slient' in the original were typos for the
        # xgboost parameters colsample_bytree and silent.
        model = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=6,
                              min_child_weight=0.5, gamma=0.01, subsample=1,
                              colsample_bytree=1, scale_pos_weight=1,
                              random_state=27, silent=0, alpha=100)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("the accuracy of xgboost is %f" % accuracy_score(y_test, y_pred))
        print("the feature importances of this classifier are", model.feature_importances_)
        joblib.dump(model, clfpath)

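    # The glob suffixes used below ("*1000.csv", "*0100.csv", ...) assume that each
    # training CSV embeds the four-bit cpu/io/net/mem label produced by clustering()
    # in its file name; a hypothetical "workloadA_1000.csv" would hold cpu-intensive
    # samples whose last column is the workload name.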
    def train(self, data_path):
        """Fit the label encoder on all samples, then train one classifier per workload type."""
        encoder_path = os.path.join(self.model_path, "encoder.pkl")
        complete_path = os.path.join(data_path, "*.csv")

        # The encoder must see every workload label once before the
        # per-type classifiers are trained.
        self.parsing(complete_path)
        self.y = self.encoder.fit_transform(self.dataset.iloc[:, -1])
        joblib.dump(self.encoder, encoder_path)

        # (name, csv pattern, classifier type, SVM kernel)
        subsets = [
            ("io", "*0100.csv", "svm", "sigmoid"),
            ("net", "*0010.csv", "svm", "sigmoid"),
            ("cpu", "*1000.csv", "xgb", None),
            ("cpumem", "*1001.csv", "svm", "rbf"),
            ("cpuio", "*1100.csv", "svm", "rbf"),
            ("cpunet", "*1010.csv", "svm", "rbf"),
            ("cpuiomem", "*1101.csv", "svm", "rbf"),
            ("cpunetmem", "*1011.csv", "svm", "rbf"),
        ]
        for name, pattern, clf_type, kernel in subsets:
            clf_path = os.path.join(self.model_path, "%s_clf.m" % name)
            scaler_path = os.path.join(self.model_path, "%s_scaler.pkl" % name)
            self.parsing(os.path.join(data_path, pattern))
            self.X = self.scaler.fit_transform(self.dataset.iloc[:, :-1])
            self.y = self.encoder.transform(self.dataset.iloc[:, -1])
            joblib.dump(self.scaler, scaler_path)
            if clf_type == "xgb":
                self.xgb_clf(clf_path)
            else:
                self.svm_clf(clf_path, kernel)
            print("The classifier for %s-intensive workload has been generated." % name)

    def identify(self, x):
        """Label the sample with the clustering models, then classify it with the matching model."""
        x = pd.DataFrame(x)
        x = x.drop([28, 41, 42, 43, 44, 57, 58], axis=1)
        label = self.labelling(x)

        encoder_path = os.path.join(self.model_path, 'encoder.pkl')

        # Cases that need no classifier: non-intensive, mem-only, and all-intensive workloads.
        if label == [0, 0, 0, 0]:
            print("The workload has been identified as non-intensive type")
            return " ", "default", 1
        if label == [0, 0, 0, 1]:
            print("The workload has been identified as mem-intensive type")
            return "memory", "in-memory_computing", 1
        if label == [1, 1, 1, 1]:
            print("The workload has been identified as cpu-io-net-mem-intensive type")
            return "cpu,io,network,memory", "big_database", 1

        # label -> (resource limit, model name prefix, printed type)
        cases = {
            (0, 0, 1, 0): ("network", "net", "network"),
            (0, 1, 0, 0): ("io", "io", "io"),
            (1, 0, 0, 0): ("cpu", "cpu", "cpu"),
            (1, 0, 0, 1): ("cpu,memory", "cpumem", "cpu-mem"),
            (1, 0, 1, 0): ("cpu,network", "cpunet", "cpu-net"),
            (1, 0, 1, 1): ("cpu,network,memory", "cpunetmem", "cpu-net-mem"),
            (1, 1, 0, 0): ("cpu,io", "cpuio", "cpu-io"),
            (1, 1, 0, 1): ("cpu,io,memory", "cpuiomem", "cpu-io-mem"),
        }
        case = cases.get(tuple(label))
        if case is None:
            print("Error: Unsupported classifier type")
            return "", "default", 0
        resourcelimit, prefix, type_name = case

        scaler = joblib.load(os.path.join(self.model_path, '%s_scaler.pkl' % prefix))
        x = scaler.transform(x)
        print("The workload has been identified as %s-intensive type" % type_name)
        model = joblib.load(os.path.join(self.model_path, '%s_clf.m' % prefix))
        result = model.predict(x)

        encoder = joblib.load(encoder_path)
        result = encoder.inverse_transform(result)
        print(result)
        prediction = Counter(result).most_common(1)[0]
        confidence = prediction[1] / len(result)
        if confidence > 0.5:
            return resourcelimit, prediction[0], confidence
        return resourcelimit, "default", confidence

    def retrain(self, data_path, custom_path):
        """Train a custom classifier from data_path and save it, with its scaler and encoder, next to custom_path."""
        custom_path = os.path.abspath(custom_path)
        (dirname, filename) = os.path.split(custom_path)
        (modelname, extension) = os.path.splitext(filename)
        scalername = modelname + '_scaler.pkl'
        encodername = modelname + '_encoder.pkl'

        data_path = os.path.join(data_path, "*.csv")
        self.parsing(data_path)
        self.X = self.scaler.fit_transform(self.dataset.iloc[:, :-1])
        self.y = self.encoder.fit_transform(self.dataset.iloc[:, -1])
        joblib.dump(self.scaler, os.path.join(dirname, scalername))
        joblib.dump(self.encoder, os.path.join(dirname, encodername))

        class_num = len(self.encoder.classes_)
        if class_num == 1:
            # SVC cannot fit a single class; relabel the last sample to create a second one.
            self.y[-1] = self.y[-1] + 1
        # The original call omitted the required kernel argument; 'rbf' is assumed
        # here, matching the multi-resource classifiers trained above.
        self.svm_clf(custom_path, 'rbf')

    def reidentify(self, x, custom_path):
        """Classify a sample with a custom model previously trained by retrain."""
        custom_path = os.path.abspath(custom_path)
        (dirname, filename) = os.path.split(custom_path)
        (modelname, extension) = os.path.splitext(filename)
        scalername = modelname + '_scaler.pkl'
        encodername = modelname + '_encoder.pkl'

        scaler = joblib.load(os.path.join(dirname, scalername))
        encoder = joblib.load(os.path.join(dirname, encodername))
        model = joblib.load(custom_path)

        x = scaler.transform(x)
        result = model.predict(x)
        result = encoder.inverse_transform(result)
        print(result)

        prediction = Counter(result).most_common(1)[0]
        confidence = prediction[1] / len(result)
        if confidence > 0.5:
            return prediction[0], confidence
        return "default", confidence
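

# A minimal usage sketch (not part of the original module): the paths below
# ("./models", "./training_data", "./new_samples.csv") are hypothetical, and the
# training CSVs are assumed to already follow the label-suffix naming convention
# described above train(). identify() expects rows with at least 59 raw columns,
# since columns 28, 41-44, 57 and 58 are dropped before labelling.
if __name__ == "__main__":
    characterizer = WorkloadCharacterization(model_path="./models")
    characterizer.clustering("./training_data")   # fit and save the four KMeans labellers
    characterizer.train("./training_data")        # fit and save one classifier per workload type

    samples = pd.read_csv("./new_samples.csv", index_col=None, header=None)
    resourcelimit, workload_type, confidence = characterizer.identify(samples)
    print(resourcelimit, workload_type, confidence)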