From 9fd05bda9c5789bee45de142468fec0227929927 Mon Sep 17 00:00:00 2001
From: haozi007
Date: Thu, 29 Jul 2021 12:04:58 +0100
Subject: [PATCH 7/9] support rollback of cluster creation

1. roll back the cluster if creation fails;
2. support partial success of cluster creation;

Signed-off-by: haozi007
---
 cmd/deploy.go                                 |  66 +++++-
 pkg/api/tools.go                              |  41 ++++
 pkg/api/types.go                              |   7 +
 pkg/clusterdeployment/binary/binary.go        |   3 +-
 .../binary/cleanupcluster/cleanupnode.go      |   4 +-
 pkg/clusterdeployment/clusterdeploy.go        | 208 +++++++++++++-----
 pkg/utils/nodemanager/node.go                 |   2 +-
 pkg/utils/nodemanager/nodemanager.go          |  27 ++-
 pkg/utils/utils.go                            |   4 +
 9 files changed, 292 insertions(+), 70 deletions(-)

diff --git a/cmd/deploy.go b/cmd/deploy.go
index 3aba440..12fdd8e 100644
--- a/cmd/deploy.go
+++ b/cmd/deploy.go
@@ -24,8 +24,64 @@ import (
 	"isula.org/eggo/pkg/clusterdeployment"
 )
 
-func deploy(ccfg *api.ClusterConfig) error {
-	return clusterdeployment.CreateCluster(ccfg)
+func deploy(conf *deployConfig) error {
+	if err := saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID)); err != nil {
+		return fmt.Errorf("save deploy config failed: %v", err)
+	}
+
+	ccfg := toClusterdeploymentConfig(conf)
+
+	cstatus, err := clusterdeployment.CreateCluster(ccfg)
+	if err != nil {
+		return err
+	}
+
+	if cstatus.FailureCnt > 0 {
+		// on partial success, update the saved cluster config to drop the failed nodes
+		var tmp []*HostConfig
+		for _, n := range conf.Masters {
+			if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+				continue
+			}
+			tmp = append(tmp, n)
+		}
+		conf.Masters = tmp
+
+		tmp = nil
+		for _, n := range conf.Workers {
+			if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+				continue
+			}
+			tmp = append(tmp, n)
+		}
+		conf.Workers = tmp
+
+		tmp = nil
+		for _, n := range conf.Etcds {
+			if success, ok := cstatus.StatusOfNodes[n.Ip]; ok && !success {
+				continue
+			}
+			tmp = append(tmp, n)
+		}
+		conf.Etcds = tmp
+
+		err = saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID))
+		if err != nil {
+			fmt.Printf("update config failed, remove cluster: %s\n", conf.ClusterID)
+			clusterdeployment.RemoveCluster(ccfg)
+			return fmt.Errorf("update config of cluster failed: %v", err)
+		}
+		fmt.Printf("update config of cluster: %s\n", conf.ClusterID)
+	}
+
+	fmt.Print(cstatus.Show())
+
+	if cstatus.Working {
+		fmt.Printf("To start using cluster: %s, you need to run the following as a regular user:\n\n", ccfg.Name)
+		fmt.Printf("\texport KUBECONFIG=%s/admin.conf\n\n", api.GetClusterHomePath(ccfg.Name))
+	}
+
+	return nil
 }
 
 func deployCluster(cmd *cobra.Command, args []string) error {
@@ -40,11 +96,7 @@ func deployCluster(cmd *cobra.Command, args []string) error {
 
 	// TODO: make sure config valid
 
-	if err := saveDeployConfig(conf, savedDeployConfigPath(conf.ClusterID)); err != nil {
-		return fmt.Errorf("save deploy config failed: %v", err)
-	}
-
-	if err := deploy(toClusterdeploymentConfig(conf)); err != nil {
+	if err := deploy(conf); err != nil {
 		return err
 	}
 
diff --git a/pkg/api/tools.go b/pkg/api/tools.go
index a73b9ea..2e4fb4c 100644
--- a/pkg/api/tools.go
+++ b/pkg/api/tools.go
@@ -7,6 +7,7 @@ import (
 	"github.com/sirupsen/logrus"
 
 	"isula.org/eggo/pkg/constants"
+	"k8s.io/apimachinery/pkg/util/json"
 )
 
 var (
@@ -80,3 +81,43 @@ func GetEtcdServers(ecc *EtcdClusterConfig) string {
 func IsCleanupSchedule(schedule Schedule) bool {
 	return schedule == SchedulePreCleanup || schedule == SchedulePostCleanup
 }
+
+func (hc HostConfig) DeepCopy() (*HostConfig, error) {
+	b, err := json.Marshal(hc)
+	if err != nil {
+		return nil, err
+	}
+	var result HostConfig
+	err = json.Unmarshal(b, &result)
+	return &result, err
+}
+
+func (cs *ClusterStatus) Show() string {
+	var sb strings.Builder
+	var fb strings.Builder
+
+	sb.WriteString("-------------------------------\n")
+	sb.WriteString("message: ")
+	sb.WriteString(cs.Message)
+	sb.WriteString("\nsummary: \n")
+	if cs.Working {
+		sb.WriteString(cs.ControlPlane)
+		sb.WriteString("\t\tsuccess")
+		sb.WriteString("\n")
+	}
+	for ip, ok := range cs.StatusOfNodes {
+		if ok {
+			sb.WriteString(ip)
+			sb.WriteString("\t\tsuccess")
+			sb.WriteString("\n")
+		} else {
+			fb.WriteString(ip)
+			fb.WriteString("\t\tfailure")
+			fb.WriteString("\n")
+		}
+	}
+	sb.WriteString(fb.String())
+	sb.WriteString("-------------------------------\n")
+
+	return sb.String()
+}
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 71f4f14..86da853 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -215,7 +215,14 @@ type ClusterConfig struct {
 }
 
 type ClusterStatus struct {
+	Message       string          `json:"message"`       // human readable summary of the deploy result
+	ControlPlane  string          `json:"controlplane"`  // address of the first control plane node
+	Working       bool            `json:"working"`       // true when the control plane is usable
+	StatusOfNodes map[string]bool `json:"statusOfNodes"` // key is node address; true means joined successfully
+	SuccessCnt    uint32          `json:"successCnt"`
+	FailureCnt    uint32          `json:"failureCnt"`
 }
+
 type InfrastructureAPI interface {
 	// TODO: should add other dependence cluster configurations
 	MachineInfraSetup(machine *HostConfig) error
diff --git a/pkg/clusterdeployment/binary/binary.go b/pkg/clusterdeployment/binary/binary.go
index cc6a6a0..dd260b0 100644
--- a/pkg/clusterdeployment/binary/binary.go
+++ b/pkg/clusterdeployment/binary/binary.go
@@ -533,7 +533,8 @@ func (bcp *BinaryClusterDeployment) PostNodeJoinHooks(node *api.HostConfig) erro
 		}
 	}
 
-	if utils.IsType(roles, (api.Master & api.Worker)) {
+	// taint and label only nodes that are both master and worker
+	if utils.IsType(roles, (api.Master | api.Worker)) {
 		if err := taintAndLabelNode(bcp.config.Name, node.Name); err != nil {
 			return err
 		}
diff --git a/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go b/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
index 76cd7ce..da0488a 100644
--- a/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
+++ b/pkg/clusterdeployment/binary/cleanupcluster/cleanupnode.go
@@ -232,7 +232,6 @@ func execRemoveWorkerTask(conf *api.ClusterConfig, hostconfig *api.HostConfig) e
 	task.SetIgnoreErrorFlag(taskRemoveWorker)
 
 	if err := nodemanager.RunTaskOnNodes(taskRemoveWorker, []string{master}); err != nil {
-		logrus.Errorf("run task for remove worker failed: %v", err)
 		return err
 	}
 
@@ -246,8 +245,7 @@ func CleanupNode(conf *api.ClusterConfig, hostconfig *api.HostConfig, delType ui
 
 	if utils.IsType(delType, api.Worker) {
 		if err := execRemoveWorkerTask(conf, hostconfig); err != nil {
-			logrus.Errorf("remove workers failed: %v", err)
-			return err
+			logrus.Warnf("ignore: remove workers failed: %v", err)
 		}
 	}
 
diff --git a/pkg/clusterdeployment/clusterdeploy.go b/pkg/clusterdeployment/clusterdeploy.go
index 016da2b..c4e0880 100644
--- a/pkg/clusterdeployment/clusterdeploy.go
+++ b/pkg/clusterdeployment/clusterdeploy.go
@@ -28,121 +28,215 @@ import (
 	"isula.org/eggo/pkg/utils/nodemanager"
 )
 
-func doCreateCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig) error {
-	var loadbalancer *api.HostConfig
-	var controlPlane *api.HostConfig
-	var joinNodes []*api.HostConfig
-	var joinNodeIDs []string
+func splitNodes(nodes []*api.HostConfig) (*api.HostConfig, []*api.HostConfig, []*api.HostConfig, []string) {
+	var lb *api.HostConfig
+	var masters []*api.HostConfig
+	var workers []*api.HostConfig
 	var etcdNodes []string
-	// Step1: setup infrastructure for all nodes in the cluster
-	for _, n := range cc.Nodes {
-		if err := handler.MachineInfraSetup(n); err != nil {
-			return err
-		}
+
+	for _, n := range nodes {
 		if utils.IsType(n.Type, api.LoadBalance) {
-			loadbalancer = n
+			lb = n
 		}
 		if utils.IsType(n.Type, api.ETCD) {
 			etcdNodes = append(etcdNodes, n.Address)
 		}
+
+		if utils.IsType(n.Type, api.Master) {
+			masters = append(masters, n)
+			// a node that is both master and worker goes into masters only
+			continue
+		}
+
 		if utils.IsType(n.Type, api.Worker) {
-			joinNodes = append(joinNodes, n)
-			joinNodeIDs = append(joinNodeIDs, n.Address)
+			workers = append(workers, n)
 		}
 
-		if utils.IsType(n.Type, api.Master) {
-			if controlPlane == nil {
-				controlPlane = n
-			} else {
-				joinNodes = append(joinNodes, n)
-				joinNodeIDs = append(joinNodeIDs, n.Address)
-			}
+	}
+
+	return lb, masters, workers, etcdNodes
+}
+
+func doJoinNodeOfCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, masters, workers []*api.HostConfig) ([]string, []*api.HostConfig, error) {
+	var joinedNodeIDs []string
+	var failedNodes []*api.HostConfig
+	for _, node := range workers {
+		if err := handler.ClusterNodeJoin(node); err != nil {
+			failedNodes = append(failedNodes, node)
+			continue
+		}
+		joinedNodeIDs = append(joinedNodeIDs, node.Address)
+	}
+	for _, node := range masters {
+		if err := handler.ClusterNodeJoin(node); err != nil {
+			failedNodes = append(failedNodes, node)
+			continue
+		}
+		joinedNodeIDs = append(joinedNodeIDs, node.Address)
+	}
+	// wait all nodes ready
+	if err := nodemanager.WaitNodesFinishWithProgress(joinedNodeIDs, time.Minute*5); err != nil {
+		tFailedNodes, successNodes := nodemanager.CheckNodesStatus(joinedNodeIDs)
+		// update joined and failed nodes
+		failedNodes = append(failedNodes, tFailedNodes...)
+		joinedNodeIDs = successNodes
+		if len(successNodes) == 0 {
+			return nil, failedNodes, err
+		}
+		logrus.Warnf("some nodes failed to finish joining: %v, continue with partial success", err)
+	}
+
+	return joinedNodeIDs, failedNodes, nil
+}
+
+func doCreateCluster(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, cstatus *api.ClusterStatus) ([]*api.HostConfig, error) {
+	loadbalancer, masters, workers, etcdNodes := splitNodes(cc.Nodes)
+
+	if len(masters) == 0 {
+		return nil, fmt.Errorf("no master found")
+	}
+	controlPlaneNode, err := masters[0].DeepCopy()
+	if err != nil {
+		return nil, err
+	}
+	cstatus.ControlPlane = controlPlaneNode.Address
+	masters = masters[1:] // the remaining masters join like ordinary nodes
+
+	// Step1: setup infrastructure for all nodes in the cluster
+	for _, n := range cc.Nodes {
+		if err = handler.MachineInfraSetup(n); err != nil {
+			return nil, err
 		}
 	}
 
 	// Step2: run precreate cluster hooks
-	if err := handler.PreCreateClusterHooks(); err != nil {
-		return err
+	if err = handler.PreCreateClusterHooks(); err != nil {
+		return nil, err
 	}
 
 	// Step3: setup etcd cluster
 	// wait infrastructure task success on nodes of etcd cluster
-	if err := nodemanager.WaitNodesFinishWithProgress(etcdNodes, time.Minute*5); err != nil {
-		return err
+	if err = nodemanager.WaitNodesFinishWithProgress(etcdNodes, time.Minute*5); err != nil {
+		return nil, err
 	}
-	if err := handler.EtcdClusterSetup(); err != nil {
-		return err
+	if err = handler.EtcdClusterSetup(); err != nil {
+		return nil, err
 	}
+
 	// Step4: setup loadbalance for cluster
-	if err := handler.LoadBalancerSetup(loadbalancer); err != nil {
-		return err
+	if err = handler.LoadBalancerSetup(loadbalancer); err != nil {
+		return nil, err
 	}
+
 	// Step5: setup control plane for cluster
-	if err := handler.ClusterControlPlaneInit(controlPlane); err != nil {
-		return err
+	if err = handler.ClusterControlPlaneInit(controlPlaneNode); err != nil {
+		return nil, err
 	}
 	// wait controlplane setup task success
-	if err := nodemanager.WaitNodesFinish([]string{controlPlane.Address}, time.Minute*5); err != nil {
-		return err
+	if err = nodemanager.WaitNodesFinish([]string{controlPlaneNode.Address}, time.Minute*5); err != nil {
+		return nil, err
 	}
-
-	//Step6: setup left nodes for cluster
-	for _, node := range joinNodes {
-		if err := handler.ClusterNodeJoin(node); err != nil {
-			return err
+	if utils.IsType(controlPlaneNode.Type, api.Worker) {
+		controlPlaneNode.Type = utils.ClearType(controlPlaneNode.Type, api.Master) // join the copy as a worker only
+		if err = handler.ClusterNodeJoin(controlPlaneNode); err != nil {
+			return nil, err
 		}
 	}
-	// wait all nodes ready
-	if err := nodemanager.WaitNodesFinishWithProgress(joinNodeIDs, time.Minute*5); err != nil {
-		return err
+
+	// Step6: setup left nodes for cluster
+	joinedNodeIDs, failedNodes, err := doJoinNodeOfCluster(handler, cc, masters, workers)
+	if err != nil {
+		return nil, err
 	}
 
 	//Step7: setup addons for cluster
-	if err := handler.AddonsSetup(); err != nil {
-		return err
+	if err = handler.AddonsSetup(); err != nil {
+		return nil, err
 	}
 
 	// Step8: run postcreate cluster hooks
-	if err := handler.PostCreateClusterHooks(); err != nil {
-		return err
+	if err = handler.PostCreateClusterHooks(); err != nil {
+		return nil, err
 	}
 
-	allNodes := utils.GetAllIPs(cc.Nodes)
-	if err := nodemanager.WaitNodesFinishWithProgress(allNodes, time.Minute*5); err != nil {
-		return err
+	if err = nodemanager.WaitNodesFinishWithProgress(append(joinedNodeIDs, controlPlaneNode.Address), time.Minute*5); err != nil {
+		return nil, err
 	}
 
-	return nil
+	for _, sid := range joinedNodeIDs {
+		cstatus.StatusOfNodes[sid] = true
+		cstatus.SuccessCnt++
+	}
+	cstatus.Working = true
+
+	return failedNodes, nil
 }
 
-func CreateCluster(cc *api.ClusterConfig) error {
+func rollbackFailedNodes(handler api.ClusterDeploymentAPI, nodes []*api.HostConfig) {
+	if len(nodes) == 0 {
+		return
+	}
+	var rollIDs []string
+	for _, n := range nodes {
+		// best-effort cleanup; errors are deliberately ignored
+		handler.ClusterNodeCleanup(n, n.Type)
+		handler.MachineInfraDestroy(n)
+		rollIDs = append(rollIDs, n.Address)
+	}
+
+	if err := nodemanager.WaitNodesFinishWithProgress(rollIDs, time.Minute*5); err != nil {
+		logrus.Warnf("rollback of failed nodes did not finish: %v", err)
+	}
+}
+
+func CreateCluster(cc *api.ClusterConfig) (api.ClusterStatus, error) {
+	cstatus := api.ClusterStatus{
+		StatusOfNodes: make(map[string]bool),
+	}
 	if cc == nil {
-		return fmt.Errorf("[cluster] cluster config is required")
+		return cstatus, fmt.Errorf("[cluster] cluster config is required")
 	}
 
 	creator, err := manager.GetClusterDeploymentDriver(cc.DeployDriver)
 	if err != nil {
 		logrus.Errorf("[cluster] get cluster deployment driver: %s failed: %v", cc.DeployDriver, err)
-		return err
+		return cstatus, err
 	}
 	handler, err := creator(cc)
 	if err != nil {
 		logrus.Errorf("[cluster] create cluster deployment instance with driver: %s, failed: %v", cc.DeployDriver, err)
-		return err
+		return cstatus, err
 	}
 	defer handler.Finish()
 
 	// prepare eggo config directory
 	if err := os.MkdirAll(api.GetClusterHomePath(cc.Name), 0750); err != nil {
-		return err
+		return cstatus, err
 	}
 
-	if err := doCreateCluster(handler, cc); err != nil {
-		return err
+	failedNodes, err := doCreateCluster(handler, cc, &cstatus)
+	if err != nil {
+		logrus.Warnf("rollback cluster: %s", cc.Name)
+		doRemoveCluster(handler, cc)
+		cstatus.Message = err.Error()
+		return cstatus, err
+	}
+	// rollback failed nodes
+	rollbackFailedNodes(handler, failedNodes)
+	// update status of cluster
+	if len(failedNodes) > 0 {
+		var failureIDs []string
+		for _, fn := range failedNodes {
+			failureIDs = append(failureIDs, fn.Address)
+			cstatus.StatusOfNodes[fn.Address] = false
+			cstatus.FailureCnt++
+		}
+		logrus.Warnf("[cluster] failed nodes: %v", failureIDs)
+		cstatus.Message = "cluster created with partial success"
+		return cstatus, nil
 	}
-
-	logrus.Infof("[cluster] create cluster '%s' successed", cc.Name)
-	return nil
+	cstatus.Message = "cluster created successfully"
+	return cstatus, nil
 }
 
 func doJoinNode(handler api.ClusterDeploymentAPI, cc *api.ClusterConfig, hostconfig *api.HostConfig) error {
diff --git a/pkg/utils/nodemanager/node.go b/pkg/utils/nodemanager/node.go
index d610e3b..a19d2e7 100644
--- a/pkg/utils/nodemanager/node.go
+++ b/pkg/utils/nodemanager/node.go
@@ -117,7 +117,7 @@ func (n *Node) updateNodeStatus(message string, status int) {
 
 func (n *Node) PushTask(t task.Task) bool {
 	// only run ignore error tasks to cleanup node
-	if n.status.HasError() && task.IsIgnoreError(t) {
+	if n.status.HasError() && !task.IsIgnoreError(t) {
 		logrus.Debugf("node finished with error: %v", n.status.Message)
 		return false
 	}
diff --git a/pkg/utils/nodemanager/nodemanager.go b/pkg/utils/nodemanager/nodemanager.go
index b135890..25f7d4e 100644
--- a/pkg/utils/nodemanager/nodemanager.go
+++ b/pkg/utils/nodemanager/nodemanager.go
@@ -37,6 +37,28 @@ var manager = &NodeManager{
 	nodes: make(map[string]*Node, 2),
 }
 
+// CheckNodesStatus returns the HostConfigs of failed nodes and the IDs of successful ones
+func CheckNodesStatus(checkNodes []string) ([]*api.HostConfig, []string) {
+	var failures []*api.HostConfig
+	var success []string
+	manager.lock.RLock()
+	defer manager.lock.RUnlock()
+	for _, cn := range checkNodes {
+		n, ok := manager.nodes[cn]
+		if !ok {
+			logrus.Warnf("skip status check of unregistered node: %s", cn)
+			continue
+		}
+		if n.GetStatus().HasError() {
+			failures = append(failures, n.host)
+		} else {
+			success = append(success, cn)
+		}
+	}
+
+	return failures, success
+}
+
 func RegisterNode(hcf *api.HostConfig, r runner.Runner) error {
 	manager.lock.Lock()
 	defer manager.lock.Unlock()
@@ -227,7 +249,10 @@ outfor:
 		}
 		logrus.Infof("Tasks progress: %s", sb.String())
 		unfinishedNodes = nextUnfinished
-		time.Sleep(time.Second)
+
+		// scale the polling interval with the number of nodes still waiting
+		st := len(unfinishedNodes) + 1
+		time.Sleep(time.Second * time.Duration(st))
 	}
 }
 
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
index 72f8777..84ec0ea 100644
--- a/pkg/utils/utils.go
+++ b/pkg/utils/utils.go
@@ -54,6 +54,10 @@ func IsType(curType uint16, expectedType uint16) bool {
 	return (curType & expectedType) == expectedType
 }
 
+func ClearType(curType uint16, clearType uint16) uint16 {
+	return (curType & ^clearType)
+}
+
 func AddSudo(cmd string) string {
 	return "sudo -E /bin/sh -c \"" + cmd + "\""
 }
-- 
2.25.1
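
A note for reviewers on the role bitmask this patch touches: IsType() returns true only when every bit of expectedType is set in curType, so the old check utils.IsType(roles, (api.Master & api.Worker)) compared roles against zero (the two flags share no bits) and matched every node, which is why pure workers were tainted and labeled too. The corrected mask api.Master | api.Worker requires both roles. Below is a minimal, self-contained sketch of that fix and of the new ClearType helper; the flag values are illustrative assumptions, not the constants defined in pkg/api.

package main

import "fmt"

// Illustrative role flags; eggo defines its own values in pkg/api.
const (
	Master uint16 = 1 << iota
	Worker
	ETCD
	LoadBalance
)

// IsType mirrors pkg/utils.IsType: true when every bit of expectedType is set.
func IsType(curType uint16, expectedType uint16) bool {
	return (curType & expectedType) == expectedType
}

// ClearType mirrors the helper added by this patch: drop the clearType bits.
func ClearType(curType uint16, clearType uint16) uint16 {
	return (curType & ^clearType)
}

func main() {
	both := Master | Worker

	// Old check: Master & Worker == 0, and IsType(x, 0) is always true,
	// so a pure worker passed the "master and worker" test as well.
	fmt.Println(IsType(Worker, Master&Worker)) // true (the bug)

	// Fixed check: both role bits must be present.
	fmt.Println(IsType(both, Master|Worker))   // true
	fmt.Println(IsType(Worker, Master|Worker)) // false

	// ClearType is what lets doCreateCluster re-join the first master as a
	// worker only, mutating its deep copy instead of the shared config.
	copied := ClearType(both, Master)
	fmt.Println(IsType(copied, Worker), IsType(copied, Master)) // true false
}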