containerd/patch/0054-containerd-clean-up-residual-container.patch

101 lines
3.8 KiB
Diff
Raw Normal View History

From a2310cbcff07f660b8d17584f687561b64bf27ad Mon Sep 17 00:00:00 2001
From: zhangtianyang <zhangtianyang2@huawei.com>
Date: Thu, 27 Feb 2020 16:51:59 +0800
Subject: [PATCH] containerd: clean up residual container after
shim abnormal exit
reason: during update/revert testing, an occasional failure was found in which
the shim process has exited but the container is still running; all subsequent
exec calls then report a ttrpc close error.
The triggering condition is uncertain. This patch performs the missing cleanup
of the residual container after such a failure has occurred, to avoid errors in
subsequent calls.
Change-Id: I0da9d4e46010cbe58f2fda21895caeb301936c47
Signed-off-by: zhangtianyang <zhangtianyang2@huawei.com>
---
runtime/v1/linux/runtime.go | 11 +++++++++++
services/tasks/local.go | 25 +++++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/runtime/v1/linux/runtime.go b/runtime/v1/linux/runtime.go
index 96ad815..47a0cb6 100644
--- a/runtime/v1/linux/runtime.go
+++ b/runtime/v1/linux/runtime.go
@@ -511,6 +511,17 @@ func (r *Runtime) cleanupAfterDeadShim(ctx context.Context, bundle *bundle, ns,
return nil
}
+// CleanupAfterDeadShim rebuilds the bundle of container id in namespace ns, reads the
+// init pid from the bundle's pid file and hands both to cleanupAfterDeadShim.
+func (r *Runtime) CleanupAfterDeadShim(ctx context.Context, ns, id string) error {
+	bund := &bundle{id: id, path: filepath.Join(r.state, ns, id), workDir: filepath.Join(r.root, ns, id)}
+	pid, err := runc.ReadPidFile(filepath.Join(bund.path, proc.InitPidFile))
+	if err != nil {
+		return fmt.Errorf("failed to read pid from %s: %v", proc.InitPidFile, err)
+	}
+	return r.cleanupAfterDeadShim(ctx, bund, ns, id, pid)
+}
+
func (r *Runtime) terminate(ctx context.Context, bundle *bundle, ns, id string) error {
rt, err := r.getRuntime(ctx, ns, id)
if err != nil {
diff --git a/services/tasks/local.go b/services/tasks/local.go
index 990e841..9818971 100644
--- a/services/tasks/local.go
+++ b/services/tasks/local.go
@@ -24,6 +24,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "strings"
"time"
api "github.com/containerd/containerd/api/services/tasks/v1"
@@ -41,6 +42,7 @@ import (
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/plugin"
"github.com/containerd/containerd/runtime"
+ "github.com/containerd/containerd/runtime/v1/linux"
"github.com/containerd/containerd/runtime/v2"
"github.com/containerd/containerd/services"
"github.com/containerd/typeurl"
@@ -383,11 +385,34 @@ func (l *local) Kill(ctx context.Context, r *api.KillRequest, _ ...grpc.CallOpti
}
}
if err := p.Kill(ctx, r.Signal, r.All); err != nil {
+ if (r.Signal == 9 || r.Signal == 15) && strings.Contains(err.Error(), "ttrpc: client shutting down") {
+			// It is unclear which conditions trigger this ttrpc error. Since it has already
+			// happened, run the cleanup here so that no residual container is left behind.
+ cleanErr := l.cleanupResidualContainer(ctx, r, t.Namespace())
+ log.G(ctx).WithField("clean error", cleanErr).Warnf(
+ "previous actions might encounter failure, try clean up the dead container.")
+ }
return nil, errdefs.ToGRPC(err)
}
return empty, nil
}
+// cleanupResidualContainer runs the linux runtime's dead-shim cleanup for the container named in r.
+func (l *local) cleanupResidualContainer(ctx context.Context, r *api.KillRequest, namespace string) error {
+	ctr, err := l.getContainer(ctx, r.ContainerID)
+	if err != nil {
+		return fmt.Errorf("failed to get container %s, %v", r.ContainerID, err)
+	}
+	engine, err := l.getRuntime(ctr.Runtime.Name)
+	if err != nil {
+		return fmt.Errorf("failed to get runtime %s, %v", ctr.Runtime.Name, err)
+	}
+	if v1, ok := engine.(*linux.Runtime); ok {
+		return v1.CleanupAfterDeadShim(ctx, namespace, r.ContainerID)
+	}
+	return fmt.Errorf("no clean work for runtime other than linux ones")
+}
+
func (l *local) ListPids(ctx context.Context, r *api.ListPidsRequest, _ ...grpc.CallOption) (*api.ListPidsResponse, error) {
t, err := l.getTask(ctx, r.ContainerID)
if err != nil {
--
1.8.3.1