From a2310cbcff07f660b8d17584f687561b64bf27ad Mon Sep 17 00:00:00 2001 From: zhangtianyang Date: Thu, 27 Feb 2020 16:51:59 +0800 Subject: [PATCH] containerd: clean up residual container after shim abnormal exit reason: during update/revert testing, an occasional failure was found in which the shim process has exited but the container is still running; all subsequent exec calls then report a ttrpc close error. The triggering condition is unknown. This patch cleans up the residual container after such a failure occurs, to avoid errors in subsequent calls. Change-Id: I0da9d4e46010cbe58f2fda21895caeb301936c47 Signed-off-by: zhangtianyang --- runtime/v1/linux/runtime.go | 11 +++++++++++ services/tasks/local.go | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/runtime/v1/linux/runtime.go b/runtime/v1/linux/runtime.go index 96ad815..47a0cb6 100644 --- a/runtime/v1/linux/runtime.go +++ b/runtime/v1/linux/runtime.go @@ -511,6 +511,17 @@ func (r *Runtime) cleanupAfterDeadShim(ctx context.Context, bundle *bundle, ns, return nil } +func (r *Runtime) CleanupAfterDeadShim(ctx context.Context, ns, id string) error { + bund := &bundle{id: id, + path: filepath.Join(r.state, ns, id), + workDir: filepath.Join(r.root, ns, id)} + pid, err := runc.ReadPidFile(filepath.Join(bund.path, proc.InitPidFile)) + if err != nil { + return fmt.Errorf("failed to read pid from %s", proc.InitPidFile) + } + return r.cleanupAfterDeadShim(ctx, bund, ns, id, pid) +} + func (r *Runtime) terminate(ctx context.Context, bundle *bundle, ns, id string) error { rt, err := r.getRuntime(ctx, ns, id) if err != nil { diff --git a/services/tasks/local.go b/services/tasks/local.go index 990e841..9818971 100644 --- a/services/tasks/local.go +++ b/services/tasks/local.go @@ -24,6 +24,7 @@ import ( "io/ioutil" "os" "path/filepath" + "strings" "time" api "github.com/containerd/containerd/api/services/tasks/v1" @@ -41,6 +42,7 @@ import ( "github.com/containerd/containerd/mount" 
"github.com/containerd/containerd/plugin" "github.com/containerd/containerd/runtime" + "github.com/containerd/containerd/runtime/v1/linux" "github.com/containerd/containerd/runtime/v2" "github.com/containerd/containerd/services" "github.com/containerd/typeurl" @@ -383,11 +385,34 @@ func (l *local) Kill(ctx context.Context, r *api.KillRequest, _ ...grpc.CallOpti } } if err := p.Kill(ctx, r.Signal, r.All); err != nil { + if (r.Signal == 9 || r.Signal == 15) && strings.Contains(err.Error(), "ttrpc: client shutting down") { + // not sure under what conditions will cause such ttrpc error. since the error has + // happened, we have to make up the clean up work to avoid container residue. + cleanErr := l.cleanupResidualContainer(ctx, r, t.Namespace()) + log.G(ctx).WithField("clean error", cleanErr).Warnf( + "previous actions might encounter failure, try clean up the dead container.") + } return nil, errdefs.ToGRPC(err) } return empty, nil } +func (l *local) cleanupResidualContainer(ctx context.Context, r *api.KillRequest, namespace string) error { + container, err := l.getContainer(ctx, r.ContainerID) + if err != nil { + return fmt.Errorf("failed to get container %s, %v", r.ContainerID, err) + } + rt, err := l.getRuntime(container.Runtime.Name) + if err != nil { + return fmt.Errorf("failed to get runtime %s, %v", container.Runtime.Name, err) + } + lRuntime, ok := rt.(*linux.Runtime) + if !ok { + return fmt.Errorf("no clean work for runtime other than linux ones") + } + return lRuntime.CleanupAfterDeadShim(ctx, namespace, r.ContainerID) +} + func (l *local) ListPids(ctx context.Context, r *api.ListPidsRequest, _ ...grpc.CallOption) (*api.ListPidsResponse, error) { t, err := l.getTask(ctx, r.ContainerID) if err != nil { -- 1.8.3.1