diff --git a/git-commit b/git-commit index cfa9df3..570f566 100644 --- a/git-commit +++ b/git-commit @@ -1 +1 @@ -d35711ce9c5492f1455036424a306c2a5b2d3735 +18c0c57644a490630eb21eec622d493d3e1e6164 diff --git a/patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch b/patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch new file mode 100644 index 0000000..5fe0dc3 --- /dev/null +++ b/patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch @@ -0,0 +1,132 @@ +From 0c1d18dcdfdd29a940b306240c49bef91bd51316 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Fri, 7 Jul 2023 22:45:44 +1000 +Subject: [PATCH] nsenter: cloned_binary: remove bindfd logic entirely + +While the ro-bind-mount trick did eliminate the memory overhead of +copying the runc binary for each "runc init" invocation, on machines +with very significant container churn, creating a temporary mount +namespace on every container invocation can trigger severe lock +contention on namespace_sem that makes containers fail to spawn. + +The only reason we added bindfd in commit 16612d74de5f ("nsenter: +cloned_binary: try to ro-bind /proc/self/exe before copying") was due to +a Kubernetes e2e test failure where they had a ridiculously small memory +limit. It seems incredibly unlikely that real workloads are running +without 10MB to spare for the very short time that runc is interacting +with the container. + +In addition, since the original cloned_binary implementation, cgroupv2 +is now almost universally used on modern systems. Unlike cgroupv1, the +cgroupv2 memcg implementation does not migrate memory usage when +processes change cgroups (even cgroupv1 only did this if you had +memory.move_charge_at_immigrate enabled). In addition, because we do the +/proc/self/exe clone before synchronising the bootstrap data read, we +are guaranteed to do the clone before "runc init" is moved into the +container cgroup -- meaning that the memory used by the /proc/self/exe +clone is charged against the root cgroup, and thus container workloads +should not be affected at all with memfd cloning. + +The long-term fix for this problem is to block the /proc/self/exe +re-opening attack entirely in-kernel, which is something I'm working +on[1]. Though it should also be noted that because the memfd is +completely separate to the host binary, even attacks like Dirty COW +against the runc binary can be defended against with the memfd approach. +Of course, once we have in-kernel protection against the /proc/self/exe +re-opening attack, we won't have that protection anymore... + +[1]: https://lwn.net/Articles/934460/ + +Signed-off-by: Aleksa Sarai +--- + libcontainer/nsenter/cloned_binary.c | 67 ---------------------------- + 1 file changed, 67 deletions(-) + +diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c +index 4268ebd..8497375 100644 +--- a/libcontainer/nsenter/cloned_binary.c ++++ b/libcontainer/nsenter/cloned_binary.c +@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype) + return -1; + } + +-static int try_bindfd(void) +-{ +- int fd, ret = -1; +- char template[PATH_MAX] = { 0 }; +- char *prefix = getenv("_LIBCONTAINER_STATEDIR"); +- +- if (!prefix || *prefix != '/') +- prefix = "/tmp"; +- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) +- return ret; +- +- /* +- * We need somewhere to mount it, mounting anything over /proc/self is a +- * BAD idea on the host -- even if we do it temporarily. +- */ +- fd = mkstemp(template); +- if (fd < 0) +- return ret; +- close(fd); +- +- /* +- * For obvious reasons this won't work in rootless mode because we haven't +- * created a userns+mntns -- but getting that to work will be a bit +- * complicated and it's only worth doing if someone actually needs it. +- */ +- ret = -EPERM; +- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) +- goto out; +- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) +- goto out_umount; +- +- /* Get read-only handle that we're sure can't be made read-write. */ +- ret = open(template, O_PATH | O_CLOEXEC); +- +-out_umount: +- /* +- * Make sure the MNT_DETACH works, otherwise we could get remounted +- * read-write and that would be quite bad (the fd would be made read-write +- * too, invalidating the protection). +- */ +- if (umount2(template, MNT_DETACH) < 0) { +- if (ret >= 0) +- close(ret); +- ret = -ENOTRECOVERABLE; +- } +- +-out: +- /* +- * We don't care about unlink errors, the worst that happens is that +- * there's an empty file left around in STATEDIR. +- */ +- unlink(template); +- return ret; +-} +- + static ssize_t fd_to_fd(int outfd, int infd) + { + ssize_t total = 0; +@@ -485,18 +430,6 @@ static int clone_binary(void) + size_t sent = 0; + int fdtype = EFD_NONE; + +- /* +- * Before we resort to copying, let's try creating an ro-binfd in one shot +- * by getting a handle for a read-only bind-mount of the execfd. +- */ +- execfd = try_bindfd(); +- if (execfd >= 0) +- return execfd; +- +- /* +- * Dammit, that didn't work -- time to copy the binary to a safe place we +- * can seal the contents. +- */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; +-- +2.33.0 + diff --git a/runc.spec b/runc.spec index 52068dc..7bfa9a4 100644 --- a/runc.spec +++ b/runc.spec @@ -3,7 +3,7 @@ Name: docker-runc Version: 1.1.8 -Release: 3 +Release: 4 Summary: runc is a CLI tool for spawning and running containers according to the OCI specification. License: ASL 2.0 @@ -54,6 +54,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc %{_bindir}/runc %changelog +* Tue Sep 12 2023 zhongjiawei - 1.1.8-4 +- Type:bugfix +- ID:NA +- SUG:NA +- DESC:remove bindfd logic entirely + * Wed Sep 6 2023 zhongjiawei - 1.1.8-3 - Type:bugfix - ID:NA diff --git a/series.conf b/series.conf index 45f3047..4d599e2 100644 --- a/series.conf +++ b/series.conf @@ -24,3 +24,4 @@ patch/0024-runc-modify-linuxcontainer-starttime-uint64-type-tob.patch patch/0025-runc-make-runc-spec-compatible-1.0.0.rc3.patch patch/0026-runc-Fixed-init-state-error-variable.patch patch/0027-runc-libct-fix-shared-pidns-detection.patch +patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch