runc:remove bindfd logic entirely
This commit is contained in:
parent
81f9a87370
commit
8b9d61acfb
@ -1 +1 @@
|
||||
d35711ce9c5492f1455036424a306c2a5b2d3735
|
||||
18c0c57644a490630eb21eec622d493d3e1e6164
|
||||
|
||||
@ -0,0 +1,132 @@
|
||||
From 0c1d18dcdfdd29a940b306240c49bef91bd51316 Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Fri, 7 Jul 2023 22:45:44 +1000
|
||||
Subject: [PATCH] nsenter: cloned_binary: remove bindfd logic entirely
|
||||
|
||||
While the ro-bind-mount trick did eliminate the memory overhead of
|
||||
copying the runc binary for each "runc init" invocation, on machines
|
||||
with very significant container churn, creating a temporary mount
|
||||
namespace on every container invocation can trigger severe lock
|
||||
contention on namespace_sem that makes containers fail to spawn.
|
||||
|
||||
The only reason we added bindfd in commit 16612d74de5f ("nsenter:
|
||||
cloned_binary: try to ro-bind /proc/self/exe before copying") was due to
|
||||
a Kubernetes e2e test failure where they had a ridiculously small memory
|
||||
limit. It seems incredibly unlikely that real workloads are running
|
||||
without 10MB to spare for the very short time that runc is interacting
|
||||
with the container.
|
||||
|
||||
In addition, since the original cloned_binary implementation, cgroupv2
|
||||
is now almost universally used on modern systems. Unlike cgroupv1, the
|
||||
cgroupv2 memcg implementation does not migrate memory usage when
|
||||
processes change cgroups (even cgroupv1 only did this if you had
|
||||
memory.move_charge_at_immigrate enabled). In addition, because we do the
|
||||
/proc/self/exe clone before synchronising the bootstrap data read, we
|
||||
are guaranteed to do the clone before "runc init" is moved into the
|
||||
container cgroup -- meaning that the memory used by the /proc/self/exe
|
||||
clone is charged against the root cgroup, and thus container workloads
|
||||
should not be affected at all with memfd cloning.
|
||||
|
||||
The long-term fix for this problem is to block the /proc/self/exe
|
||||
re-opening attack entirely in-kernel, which is something I'm working
|
||||
on[1]. Though it should also be noted that because the memfd is
|
||||
completely separate to the host binary, even attacks like Dirty COW
|
||||
against the runc binary can be defended against with the memfd approach.
|
||||
Of course, once we have in-kernel protection against the /proc/self/exe
|
||||
re-opening attack, we won't have that protection anymore...
|
||||
|
||||
[1]: https://lwn.net/Articles/934460/
|
||||
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
---
|
||||
libcontainer/nsenter/cloned_binary.c | 67 ----------------------------
|
||||
1 file changed, 67 deletions(-)
|
||||
|
||||
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
|
||||
index 4268ebd..8497375 100644
|
||||
--- a/libcontainer/nsenter/cloned_binary.c
|
||||
+++ b/libcontainer/nsenter/cloned_binary.c
|
||||
@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype)
|
||||
return -1;
|
||||
}
|
||||
|
||||
-static int try_bindfd(void)
|
||||
-{
|
||||
- int fd, ret = -1;
|
||||
- char template[PATH_MAX] = { 0 };
|
||||
- char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||
-
|
||||
- if (!prefix || *prefix != '/')
|
||||
- prefix = "/tmp";
|
||||
- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
||||
- return ret;
|
||||
-
|
||||
- /*
|
||||
- * We need somewhere to mount it, mounting anything over /proc/self is a
|
||||
- * BAD idea on the host -- even if we do it temporarily.
|
||||
- */
|
||||
- fd = mkstemp(template);
|
||||
- if (fd < 0)
|
||||
- return ret;
|
||||
- close(fd);
|
||||
-
|
||||
- /*
|
||||
- * For obvious reasons this won't work in rootless mode because we haven't
|
||||
- * created a userns+mntns -- but getting that to work will be a bit
|
||||
- * complicated and it's only worth doing if someone actually needs it.
|
||||
- */
|
||||
- ret = -EPERM;
|
||||
- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
|
||||
- goto out;
|
||||
- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
|
||||
- goto out_umount;
|
||||
-
|
||||
- /* Get read-only handle that we're sure can't be made read-write. */
|
||||
- ret = open(template, O_PATH | O_CLOEXEC);
|
||||
-
|
||||
-out_umount:
|
||||
- /*
|
||||
- * Make sure the MNT_DETACH works, otherwise we could get remounted
|
||||
- * read-write and that would be quite bad (the fd would be made read-write
|
||||
- * too, invalidating the protection).
|
||||
- */
|
||||
- if (umount2(template, MNT_DETACH) < 0) {
|
||||
- if (ret >= 0)
|
||||
- close(ret);
|
||||
- ret = -ENOTRECOVERABLE;
|
||||
- }
|
||||
-
|
||||
-out:
|
||||
- /*
|
||||
- * We don't care about unlink errors, the worst that happens is that
|
||||
- * there's an empty file left around in STATEDIR.
|
||||
- */
|
||||
- unlink(template);
|
||||
- return ret;
|
||||
-}
|
||||
-
|
||||
static ssize_t fd_to_fd(int outfd, int infd)
|
||||
{
|
||||
ssize_t total = 0;
|
||||
@@ -485,18 +430,6 @@ static int clone_binary(void)
|
||||
size_t sent = 0;
|
||||
int fdtype = EFD_NONE;
|
||||
|
||||
- /*
|
||||
- * Before we resort to copying, let's try creating an ro-binfd in one shot
|
||||
- * by getting a handle for a read-only bind-mount of the execfd.
|
||||
- */
|
||||
- execfd = try_bindfd();
|
||||
- if (execfd >= 0)
|
||||
- return execfd;
|
||||
-
|
||||
- /*
|
||||
- * Dammit, that didn't work -- time to copy the binary to a safe place we
|
||||
- * can seal the contents.
|
||||
- */
|
||||
execfd = make_execfd(&fdtype);
|
||||
if (execfd < 0 || fdtype == EFD_NONE)
|
||||
return -ENOTRECOVERABLE;
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
Name: docker-runc
|
||||
Version: 1.1.8
|
||||
Release: 3
|
||||
Release: 4
|
||||
Summary: runc is a CLI tool for spawning and running containers according to the OCI specification.
|
||||
|
||||
License: ASL 2.0
|
||||
@ -54,6 +54,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc
|
||||
%{_bindir}/runc
|
||||
|
||||
%changelog
|
||||
* Tue Sep 12 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.1.8-4
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:remove bindfd logic entirely
|
||||
|
||||
* Wed Sep 6 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.1.8-3
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
|
||||
@ -24,3 +24,4 @@ patch/0024-runc-modify-linuxcontainer-starttime-uint64-type-tob.patch
|
||||
patch/0025-runc-make-runc-spec-compatible-1.0.0.rc3.patch
|
||||
patch/0026-runc-Fixed-init-state-error-variable.patch
|
||||
patch/0027-runc-libct-fix-shared-pidns-detection.patch
|
||||
patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user