!165 runc:remove bindfd logic entirely
From: @zhong-jiawei-1 Reviewed-by: @zhangsong234, @duguhaotian Signed-off-by: @duguhaotian
This commit is contained in:
commit
3ccd284b74
@ -1 +1 @@
|
|||||||
d35711ce9c5492f1455036424a306c2a5b2d3735
|
18c0c57644a490630eb21eec622d493d3e1e6164
|
||||||
|
|||||||
@ -0,0 +1,132 @@
|
|||||||
|
From 0c1d18dcdfdd29a940b306240c49bef91bd51316 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||||
|
Date: Fri, 7 Jul 2023 22:45:44 +1000
|
||||||
|
Subject: [PATCH] nsenter: cloned_binary: remove bindfd logic entirely
|
||||||
|
|
||||||
|
While the ro-bind-mount trick did eliminate the memory overhead of
|
||||||
|
copying the runc binary for each "runc init" invocation, on machines
|
||||||
|
with very significant container churn, creating a temporary mount
|
||||||
|
namespace on every container invocation can trigger severe lock
|
||||||
|
contention on namespace_sem that makes containers fail to spawn.
|
||||||
|
|
||||||
|
The only reason we added bindfd in commit 16612d74de5f ("nsenter:
|
||||||
|
cloned_binary: try to ro-bind /proc/self/exe before copying") was due to
|
||||||
|
a Kubernetes e2e test failure where they had a ridiculously small memory
|
||||||
|
limit. It seems incredibly unlikely that real workloads are running
|
||||||
|
without 10MB to spare for the very short time that runc is interacting
|
||||||
|
with the container.
|
||||||
|
|
||||||
|
In addition, since the original cloned_binary implementation, cgroupv2
|
||||||
|
is now almost universally used on modern systems. Unlike cgroupv1, the
|
||||||
|
cgroupv2 memcg implementation does not migrate memory usage when
|
||||||
|
processes change cgroups (even cgroupv1 only did this if you had
|
||||||
|
memory.move_charge_at_immigrate enabled). In addition, because we do the
|
||||||
|
/proc/self/exe clone before synchronising the bootstrap data read, we
|
||||||
|
are guaranteed to do the clone before "runc init" is moved into the
|
||||||
|
container cgroup -- meaning that the memory used by the /proc/self/exe
|
||||||
|
clone is charged against the root cgroup, and thus container workloads
|
||||||
|
should not be affected at all with memfd cloning.
|
||||||
|
|
||||||
|
The long-term fix for this problem is to block the /proc/self/exe
|
||||||
|
re-opening attack entirely in-kernel, which is something I'm working
|
||||||
|
on[1]. Though it should also be noted that because the memfd is
|
||||||
|
completely separate to the host binary, even attacks like Dirty COW
|
||||||
|
against the runc binary can be defended against with the memfd approach.
|
||||||
|
Of course, once we have in-kernel protection against the /proc/self/exe
|
||||||
|
re-opening attack, we won't have that protection anymore...
|
||||||
|
|
||||||
|
[1]: https://lwn.net/Articles/934460/
|
||||||
|
|
||||||
|
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||||
|
---
|
||||||
|
libcontainer/nsenter/cloned_binary.c | 67 ----------------------------
|
||||||
|
1 file changed, 67 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
|
||||||
|
index 4268ebd..8497375 100644
|
||||||
|
--- a/libcontainer/nsenter/cloned_binary.c
|
||||||
|
+++ b/libcontainer/nsenter/cloned_binary.c
|
||||||
|
@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype)
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
-static int try_bindfd(void)
|
||||||
|
-{
|
||||||
|
- int fd, ret = -1;
|
||||||
|
- char template[PATH_MAX] = { 0 };
|
||||||
|
- char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||||
|
-
|
||||||
|
- if (!prefix || *prefix != '/')
|
||||||
|
- prefix = "/tmp";
|
||||||
|
- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
||||||
|
- return ret;
|
||||||
|
-
|
||||||
|
- /*
|
||||||
|
- * We need somewhere to mount it, mounting anything over /proc/self is a
|
||||||
|
- * BAD idea on the host -- even if we do it temporarily.
|
||||||
|
- */
|
||||||
|
- fd = mkstemp(template);
|
||||||
|
- if (fd < 0)
|
||||||
|
- return ret;
|
||||||
|
- close(fd);
|
||||||
|
-
|
||||||
|
- /*
|
||||||
|
- * For obvious reasons this won't work in rootless mode because we haven't
|
||||||
|
- * created a userns+mntns -- but getting that to work will be a bit
|
||||||
|
- * complicated and it's only worth doing if someone actually needs it.
|
||||||
|
- */
|
||||||
|
- ret = -EPERM;
|
||||||
|
- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
|
||||||
|
- goto out;
|
||||||
|
- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
|
||||||
|
- goto out_umount;
|
||||||
|
-
|
||||||
|
- /* Get read-only handle that we're sure can't be made read-write. */
|
||||||
|
- ret = open(template, O_PATH | O_CLOEXEC);
|
||||||
|
-
|
||||||
|
-out_umount:
|
||||||
|
- /*
|
||||||
|
- * Make sure the MNT_DETACH works, otherwise we could get remounted
|
||||||
|
- * read-write and that would be quite bad (the fd would be made read-write
|
||||||
|
- * too, invalidating the protection).
|
||||||
|
- */
|
||||||
|
- if (umount2(template, MNT_DETACH) < 0) {
|
||||||
|
- if (ret >= 0)
|
||||||
|
- close(ret);
|
||||||
|
- ret = -ENOTRECOVERABLE;
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
-out:
|
||||||
|
- /*
|
||||||
|
- * We don't care about unlink errors, the worst that happens is that
|
||||||
|
- * there's an empty file left around in STATEDIR.
|
||||||
|
- */
|
||||||
|
- unlink(template);
|
||||||
|
- return ret;
|
||||||
|
-}
|
||||||
|
-
|
||||||
|
static ssize_t fd_to_fd(int outfd, int infd)
|
||||||
|
{
|
||||||
|
ssize_t total = 0;
|
||||||
|
@@ -485,18 +430,6 @@ static int clone_binary(void)
|
||||||
|
size_t sent = 0;
|
||||||
|
int fdtype = EFD_NONE;
|
||||||
|
|
||||||
|
- /*
|
||||||
|
- * Before we resort to copying, let's try creating an ro-binfd in one shot
|
||||||
|
- * by getting a handle for a read-only bind-mount of the execfd.
|
||||||
|
- */
|
||||||
|
- execfd = try_bindfd();
|
||||||
|
- if (execfd >= 0)
|
||||||
|
- return execfd;
|
||||||
|
-
|
||||||
|
- /*
|
||||||
|
- * Dammit, that didn't work -- time to copy the binary to a safe place we
|
||||||
|
- * can seal the contents.
|
||||||
|
- */
|
||||||
|
execfd = make_execfd(&fdtype);
|
||||||
|
if (execfd < 0 || fdtype == EFD_NONE)
|
||||||
|
return -ENOTRECOVERABLE;
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
Name: docker-runc
|
Name: docker-runc
|
||||||
Version: 1.1.8
|
Version: 1.1.8
|
||||||
Release: 3
|
Release: 4
|
||||||
Summary: runc is a CLI tool for spawning and running containers according to the OCI specification.
|
Summary: runc is a CLI tool for spawning and running containers according to the OCI specification.
|
||||||
|
|
||||||
License: ASL 2.0
|
License: ASL 2.0
|
||||||
@ -54,6 +54,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc
|
|||||||
%{_bindir}/runc
|
%{_bindir}/runc
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Sep 12 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.1.8-4
|
||||||
|
- Type:bugfix
|
||||||
|
- ID:NA
|
||||||
|
- SUG:NA
|
||||||
|
- DESC:remove bindfd logic entirely
|
||||||
|
|
||||||
* Wed Sep 6 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.1.8-3
|
* Wed Sep 6 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.1.8-3
|
||||||
- Type:bugfix
|
- Type:bugfix
|
||||||
- ID:NA
|
- ID:NA
|
||||||
|
|||||||
@ -24,3 +24,4 @@ patch/0024-runc-modify-linuxcontainer-starttime-uint64-type-tob.patch
|
|||||||
patch/0025-runc-make-runc-spec-compatible-1.0.0.rc3.patch
|
patch/0025-runc-make-runc-spec-compatible-1.0.0.rc3.patch
|
||||||
patch/0026-runc-Fixed-init-state-error-variable.patch
|
patch/0026-runc-Fixed-init-state-error-variable.patch
|
||||||
patch/0027-runc-libct-fix-shared-pidns-detection.patch
|
patch/0027-runc-libct-fix-shared-pidns-detection.patch
|
||||||
|
patch/0028-runc-nsenter-cloned_binary-remove-bindfd-logic-entirely.patch
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user