runc/patch/0037-runc-fix-CVE-2024-21626.patch
2024-02-22 19:51:33 +08:00

646 lines
25 KiB
Diff

From 60c06a2748ba1ac4784cd4022645f36db1dd254f Mon Sep 17 00:00:00 2001
From: zhongjiawei <zhongjiawei1@huawei.com>
Date: Thu, 22 Feb 2024 19:44:07 +0800
Subject: [PATCH] runc:fix CVE-2024-21626
---
libcontainer/cgroups/file.go | 35 ++--
libcontainer/container_linux.go | 9 +
libcontainer/init_linux.go | 31 ++++
libcontainer/integration/seccomp_test.go | 20 +--
libcontainer/setns_init_linux.go | 18 ++
libcontainer/standard_init_linux.go | 19 ++
libcontainer/utils/utils.go | 36 ----
libcontainer/utils/utils_unix.go | 216 +++++++++++++++++++++--
8 files changed, 311 insertions(+), 73 deletions(-)
diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go
index bc7f0a3..b78817c 100644
--- a/libcontainer/cgroups/file.go
+++ b/libcontainer/cgroups/file.go
@@ -10,6 +10,7 @@ import (
"strings"
"sync"
+ "github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -81,16 +82,16 @@ var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
- cgroupFd int = -1
- prepOnce sync.Once
- prepErr error
- resolveFlags uint64
+ cgroupRootHandle *os.File
+ prepOnce sync.Once
+ prepErr error
+ resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
- Flags: unix.O_DIRECTORY | unix.O_PATH,
+ Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
@@ -101,15 +102,16 @@ func prepareOpenat2() error {
}
return
}
+ file := os.NewFile(uintptr(fd), cgroupfsDir)
+
var st unix.Statfs_t
- if err = unix.Fstatfs(fd, &st); err != nil {
+ if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
- cgroupFd = fd
-
+ cgroupRootHandle = file
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@@ -136,7 +138,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
return openFallback(path, flags, mode)
}
- fd, err := unix.Openat2(cgroupFd, relPath,
+ fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
@@ -144,20 +146,21 @@ func openFile(dir, file string, flags int) (*os.File, error) {
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
- // Check if cgroupFd is still opened to cgroupfsDir
+ // Check if cgroupRootHandle is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
- // to reopen cgroupFd and retry openat2.
- fdStr := strconv.Itoa(cgroupFd)
- fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
+ // to reopen cgroupRootHandle and retry openat2.
+ fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
+ defer closer()
+ fdDest, _ := os.Readlink(fdPath)
if fdDest != cgroupfsDir {
- // Wrap the error so it is clear that cgroupFd
+ // Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.
- err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
- fdStr, fdDest, cgroupfsDir, err)
+ err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
+ cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
}
return nil, err
}
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 4f9433b..5086d50 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -356,6 +356,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
}()
}
+ // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
+ // to make sure we don't leak any files into "runc init". Any files to be
+ // passed to "runc init" through ExtraFiles will get dup2'd by the Go
+ // runtime and thus their O_CLOEXEC flag will be cleared. This is some
+ // additional protection against attacks like CVE-2024-21626, by making
+ // sure we never leak files to "runc init" we didn't intend to.
+ if err := utils.CloseExecFrom(3); err != nil {
+ return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
+ }
if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 5b88c71..d9f1813 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -8,6 +8,7 @@ import (
"io"
"net"
"os"
+ "path/filepath"
"strings"
"unsafe"
@@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error {
return nil
}
+// verifyCwd ensures that the current directory is actually inside the mount
+// namespace root of the current process.
+func verifyCwd() error {
+ // getcwd(2) on Linux detects if cwd is outside of the rootfs of the
+ // current mount namespace root, and in that case prefixes "(unreachable)"
+ // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
+ // when this happens and return ENOENT rather than returning a non-absolute
+ // path. In both cases we can therefore easily detect if we have an invalid
+ // cwd by checking the return value of getcwd(3). See getcwd(3) for more
+ // details, and CVE-2024-21626 for the security issue that motivated this
+ // check.
+ //
+ // We have to use unix.Getwd() here because os.Getwd() has a workaround for
+ // $PWD which involves doing stat(.), which can fail if the current
+ // directory is inaccessible to the container process.
+ if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
+ return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
+ } else if err != nil {
+ return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
+ } else if !filepath.IsAbs(wd) {
+ // We shouldn't ever hit this, but check just in case.
+ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
+ }
+ return nil
+}
+
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
@@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
+ // Make sure our final working directory is inside the container.
+ if err := verifyCwd(); err != nil {
+ return err
+ }
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go
index 31092a0..ecdfa79 100644
--- a/libcontainer/integration/seccomp_test.go
+++ b/libcontainer/integration/seccomp_test.go
@@ -13,7 +13,7 @@ import (
libseccomp "github.com/seccomp/libseccomp-golang"
)
-func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
+func TestSeccompDenySyslogWithErrno(t *testing.T) {
if testing.Short() {
return
}
@@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
DefaultAction: configs.Allow,
Syscalls: []*configs.Syscall{
{
- Name: "getcwd",
+ Name: "syslog",
Action: configs.Errno,
ErrnoRet: &errnoRet,
},
@@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
buffers := newStdBuffers()
pwd := &libcontainer.Process{
Cwd: "/",
- Args: []string{"pwd"},
+ Args: []string{"dmesg"},
Env: standardEnvironment,
Stdin: buffers.Stdin,
Stdout: buffers.Stdout,
@@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
}
if exitCode == 0 {
- t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+ t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
}
- expected := "pwd: getcwd: No such process"
+ expected := "dmesg: klogctl: No such process"
actual := strings.Trim(buffers.Stderr.String(), "\n")
if actual != expected {
t.Fatalf("Expected output %s but got %s\n", expected, actual)
}
}
-func TestSeccompDenyGetcwd(t *testing.T) {
+func TestSeccompDenySyslog(t *testing.T) {
if testing.Short() {
return
}
@@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
DefaultAction: configs.Allow,
Syscalls: []*configs.Syscall{
{
- Name: "getcwd",
+ Name: "syslog",
Action: configs.Errno,
},
},
@@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
buffers := newStdBuffers()
pwd := &libcontainer.Process{
Cwd: "/",
- Args: []string{"pwd"},
+ Args: []string{"dmesg"},
Env: standardEnvironment,
Stdin: buffers.Stdin,
Stdout: buffers.Stdout,
@@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
}
if exitCode == 0 {
- t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+ t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
}
- expected := "pwd: getcwd: Operation not permitted"
+ expected := "dmesg: klogctl: Operation not permitted"
actual := strings.Trim(buffers.Stderr.String(), "\n")
if actual != expected {
t.Fatalf("Expected output %s but got %s\n", expected, actual)
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index e9b8d62..1eea851 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -108,5 +108,23 @@ func (l *linuxSetnsInit) Init() error {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
+ // Close all file descriptors we are not passing to the container. This is
+ // necessary because the execve target could use internal runc fds as the
+ // execve path, potentially giving access to binary files from the host
+ // (which can then be opened by container processes, leading to container
+ // escapes). Note that because this operation will close any open file
+ // descriptors that are referenced by (*os.File) handles from underneath
+ // the Go runtime, we must not do any file operations after this point
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
+ // CVE-2024-21626 for more information as to why this protection is
+ // necessary.
+ //
+ // This is not needed for runc-dmz, because the extra execve(2) step means
+ // that all O_CLOEXEC file descriptors have already been closed and thus
+ // the second execve(2) from runc-dmz cannot access internal file
+ // descriptors from runc.
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+ return err
+ }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index cd962c8..8f595a4 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -20,6 +20,7 @@ import (
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
)
type linuxStandardInit struct {
@@ -284,5 +285,23 @@ func (l *linuxStandardInit) Init() error {
return err
}
+ // Close all file descriptors we are not passing to the container. This is
+ // necessary because the execve target could use internal runc fds as the
+ // execve path, potentially giving access to binary files from the host
+ // (which can then be opened by container processes, leading to container
+ // escapes). Note that because this operation will close any open file
+ // descriptors that are referenced by (*os.File) handles from underneath
+ // the Go runtime, we must not do any file operations after this point
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
+ // CVE-2024-21626 for more information as to why this protection is
+ // necessary.
+ //
+ // This is not needed for runc-dmz, because the extra execve(2) step means
+ // that all O_CLOEXEC file descriptors have already been closed and thus
+ // the second execve(2) from runc-dmz cannot access internal file
+ // descriptors from runc.
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+ return err
+ }
return system.Exec(name, l.config.Args[0:], os.Environ())
}
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
index 6b9fc34..dd96abe 100644
--- a/libcontainer/utils/utils.go
+++ b/libcontainer/utils/utils.go
@@ -3,15 +3,12 @@ package utils
import (
"encoding/binary"
"encoding/json"
- "fmt"
"io"
"os"
"path/filepath"
- "strconv"
"strings"
"unsafe"
- securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
)
@@ -99,39 +96,6 @@ func stripRoot(root, path string) string {
return CleanPath("/" + path)
}
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
-// corresponding to the unsafePath resolved within the root. Before passing the
-// fd, this path is verified to have been inside the root -- so operating on it
-// through the passed fdpath should be safe. Do not access this path through
-// the original path strings, and do not attempt to use the pathname outside of
-// the passed closure (the file handle will be freed once the closure returns).
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
- // Remove the root then forcefully resolve inside the root.
- unsafePath = stripRoot(root, unsafePath)
- path, err := securejoin.SecureJoin(root, unsafePath)
- if err != nil {
- return fmt.Errorf("resolving path inside rootfs failed: %w", err)
- }
-
- // Open the target path.
- fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
- if err != nil {
- return fmt.Errorf("open o_path procfd: %w", err)
- }
- defer fh.Close()
-
- // Double-check the path is the one we expected.
- procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
- if realpath, err := os.Readlink(procfd); err != nil {
- return fmt.Errorf("procfd verification failed: %w", err)
- } else if realpath != path {
- return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
- }
-
- // Run the closure.
- return fn(procfd)
-}
-
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index 220d0b4..f57f087 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -5,9 +5,16 @@ package utils
import (
"fmt"
+ "math"
"os"
+ "path/filepath"
+ "runtime"
"strconv"
+ "sync"
+ _ "unsafe" // for go:linkname
+ securejoin "github.com/cyphar/filepath-securejoin"
+ "github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -23,10 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
return nil
}
-// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
-// the process (except for those below the given fd value).
-func CloseExecFrom(minFd int) error {
- fdDir, err := os.Open("/proc/self/fd")
+var (
+ haveCloseRangeCloexecBool bool
+ haveCloseRangeCloexecOnce sync.Once
+)
+
+func haveCloseRangeCloexec() bool {
+ haveCloseRangeCloexecOnce.Do(func() {
+ // Make sure we're not closing a random file descriptor.
+ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+ if err != nil {
+ return
+ }
+ defer unix.Close(tmpFd)
+
+ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+ // other potential error would imply that even the most basic close
+ // operation wouldn't work.
+ haveCloseRangeCloexecBool = err == nil
+ })
+ return haveCloseRangeCloexecBool
+}
+
+type fdFunc func(fd int)
+
+// fdRangeFrom calls the passed fdFunc for each file descriptor >= minFd that is
+// open in the current process, skipping the fd it uses to list the directory.
+func fdRangeFrom(minFd int, fn fdFunc) error {
+ procSelfFd, closer := ProcThreadSelf("fd")
+ defer closer()
+
+ fdDir, err := os.Open(procSelfFd)
if err != nil {
return err
}
@@ -50,20 +86,178 @@ func CloseExecFrom(minFd int) error {
if fd < minFd {
continue
}
- // Intentionally ignore errors from unix.CloseOnExec -- the cases where
- // this might fail are basically file descriptors that have already
- // been closed (including and especially the one that was created when
- // os.ReadDir did the "opendir" syscall).
- unix.CloseOnExec(fd)
+ // Ignore the file descriptor we used for readdir, as it will be closed
+ // when we return.
+ if uintptr(fd) == fdDir.Fd() {
+ continue
+ }
+ // Run the closure.
+ fn(fd)
}
return nil
}
-// NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
+// equal to minFd in the current process.
+func CloseExecFrom(minFd int) error {
+ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
+ if haveCloseRangeCloexec() {
+ err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
+ return os.NewSyscallError("close_range", err)
+ }
+ // Otherwise, fall back to the standard loop.
+ return fdRangeFrom(minFd, unix.CloseOnExec)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
+
+// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
+// current process, except for those critical to Go's runtime (such as the
+// netpoll management descriptors).
+//
+// NOTE: This function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+ // We cannot use close_range(2) even if it is available, because we must
+ // not close some file descriptors.
+ return fdRangeFrom(minFd, func(fd int) {
+ if runtime_IsPollDescriptor(uintptr(fd)) {
+ // These are the Go runtime's internal netpoll file descriptors.
+ // These file descriptors are operated on deep in the Go scheduler,
+ // and closing those files from underneath Go can result in panics.
+ // There is no issue with keeping them because they are not
+ // executable and are not useful to an attacker anyway. Also we
+ // don't have any choice.
+ return
+ }
+ // There's nothing we can do about errors from close(2), and the
+ // only likely error to be seen is EBADF which indicates the fd was
+ // already closed (in which case, we got what we wanted).
+ _ = unix.Close(fd)
+ })
+}
+
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
+func NewSockPair(name string) (parent, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
+
+// WithProcfd runs the passed closure with a procfd path (/proc/thread-self/fd/...
+// or its fallback) for the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+ // Remove the root then forcefully resolve inside the root.
+ unsafePath = stripRoot(root, unsafePath)
+ path, err := securejoin.SecureJoin(root, unsafePath)
+ if err != nil {
+ return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+ }
+
+ procSelfFd, closer := ProcThreadSelf("fd/")
+ defer closer()
+
+ // Open the target path.
+ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return fmt.Errorf("open o_path procfd: %w", err)
+ }
+ defer fh.Close()
+
+ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+ // Double-check the path is the one we expected.
+ if realpath, err := os.Readlink(procfd); err != nil {
+ return fmt.Errorf("procfd verification failed: %w", err)
+ } else if realpath != path {
+ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+ }
+
+ return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+ haveProcThreadSelf bool
+ haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+ haveProcThreadSelfOnce.Do(func() {
+ if _, err := os.Stat("/proc/thread-self/"); err == nil {
+ haveProcThreadSelf = true
+ } else {
+ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+ }
+ })
+
+ // We need to lock our thread until the caller is done with the path string
+ // because any non-atomic operation on the path (such as opening a file,
+ // then reading it) could be interrupted by the Go runtime where the
+ // underlying thread is swapped out and the original thread is killed,
+ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+ // addition, the pre-3.17 fallback makes everything non-atomic because the
+ // same thing could happen between unix.Gettid() and the path operations.
+ //
+ // In theory, we don't need to lock in the atomic user case when using
+ // /proc/thread-self/, but it's better to be safe than sorry (and there are
+ // only one or two truly atomic users of /proc/thread-self/).
+ runtime.LockOSThread()
+
+ threadSelf := "/proc/thread-self/"
+ if !haveProcThreadSelf {
+ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+ if _, err := os.Stat(threadSelf); err != nil {
+ // Unfortunately, this code is called from rootfs_linux.go where we
+ // are running inside the pid namespace of the container but /proc
+ // is the host's procfs. Unfortunately there is no real way to get
+ // the correct tid to use here (the kernel age means we cannot do
+ // things like set up a private fsopen("proc") -- even scanning
+ // NSpid in all of the tasks in /proc/self/task/*/status requires
+ // Linux 4.1).
+ //
+ // So, we just have to assume that /proc/self is acceptable in this
+ // one specific case.
+ if os.Getpid() == 1 {
+ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+ } else {
+ // This should never happen, but the fallback should work in most cases...
+ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+ }
+ threadSelf = "/proc/self/"
+ }
+ }
+ return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to
+// create a /proc/thread-self handle for a given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// without using fmt.Sprintf to avoid unneeded overhead.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
+}
--
2.33.0