runc/patch/0005-runc-add-support-for-rootless-containers.patch
openeuler-iSula 5904ba4dcf runc: package init
Signed-off-by: openeuler-iSula <isula@huawei.com>
2019-12-29 15:34:20 +08:00

1491 lines
43 KiB
Diff

From 32d2efc77f61fc7142e72b30e82aca8eeefc7c54 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Sat, 23 Apr 2016 23:39:42 +1000
Subject: [PATCH 05/94] runc: add support for rootless containers
This enables the support for the rootless container mode. There are many
restrictions on what rootless containers can do, so many different runC
commands have been disabled:
* runc checkpoint
* runc events
* runc pause
* runc ps
* runc restore
* runc resume
* runc update
The following commands work:
* runc create
* runc delete
* runc exec
* runc kill
* runc list
* runc run
* runc spec
* runc state
In addition, any specification options that imply joining cgroups have
also been disabled. This is due to support for unprivileged subtree
management not being available from Linux upstream.
Change-Id: I5cfba61e3a3d7491f2b0bc00ccfd51b87684de8a
Signed-off-by: Aleksa Sarai <asarai@suse.de>
---
Makefile | 2 +-
checkpoint.go | 5 +
exec.go | 3 -
libcontainer/configs/config.go | 3 +
libcontainer/configs/validate/rootless.go | 117 +++++++++++++++
libcontainer/configs/validate/rootless_test.go | 195 +++++++++++++++++++++++++
libcontainer/configs/validate/validator.go | 5 +
libcontainer/container_linux.go | 49 +++++--
libcontainer/init_linux.go | 41 +++++-
libcontainer/message_linux.go | 1 +
libcontainer/nsenter/nsexec.c | 26 +++-
libcontainer/process_linux.go | 28 +++-
libcontainer/specconv/example.go | 160 ++++++++++++++++++++
libcontainer/specconv/spec_linux.go | 31 +++-
libcontainer/specconv/spec_linux_test.go | 80 +++++++++-
list.go | 19 ++-
ps.go | 5 +
restore.go | 6 +
spec.go | 150 +------------------
utils.go | 3 -
utils_linux.go | 6 +
21 files changed, 742 insertions(+), 193 deletions(-)
create mode 100644 libcontainer/configs/validate/rootless.go
create mode 100644 libcontainer/configs/validate/rootless_test.go
create mode 100644 libcontainer/specconv/example.go
diff --git a/Makefile b/Makefile
index b82884a..5fff515 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
PREFIX := $(DESTDIR)/usr/local
-BINDIR := $(PREFIX)/sbin
+BINDIR := $(PREFIX)/bin
GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
diff --git a/checkpoint.go b/checkpoint.go
index dd7704f..78977d7 100644
--- a/checkpoint.go
+++ b/checkpoint.go
@@ -39,6 +39,11 @@ checkpointed.`,
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
}
+ // XXX: Currently this is untested with rootless containers.
+ if isRootless() {
+ return fmt.Errorf("runc checkpoint requires root")
+ }
+
container, err := getContainer(context)
if err != nil {
return err
diff --git a/exec.go b/exec.go
index 84061e6..22f2689 100644
--- a/exec.go
+++ b/exec.go
@@ -90,9 +90,6 @@ following will output a list of processes running in the container:
if err := checkArgs(context, 1, minArgs); err != nil {
return err
}
- if os.Geteuid() != 0 {
- return fmt.Errorf("runc should be run as root")
- }
if err := revisePidFile(context); err != nil {
return err
}
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
index 890cd7d..98f4b85 100644
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@@ -183,6 +183,9 @@ type Config struct {
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
+
+ // Rootless specifies whether the container is a rootless container.
+ Rootless bool `json:"rootless"`
}
type Hooks struct {
diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go
new file mode 100644
index 0000000..1e83ced
--- /dev/null
+++ b/libcontainer/configs/validate/rootless.go
@@ -0,0 +1,117 @@
+package validate
+
+import (
+ "fmt"
+ "os"
+ "reflect"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+var (
+ geteuid = os.Geteuid
+ getegid = os.Getegid
+)
+
+func (v *ConfigValidator) rootless(config *configs.Config) error {
+ if err := rootlessMappings(config); err != nil {
+ return err
+ }
+ if err := rootlessMount(config); err != nil {
+ return err
+ }
+ // Currently, cgroups cannot effectively be used in rootless containers.
+ // The new cgroup namespace doesn't really help us either because it doesn't
+ // have nice interactions with the user namespace (we're working with upstream
+ // to fix this).
+ if err := rootlessCgroup(config); err != nil {
+ return err
+ }
+
+ // XXX: We currently can't verify the user config at all, because
+ // configs.Config doesn't store the user-related configs. So this
+ // has to be verified by setupUser() in init_linux.go.
+
+ return nil
+}
+
+func rootlessMappings(config *configs.Config) error {
+ rootuid, err := config.HostUID()
+ if err != nil {
+ return fmt.Errorf("failed to get root uid from uidMappings: %v", err)
+ }
+ if euid := geteuid(); euid != 0 {
+ if !config.Namespaces.Contains(configs.NEWUSER) {
+ return fmt.Errorf("rootless containers require user namespaces")
+ }
+ if rootuid != euid {
+ return fmt.Errorf("rootless containers cannot map container root to a different host user")
+ }
+ }
+
+ rootgid, err := config.HostGID()
+ if err != nil {
+ return fmt.Errorf("failed to get root gid from gidMappings: %v", err)
+ }
+
+ // Similar to the above test, we need to make sure that we aren't trying to
+ // map to a group ID that we don't have the right to be.
+ if rootgid != getegid() {
+ return fmt.Errorf("rootless containers cannot map container root to a different host group")
+ }
+
+ // We can only map one user and group inside a container (our own).
+ if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 {
+ return fmt.Errorf("rootless containers cannot map more than one user")
+ }
+ if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 {
+ return fmt.Errorf("rootless containers cannot map more than one group")
+ }
+
+ return nil
+}
+
+// rootlessCgroup verifies that the user isn't trying to set any cgroup resource limits.
+func rootlessCgroup(config *configs.Config) error {
+ // Nothing set at all.
+ if config.Cgroups == nil || config.Cgroups.Resources == nil {
+ return nil
+ }
+
+ // Used for comparing to the zero value.
+ left := reflect.ValueOf(*config.Cgroups.Resources)
+ right := reflect.Zero(left.Type())
+
+ // This is all we need to do, since specconv won't add cgroup options in
+ // rootless mode.
+ if !reflect.DeepEqual(left.Interface(), right.Interface()) {
+ return fmt.Errorf("cannot specify resource limits in rootless container")
+ }
+
+ return nil
+}
+
+// rootlessMount verifies that the user isn't trying to set up any mounts they don't have
+// the rights to do. In addition, it makes sure that no mount has a `uid=` or
+// `gid=` option that doesn't resolve to root.
+func rootlessMount(config *configs.Config) error {
+ // XXX: We could whitelist allowed devices at this point, but I'm not
+ // convinced that's a good idea. The kernel is the best arbiter of
+ // access control.
+
+ for _, mount := range config.Mounts {
+ // Check that the options list doesn't contain any uid= or gid= entries
+ // that don't resolve to root.
+ for _, opt := range strings.Split(mount.Data, ",") {
+ if strings.HasPrefix(opt, "uid=") && opt != "uid=0" {
+ return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0")
+ }
+ if strings.HasPrefix(opt, "gid=") && opt != "gid=0" {
+ return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0")
+ }
+ }
+ }
+
+ return nil
+}
diff --git a/libcontainer/configs/validate/rootless_test.go b/libcontainer/configs/validate/rootless_test.go
new file mode 100644
index 0000000..23d678d
--- /dev/null
+++ b/libcontainer/configs/validate/rootless_test.go
@@ -0,0 +1,195 @@
+package validate
+
+import (
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func init() {
+ geteuid = func() int { return 1337 }
+ getegid = func() int { return 7331 }
+}
+
+func rootlessConfig() *configs.Config {
+ return &configs.Config{
+ Rootfs: "/var",
+ Rootless: true,
+ Namespaces: configs.Namespaces(
+ []configs.Namespace{
+ {Type: configs.NEWUSER},
+ },
+ ),
+ UidMappings: []configs.IDMap{
+ {
+ HostID: geteuid(),
+ ContainerID: 0,
+ Size: 1,
+ },
+ },
+ GidMappings: []configs.IDMap{
+ {
+ HostID: getegid(),
+ ContainerID: 0,
+ Size: 1,
+ },
+ },
+ }
+}
+
+func TestValidateRootless(t *testing.T) {
+ validator := New()
+
+ config := rootlessConfig()
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+/* rootlessMappings() */
+
+func TestValidateRootlessUserns(t *testing.T) {
+ validator := New()
+
+ config := rootlessConfig()
+ config.Namespaces = nil
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if user namespaces not set")
+ }
+}
+
+func TestValidateRootlessMappingUid(t *testing.T) {
+ validator := New()
+
+ config := rootlessConfig()
+ config.UidMappings = nil
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if no uid mappings provided")
+ }
+
+ config = rootlessConfig()
+ config.UidMappings[0].HostID = geteuid() + 1
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if geteuid() != mapped uid")
+ }
+
+ config = rootlessConfig()
+ config.UidMappings[0].Size = 1024
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if more than one uid mapped")
+ }
+
+ config = rootlessConfig()
+ config.UidMappings = append(config.UidMappings, configs.IDMap{
+ HostID: geteuid() + 1,
+ ContainerID: 0,
+ Size: 1,
+ })
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if more than one uid extent mapped")
+ }
+}
+
+func TestValidateRootlessMappingGid(t *testing.T) {
+ validator := New()
+
+ config := rootlessConfig()
+ config.GidMappings = nil
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if no gid mappings provided")
+ }
+
+ config = rootlessConfig()
+ config.GidMappings[0].HostID = getegid() + 1
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if getegid() != mapped gid")
+ }
+
+ config = rootlessConfig()
+ config.GidMappings[0].Size = 1024
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if more than one gid mapped")
+ }
+
+ config = rootlessConfig()
+ config.GidMappings = append(config.GidMappings, configs.IDMap{
+ HostID: getegid() + 1,
+ ContainerID: 0,
+ Size: 1,
+ })
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if more than one gid extent mapped")
+ }
+}
+
+/* rootlessMount() */
+
+func TestValidateRootlessMountUid(t *testing.T) {
+ config := rootlessConfig()
+ validator := New()
+
+ config.Mounts = []*configs.Mount{
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ },
+ }
+
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "uid=5"
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur when setting uid=5 in mount options")
+ }
+
+ config.Mounts[0].Data = "uid=0"
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err)
+ }
+}
+
+func TestValidateRootlessMountGid(t *testing.T) {
+ config := rootlessConfig()
+ validator := New()
+
+ config.Mounts = []*configs.Mount{
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ },
+ }
+
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "gid=5"
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur when setting gid=5 in mount options")
+ }
+
+ config.Mounts[0].Data = "gid=0"
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err)
+ }
+}
+
+/* rootlessCgroup() */
+
+func TestValidateRootlessCgroup(t *testing.T) {
+ validator := New()
+
+ config := rootlessConfig()
+ config.Cgroups = &configs.Cgroup{
+ Resources: &configs.Resources{
+ PidsLimit: 1337,
+ },
+ }
+ if err := validator.Validate(config); err == nil {
+ t.Errorf("Expected error to occur if cgroup limits set")
+ }
+}
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index ecf8335..8284345 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -40,6 +40,11 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.sysctl(config); err != nil {
return err
}
+ if config.Rootless {
+ if err := v.rootless(config); err != nil {
+ return err
+ }
+ }
return nil
}
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index d2e0e2b..372763a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -51,6 +51,9 @@ type State struct {
// Platform specific fields below here
+ // Rootless specifies whether the container was started in rootless mode.
+ Rootless bool `json:"rootless"`
+
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path.
CgroupPaths map[string]string `json:"cgroup_paths"`
@@ -452,6 +455,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
+ Rootless: c.config.Rootless,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
@@ -622,6 +626,13 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
+ // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
+ // support for doing unprivileged dumps, but the setup of
+ // rootless containers might make this complicated.
+ if c.config.Rootless {
+ return fmt.Errorf("cannot checkpoint a rootless container")
+ }
+
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
@@ -791,6 +802,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
+
+ // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
+ // support for unprivileged restore at the moment.
+ if c.config.Rootless {
+ return fmt.Errorf("cannot restore a rootless container")
+ }
+
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
@@ -918,6 +936,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
+ // XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil {
return err
}
@@ -1319,6 +1338,7 @@ func (c *linuxContainer) currentState() (*State, error) {
InitProcessStartTime: startTime,
Created: c.created,
},
+ Rootless: c.config.Rootless,
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors,
@@ -1446,16 +1466,19 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr,
Value: b,
})
- // check if we have CAP_SETGID to setgroup properly
- pid, err := capability.NewPid(os.Getpid())
- if err != nil {
- return nil, err
- }
- if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
- r.AddData(&Boolmsg{
- Type: SetgroupAttr,
- Value: true,
- })
+ // The following only applies if we are root.
+ if !c.config.Rootless {
+ // check if we have CAP_SETGID to setgroup properly
+ pid, err := capability.NewPid(os.Getpid())
+ if err != nil {
+ return nil, err
+ }
+ if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
+ r.AddData(&Boolmsg{
+ Type: SetgroupAttr,
+ Value: true,
+ })
+ }
}
}
}
@@ -1466,5 +1489,11 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
+ // write rootless
+ r.AddData(&Boolmsg{
+ Type: RootlessAttr,
+ Value: c.config.Rootless,
+ })
+
return bytes.NewReader(r.Serialize()), nil
}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 0f5d412..1187835 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -58,6 +58,7 @@ type initConfig struct {
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
CreateConsole bool `json:"create_console"`
+ Rootless bool `json:"rootless"`
}
type initer interface {
@@ -229,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error {
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
- Uid: syscall.Getuid(),
- Gid: syscall.Getgid(),
+ Uid: 0,
+ Gid: 0,
Home: "/",
}
+
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
+
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
+
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
@@ -253,22 +257,49 @@ func setupUser(config *initConfig) error {
return err
}
}
+
+ if config.Rootless {
+ if execUser.Uid != 0 {
+ return fmt.Errorf("cannot run as a non-root user in a rootless container")
+ }
+
+ if execUser.Gid != 0 {
+ return fmt.Errorf("cannot run as a non-root group in a rootless container")
+ }
+
+ // We cannot set any additional groups in a rootless container and thus we
+ // bail if the user asked us to do so. TODO: We currently can't do this
+ // earlier, but if libcontainer.Process.User was typesafe this might work.
+ if len(addGroups) > 0 {
+ return fmt.Errorf("cannot set any additional groups in a rootless container")
+ }
+ }
+
// before we change to the container's user make sure that the processes STDIO
// is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil {
return err
}
- suppGroups := append(execUser.Sgids, addGroups...)
- if err := syscall.Setgroups(suppGroups); err != nil {
- return err
+
+ // This isn't allowed in an unprivileged user namespace since Linux 3.19.
+ // There's nothing we can do about /etc/group entries, so we silently
+ // ignore setting groups here (since the user didn't explicitly ask us to
+ // set the group).
+ if !config.Rootless {
+ suppGroups := append(execUser.Sgids, addGroups...)
+ if err := syscall.Setgroups(suppGroups); err != nil {
+ return err
+ }
}
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
+
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
+
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
index 321d664..bc725a2 100644
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -18,6 +18,7 @@ const (
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
+ RootlessAttr uint16 = 27287
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 9630206..0ad6883 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -72,6 +72,7 @@ struct nlconfig_t {
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
+ uint8_t is_rootless;
char *oom_score_adj;
size_t oom_score_adj_len;
};
@@ -87,6 +88,7 @@ struct nlconfig_t {
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
+#define ROOTLESS_ATTR 27287
/*
* Use the raw syscall for versions of glibc which don't include a function for
@@ -175,6 +177,7 @@ static void update_setgroups(int pid, enum policy_t setgroup)
policy = "deny";
break;
case SETGROUPS_DEFAULT:
+ default:
/* Nothing to do. */
return;
}
@@ -329,6 +332,9 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
+ case ROOTLESS_ATTR:
+ config->is_rootless = readint8(current);
+ break;
case OOM_SCORE_ADJ_ATTR:
config->oom_score_adj = current;
config->oom_score_adj_len = payload_len;
@@ -574,9 +580,21 @@ void nsexec(void)
exit(ret);
case SYNC_USERMAP_PLS:
- /* Enable setgroups(2) if we've been asked to. */
+ /*
+ * Enable setgroups(2) if we've been asked to. But we also
+ * have to explicitly disable setgroups(2) if we're
+ * creating a rootless container (this is required since
+ * Linux 3.19).
+ */
+ if (config.is_rootless && config.is_setgroup) {
+ kill(child, SIGKILL);
+ bail("cannot allow setgroup in an unprivileged user namespace setup");
+ }
+
if (config.is_setgroup)
update_setgroups(child, SETGROUPS_ALLOW);
+ if (config.is_rootless)
+ update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
update_uidmap(child, config.uidmap, config.uidmap_len);
@@ -818,8 +836,10 @@ void nsexec(void)
if (setgid(0) < 0)
bail("setgid failed");
- if (setgroups(0, NULL) < 0)
- bail("setgroups failed");
+ if (!config.is_rootless && config.is_setgroup) {
+ if (setgroups(0, NULL) < 0)
+ bail("setgroups failed");
+ }
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index c60f473..e8b7506 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -80,7 +80,8 @@ func (p *setnsProcess) start() (err error) {
if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
- if len(p.cgroupPaths) > 0 {
+ // We can't join cgroups if we're in a rootless container.
+ if !p.config.Rootless && len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
@@ -253,13 +254,15 @@ func (p *initProcess) start() error {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
- // Do this before syncing with child so that no children
- // can escape the cgroup
- if err := p.manager.Apply(p.pid()); err != nil {
- return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+ if !p.container.config.Rootless {
+ // Do this before syncing with child so that no children can escape the
+ // cgroup. We can't do this if we're not running as root.
+ if err := p.manager.Apply(p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+ }
}
defer func() {
- if err != nil {
+ if err != nil && !p.container.config.Rootless {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
}
@@ -278,8 +281,11 @@ func (p *initProcess) start() error {
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
- if err := p.manager.Set(p.config.Config); err != nil {
- return newSystemErrorWithCause(err, "setting cgroup config for ready process")
+ // We can't set cgroups if we're in a rootless container.
+ if !p.container.config.Rootless {
+ if err := p.manager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting cgroup config for ready process")
+ }
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
@@ -424,6 +430,12 @@ func getPipeFds(pid int) ([]string, error) {
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
+ // Ignore permission errors, for rootless containers and other
+ // non-dumpable processes. If we can't get the fd for a particular
+ // file, there's not much we can do.
+ if os.IsPermission(err) {
+ continue
+ }
return fds, err
}
fds[i] = target
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
new file mode 100644
index 0000000..44fad97
--- /dev/null
+++ b/libcontainer/specconv/example.go
@@ -0,0 +1,160 @@
+package specconv
+
+import (
+ "runtime"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func sPtr(s string) *string { return &s }
+
+// ExampleSpec returns an example spec file, with many options set so a user
+// can see what a standard spec file looks like.
+func ExampleSpec() *specs.Spec {
+ return &specs.Spec{
+ Version: specs.Version,
+ Platform: specs.Platform{
+ OS: runtime.GOOS,
+ Arch: runtime.GOARCH,
+ },
+ Root: specs.Root{
+ Path: "rootfs",
+ Readonly: true,
+ },
+ Process: specs.Process{
+ Terminal: true,
+ User: specs.User{},
+ Args: []string{
+ "sh",
+ },
+ Env: []string{
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm",
+ },
+ Cwd: "/",
+ NoNewPrivileges: true,
+ Capabilities: &specs.LinuxCapabilities{
+ Bounding: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Permitted: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Inheritable: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Ambient: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Effective: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ },
+ Rlimits: []specs.LinuxRlimit{
+ {
+ Type: "RLIMIT_NOFILE",
+ Hard: uint64(1024),
+ Soft: uint64(1024),
+ },
+ },
+ },
+ Hostname: "runc",
+ Mounts: []specs.Mount{
+ {
+ Destination: "/proc",
+ Type: "proc",
+ Source: "proc",
+ Options: nil,
+ },
+ {
+ Destination: "/dev",
+ Type: "tmpfs",
+ Source: "tmpfs",
+ Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
+ },
+ {
+ Destination: "/dev/pts",
+ Type: "devpts",
+ Source: "devpts",
+ Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
+ },
+ {
+ Destination: "/dev/shm",
+ Type: "tmpfs",
+ Source: "shm",
+ Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
+ },
+ {
+ Destination: "/dev/mqueue",
+ Type: "mqueue",
+ Source: "mqueue",
+ Options: []string{"nosuid", "noexec", "nodev"},
+ },
+ {
+ Destination: "/sys",
+ Type: "sysfs",
+ Source: "sysfs",
+ Options: []string{"nosuid", "noexec", "nodev", "ro"},
+ },
+ {
+ Destination: "/sys/fs/cgroup",
+ Type: "cgroup",
+ Source: "cgroup",
+ Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
+ },
+ },
+ Linux: &specs.Linux{
+ MaskedPaths: []string{
+ "/proc/kcore",
+ "/proc/latency_stats",
+ "/proc/timer_list",
+ "/proc/timer_stats",
+ "/proc/sched_debug",
+ "/sys/firmware",
+ },
+ ReadonlyPaths: []string{
+ "/proc/asound",
+ "/proc/bus",
+ "/proc/fs",
+ "/proc/irq",
+ "/proc/sys",
+ "/proc/sysrq-trigger",
+ },
+ Resources: &specs.LinuxResources{
+ Devices: []specs.LinuxDeviceCgroup{
+ {
+ Allow: false,
+ Access: "rwm",
+ },
+ },
+ },
+ Namespaces: []specs.LinuxNamespace{
+ {
+ Type: "pid",
+ },
+ {
+ Type: "network",
+ },
+ {
+ Type: "ipc",
+ },
+ {
+ Type: "uts",
+ },
+ {
+ Type: "mount",
+ },
+ },
+ },
+ }
+}
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index 52b3ca1..346b268 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -145,6 +145,7 @@ type CreateOpts struct {
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
+ Rootless bool
}
// CreateLibcontainerConfig creates a new libcontainer configuration from a
@@ -175,6 +176,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
+ Rootless: opts.Rootless,
}
exists := false
@@ -208,7 +210,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
- c, err := createCgroupConfig(opts.CgroupName, opts.UseSystemdCgroup, spec)
+ c, err := createCgroupConfig(opts)
if err != nil {
return nil, err
}
@@ -264,8 +266,14 @@ func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
}
}
-func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) {
- var myCgroupPath string
+func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
+ var (
+ myCgroupPath string
+
+ spec = opts.Spec
+ useSystemdCgroup = opts.UseSystemdCgroup
+ name = opts.CgroupName
+ )
c := &configs.Cgroup{
Resources: &configs.Resources{},
@@ -301,9 +309,14 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
c.Path = myCgroupPath
}
- c.Resources.AllowedDevices = allowedDevices
- if spec.Linux == nil {
- return c, nil
+ // In rootless containers, any attempt to make cgroup changes will fail.
+ // libcontainer will validate this and we shouldn't add any cgroup options
+ // the user didn't specify.
+ if !opts.Rootless {
+ c.Resources.AllowedDevices = allowedDevices
+ if spec.Linux == nil {
+ return c, nil
+ }
}
r := spec.Linux.Resources
if r == nil {
@@ -340,8 +353,10 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*
}
c.Resources.Devices = append(c.Resources.Devices, dd)
}
- // append the default allowed devices to the end of the list
- c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
+ if !opts.Rootless {
+ // append the default allowed devices to the end of the list
+ c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
+ }
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go
index baa2638..741fae6 100644
--- a/libcontainer/specconv/spec_linux_test.go
+++ b/libcontainer/specconv/spec_linux_test.go
@@ -3,8 +3,10 @@
package specconv
import (
+ "os"
"testing"
+ "github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runtime-spec/specs-go"
)
@@ -16,7 +18,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) {
CgroupsPath: cgroupsPath,
}
- cgroup, err := createCgroupConfig("ContainerID", false, spec)
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
+
+ cgroup, err := createCgroupConfig(opts)
if err != nil {
t.Errorf("Couldn't create Cgroup config: %v", err)
}
@@ -28,8 +36,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) {
func TestLinuxCgroupsPathNotSpecified(t *testing.T) {
spec := &specs.Spec{}
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
- cgroup, err := createCgroupConfig("ContainerID", false, spec)
+ cgroup, err := createCgroupConfig(opts)
if err != nil {
t.Errorf("Couldn't create Cgroup config: %v", err)
}
@@ -39,6 +52,26 @@ func TestLinuxCgroupsPathNotSpecified(t *testing.T) {
}
}
+func TestSpecconvExampleValidate(t *testing.T) {
+ spec := ExampleSpec()
+ spec.Root.Path = "/"
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
+
+ config, err := CreateLibcontainerConfig(opts)
+ if err != nil {
+ t.Errorf("Couldn't create libcontainer config: %v", err)
+ }
+
+ validator := validate.New()
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected specconv to produce valid container config: %v", err)
+ }
+}
+
func TestDupNamespaces(t *testing.T) {
spec := &specs.Spec{
Linux: &specs.Linux{
@@ -62,3 +95,46 @@ func TestDupNamespaces(t *testing.T) {
t.Errorf("Duplicated namespaces should be forbidden")
}
}
+
+func TestRootlessSpecconvValidate(t *testing.T) {
+	spec := &specs.Spec{
+		Linux: &specs.Linux{
+			Namespaces: []specs.LinuxNamespace{
+				{
+					Type: specs.UserNamespace,
+				},
+			},
+			UIDMappings: []specs.LinuxIDMapping{
+				{
+					HostID:      uint32(os.Geteuid()),
+					ContainerID: 0,
+					Size:        1,
+				},
+			},
+			GIDMappings: []specs.LinuxIDMapping{
+				{
+					HostID:      uint32(os.Getegid()),
+					ContainerID: 0,
+					Size:        1,
+				},
+			},
+		},
+	}
+	// Only the caller's own euid/egid is mapped (to ID 0) inside a new user namespace.
+	opts := &CreateOpts{
+		CgroupName:       "ContainerID",
+		UseSystemdCgroup: false,
+		Spec:             spec,
+		Rootless:         true,
+	}
+	// The config is not usable when conversion fails, so stop instead of validating it.
+	config, err := CreateLibcontainerConfig(opts)
+	if err != nil {
+		t.Fatalf("Couldn't create libcontainer config: %v", err)
+	}
+	// A spec converted with Rootless set must pass config validation.
+	validator := validate.New()
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected specconv to produce valid rootless container config: %v", err)
+	}
+}
diff --git a/list.go b/list.go
index c7550a2..1c3b9aa 100644
--- a/list.go
+++ b/list.go
@@ -7,12 +7,14 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "syscall"
"text/tabwriter"
"time"
"encoding/json"
"github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/urfave/cli"
)
@@ -38,6 +40,8 @@ type containerState struct {
Created time.Time `json:"created"`
// Annotations is the user defined annotations added to the config.
Annotations map[string]string `json:"annotations,omitempty"`
+ // The owner of the state directory (the owner of the container).
+ Owner string `json:"owner"`
}
var listCommand = cli.Command{
@@ -85,14 +89,15 @@ To list containers created using a non-default value for "--root":
switch context.String("format") {
case "table":
w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
- fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\n")
+ fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
for _, item := range s {
- fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\n",
+ fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n",
item.ID,
item.InitProcessPid,
item.Status,
item.Bundle,
- item.Created.Format(time.RFC3339Nano))
+ item.Created.Format(time.RFC3339Nano),
+ item.Owner)
}
if err := w.Flush(); err != nil {
return err
@@ -126,6 +131,13 @@ func getContainers(context *cli.Context) ([]containerState, error) {
var s []containerState
for _, item := range list {
if item.IsDir() {
+			// This cast is safe on Linux. If the uid cannot be resolved to a name, fall back to "#<uid>".
+			stat := item.Sys().(*syscall.Stat_t)
+			owner, err := user.LookupUid(int(stat.Uid))
+			if err != nil {
+				owner.Name = fmt.Sprintf("#%d", stat.Uid) // string(uid) would yield a rune, not the decimal uid
+			}
+
container, err := factory.Load(item.Name())
if err != nil {
fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err)
@@ -155,6 +167,7 @@ func getContainers(context *cli.Context) ([]containerState, error) {
Rootfs: state.BaseState.Config.Rootfs,
Created: state.BaseState.Created,
Annotations: annotations,
+ Owner: owner.Name,
})
}
}
diff --git a/ps.go b/ps.go
index b8a1b11..6e0c737 100644
--- a/ps.go
+++ b/ps.go
@@ -28,6 +28,11 @@ var psCommand = cli.Command{
if err := checkArgs(context, 1, minArgs); err != nil {
return err
}
+ // XXX: Currently not supported with rootless containers.
+ if isRootless() {
+ return fmt.Errorf("runc ps requires root")
+ }
+
container, err := getContainer(context)
if err != nil {
return err
diff --git a/restore.go b/restore.go
index afc6046..06f635f 100644
--- a/restore.go
+++ b/restore.go
@@ -3,6 +3,7 @@
package main
import (
+ "fmt"
"os"
"syscall"
@@ -86,6 +87,11 @@ using the runc checkpoint command.`,
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
}
+ // XXX: Currently this is untested with rootless containers.
+ if isRootless() {
+ return fmt.Errorf("runc restore requires root")
+ }
+
imagePath := context.String("image-path")
id := context.Args().First()
if id == "" {
diff --git a/spec.go b/spec.go
index 1b55c6b..d7df312 100644
--- a/spec.go
+++ b/spec.go
@@ -10,6 +10,7 @@ import (
"runtime"
"github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
)
@@ -68,152 +69,7 @@ container on your host.`,
if err := checkArgs(context, 0, exactArgs); err != nil {
return err
}
- spec := specs.Spec{
- Version: specs.Version,
- Platform: specs.Platform{
- OS: runtime.GOOS,
- Arch: runtime.GOARCH,
- },
- Root: specs.Root{
- Path: "rootfs",
- Readonly: true,
- },
- Process: specs.Process{
- Terminal: true,
- User: specs.User{},
- Args: []string{
- "sh",
- },
- Env: []string{
- "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
- "TERM=xterm",
- },
- Cwd: "/",
- NoNewPrivileges: true,
- Capabilities: &specs.LinuxCapabilities{
- Bounding: []string{
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE",
- },
- Permitted: []string{
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE",
- },
- Inheritable: []string{
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE",
- },
- Ambient: []string{
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE",
- },
- Effective: []string{
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE",
- },
- },
- Rlimits: []specs.LinuxRlimit{
- {
- Type: "RLIMIT_NOFILE",
- Hard: uint64(1024),
- Soft: uint64(1024),
- },
- },
- },
- Hostname: "runc",
- Mounts: []specs.Mount{
- {
- Destination: "/proc",
- Type: "proc",
- Source: "proc",
- Options: nil,
- },
- {
- Destination: "/dev",
- Type: "tmpfs",
- Source: "tmpfs",
- Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
- },
- {
- Destination: "/dev/pts",
- Type: "devpts",
- Source: "devpts",
- Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
- },
- {
- Destination: "/dev/shm",
- Type: "tmpfs",
- Source: "shm",
- Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
- },
- {
- Destination: "/dev/mqueue",
- Type: "mqueue",
- Source: "mqueue",
- Options: []string{"nosuid", "noexec", "nodev"},
- },
- {
- Destination: "/sys",
- Type: "sysfs",
- Source: "sysfs",
- Options: []string{"nosuid", "noexec", "nodev", "ro"},
- },
- {
- Destination: "/sys/fs/cgroup",
- Type: "cgroup",
- Source: "cgroup",
- Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
- },
- },
- Linux: &specs.Linux{
- MaskedPaths: []string{
- "/proc/kcore",
- "/proc/latency_stats",
- "/proc/timer_list",
- "/proc/timer_stats",
- "/proc/sched_debug",
- "/sys/firmware",
- },
- ReadonlyPaths: []string{
- "/proc/asound",
- "/proc/bus",
- "/proc/fs",
- "/proc/irq",
- "/proc/sys",
- "/proc/sysrq-trigger",
- },
- Resources: &specs.LinuxResources{
- Devices: []specs.LinuxDeviceCgroup{
- {
- Allow: false,
- Access: "rwm",
- },
- },
- },
- Namespaces: []specs.LinuxNamespace{
- {
- Type: "pid",
- },
- {
- Type: "network",
- },
- {
- Type: "ipc",
- },
- {
- Type: "uts",
- },
- {
- Type: "mount",
- },
- },
- },
- }
+ spec := specconv.ExampleSpec()
checkNoFile := func(name string) error {
_, err := os.Stat(name)
@@ -234,7 +90,7 @@ container on your host.`,
if err := checkNoFile(specConfig); err != nil {
return err
}
- data, err := json.MarshalIndent(&spec, "", "\t")
+ data, err := json.MarshalIndent(spec, "", "\t")
if err != nil {
return err
}
diff --git a/utils.go b/utils.go
index 1286fd6..98f93a4 100644
--- a/utils.go
+++ b/utils.go
@@ -63,9 +63,6 @@ func setupSpec(context *cli.Context) (*specs.Spec, error) {
if err != nil {
return nil, err
}
- if os.Geteuid() != 0 {
- return nil, fmt.Errorf("runc should be run as root")
- }
return spec, nil
}
diff --git a/utils_linux.go b/utils_linux.go
index dcf156c..767015e 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -186,6 +186,11 @@ func createPidFile(path string, process *libcontainer.Process) error {
return os.Rename(tmpName, path)
}
+// isRootless reports whether the effective UID is not 0. XXX: Currently we autodetect rootless mode.
+func isRootless() bool {
+ return os.Geteuid() != 0
+}
+
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName: id,
@@ -193,6 +198,7 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcont
NoPivotRoot: context.Bool("no-pivot"),
NoNewKeyring: context.Bool("no-new-keyring"),
Spec: spec,
+ Rootless: isRootless(),
})
if err != nil {
return nil, err
--
2.7.4.3