From 0eac597a6f853c4eb41d7ebe58398c117798542c Mon Sep 17 00:00:00 2001
From: WangFengTu <wangfengtu@huawei.com>
Date: Fri, 29 Jan 2021 16:55:13 +0800
Subject: [PATCH] support cgroup v2

Signed-off-by: WangFengTu <wangfengtu@huawei.com>
---
 src/lxc/cgroups/cgroup2_devices.c | 126 +++++++++++++++++++----------
 src/lxc/cgroups/isulad_cgfsng.c   |  56 ++++++++++++-
 src/lxc/lxccontainer.c            | 129 ++++++++++++++++++++++++++++++
 3 files changed, 269 insertions(+), 42 deletions(-)

diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c
index 4efb28fb..05613c51 100644
--- a/src/lxc/cgroups/cgroup2_devices.c
+++ b/src/lxc/cgroups/cgroup2_devices.c
@@ -25,6 +25,19 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define BPF_LOG_BUF_SIZE (1 << 23) /* 8 MiB: size of the log buffer handed to BPF_PROG_LOAD */
+#ifndef BPF_LOG_LEVEL1
+#define BPF_LOG_LEVEL1 1
+#endif
+
+#ifndef BPF_LOG_LEVEL2
+#define BPF_LOG_LEVEL2 2
+#endif
+
+#ifndef BPF_LOG_LEVEL
+#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) /* both bits set; stored in bpf_attr.log_level */
+#endif
+
 lxc_log_define(cgroup2_devices, cgroup);
 
 static int bpf_program_add_instructions(struct bpf_program *prog,
@@ -42,6 +55,8 @@ static int bpf_program_add_instructions(struct bpf_program *prog,
 		return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program");
 
 	prog->instructions = new_insn;
+	memset(prog->instructions + prog->n_instructions, 0,
+	       sizeof(struct bpf_insn) * count);
 	memcpy(prog->instructions + prog->n_instructions, instructions,
 	       sizeof(struct bpf_insn) * count);
 	prog->n_instructions += count;
@@ -118,29 +133,27 @@ void bpf_program_free(struct bpf_program *prog)
 			   .off = 0,                   \
 			   .imm = 0})
 
-static int bpf_access_mask(const char *acc)
+static int bpf_access_mask(const char *acc, __u32 *mask) /* ORs one BPF_DEVCG_ACC_* bit into *mask per 'r'/'w'/'m' letter of acc */
 {
-	int mask = 0;
-
 	if (!acc)
-		return mask;
+		return 0; /* NULL access string: *mask is left untouched */
 
 	for (; *acc; acc++)
 		switch (*acc) {
 		case 'r':
-			mask |= BPF_DEVCG_ACC_READ;
+			*mask |= BPF_DEVCG_ACC_READ;
 			break;
 		case 'w':
-			mask |= BPF_DEVCG_ACC_WRITE;
+			*mask |= BPF_DEVCG_ACC_WRITE;
 			break;
 		case 'm':
-			mask |= BPF_DEVCG_ACC_MKNOD;
+			*mask |= BPF_DEVCG_ACC_MKNOD;
 			break;
 		default:
 			return -EINVAL;
 		}
 
-	return mask;
+	return 0; /* success; bits are only OR-ed in, *mask is never reset by this function */
 }
 
 static int bpf_device_type(char type)
@@ -157,19 +170,18 @@ static int bpf_device_type(char type)
 	return -1;
 }
 
-static inline bool bpf_device_all_access(int access_mask)
+static inline bool bpf_device_all_access(__u32 access_mask)
 {
-	return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE |
-				BPF_DEVCG_ACC_MKNOD));
+	return access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD); /* true only for exactly read|write|mknod */
 }
 
 struct bpf_program *bpf_program_new(uint32_t prog_type)
 {
 	__do_free struct bpf_program *prog = NULL;
 
-	prog = calloc(1, sizeof(struct bpf_program));
+	prog = zalloc(sizeof(struct bpf_program));
 	if (!prog)
-		return NULL;
+		return ret_set_errno(NULL, ENOMEM);
 
 	prog->prog_type = prog_type;
 	prog->kernel_fd = -EBADF;
@@ -209,12 +221,10 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi
 {
 	int ret;
 	int jump_nr = 1;
-	struct bpf_insn bpf_access_decision[] = {
-	    BPF_MOV64_IMM(BPF_REG_0, device->allow),
-	    BPF_EXIT_INSN(),
-	};
-	int access_mask;
+	__u32 access_mask = 0;
 	int device_type;
+	struct bpf_insn bpf_access_decision[2];
+	bool add_exist = false;
 
 	if (!prog || !device)
 		return ret_set_errno(-1, EINVAL);
@@ -225,6 +235,13 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi
 		return 0;
 	}
 
+	ret = bpf_access_mask(device->access, &access_mask);
+	if (ret < 0)
+		return log_error_errno(ret, -ret, "Invalid access mask specified %s", device->access);
+
+	if (!bpf_device_all_access(access_mask))
+		jump_nr += 3;
+
 	device_type = bpf_device_type(device->type);
 	if (device_type < 0)
 		return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device type %c", device->type);
@@ -232,63 +249,67 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi
 	if (device_type > 0)
 		jump_nr++;
 
-	access_mask = bpf_access_mask(device->access);
-	if (!bpf_device_all_access(access_mask))
-		jump_nr += 3;
-
-	if (device->major != -1)
+	if (device->major >= 0)
 		jump_nr++;
 
-	if (device->minor != -1)
+	if (device->minor >= 0)
 		jump_nr++;
 
 	if (device_type > 0) {
 		struct bpf_insn ins[] = {
-		    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
 		};
 
 		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
 		if (ret)
 			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+		add_exist = true;
 	}
 
 	if (!bpf_device_all_access(access_mask)) {
 		struct bpf_insn ins[] = {
-		    BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
-		    BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
-		    BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr),
+			BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+			BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr-2),
 		};
 
 		jump_nr -= 3;
 		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
 		if (ret)
 			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+		add_exist = true;
 	}
 
 	if (device->major >= 0) {
 		struct bpf_insn ins[] = {
-		    BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--),
 		};
 
 		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
 		if (ret)
 			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+		add_exist = true;
 	}
 
 	if (device->minor >= 0) {
 		struct bpf_insn ins[] = {
-		    BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--),
 		};
 
 		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
 		if (ret)
 			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+		add_exist = true;
 	}
 
-	ret = bpf_program_add_instructions(prog, bpf_access_decision,
-					    ARRAY_SIZE(bpf_access_decision));
-	if (ret)
-		return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	if (add_exist) {
+		bpf_access_decision[0] = BPF_MOV64_IMM(BPF_REG_0, device->allow);
+		bpf_access_decision[1] = BPF_EXIT_INSN();
+		ret = bpf_program_add_instructions(prog, bpf_access_decision,
+					   ARRAY_SIZE(bpf_access_decision));
+		if (ret)
+			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	}
 
 	return 0;
 }
@@ -310,30 +331,49 @@ int bpf_program_finalize(struct bpf_program *prog)
 	return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
 }
 
-static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf,
-				   size_t log_size)
+/* Load prog with BPF_PROG_LOAD unless it already holds a kernel fd; returns 0 then, error paths return through log_error_errno(-1, ...). */
+static int bpf_program_load_kernel(struct bpf_program *prog)
 {
+	__do_free char *log_buf = NULL;
+	__u32 log_level = 0, log_size = 0;
 	union bpf_attr attr;
+	struct rlimit limit = {
+		.rlim_cur = RLIM_INFINITY,
+		.rlim_max = RLIM_INFINITY,
+	};
 
 	if (prog->kernel_fd >= 0) {
-		memset(log_buf, 0, log_size);
 		return 0;
 	}
 
+	if (lxc_log_get_level() <= LXC_LOG_LEVEL_DEBUG) { /* the log buffer is only allocated at this log level or below */
+		log_buf = zalloc(BPF_LOG_BUF_SIZE);
+		if (!log_buf) {
+			WARN("Failed to allocate bpf log buffer"); /* not fatal: log_level and log_size stay 0 */
+		} else {
+			log_level = BPF_LOG_LEVEL;
+			log_size = BPF_LOG_BUF_SIZE;
+		}
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &limit) < 0)
+		return log_error_errno(-1, errno, "Failed to set rlimit memlock to unlimited");
+
 	attr = (union bpf_attr){
 	    .prog_type	= prog->prog_type,
 	    .insns	= PTR_TO_UINT64(prog->instructions),
 	    .insn_cnt	= prog->n_instructions,
 	    .license	= PTR_TO_UINT64("GPL"),
 	    .log_buf	= PTR_TO_UINT64(log_buf),
-	    .log_level	= !!log_buf,
+	    .log_level	= log_level,
 	    .log_size	= log_size,
 	};
 
 	prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 	if (prog->kernel_fd < 0)
-		return log_error_errno(-1, errno, "Failed to load bpf program");
+		return log_error_errno(-1, errno, "Failed to load bpf program: %s", log_buf ?: "(null)"); /* log_buf is NULL when no buffer was allocated */
 
+	TRACE("Loaded bpf program: %s", log_buf ?: "(null)");
 	return 0;
 }
 
@@ -362,7 +402,7 @@ int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
 			return true;
 	}
 
-	ret = bpf_program_load_kernel(prog, NULL, 0);
+	ret = bpf_program_load_kernel(prog);
 	if (ret < 0)
 		return log_error_errno(-1, ret, "Failed to load bpf program");
 
@@ -518,11 +558,15 @@ bool bpf_devices_cgroup_supported(void)
 	if (prog < 0)
 		return log_trace(false, "Failed to allocate new bpf device cgroup program");
 
+	ret = bpf_program_init(prog);
+	if (ret)
+		return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");
+
 	ret = bpf_program_add_instructions(prog, dummy, ARRAY_SIZE(dummy));
 	if (ret < 0)
 		return log_trace(false, "Failed to add new instructions to bpf device cgroup program");
 
-	ret = bpf_program_load_kernel(prog, NULL, 0);
+	ret = bpf_program_load_kernel(prog);
 	if (ret < 0)
 		return log_trace(false, "Failed to load new bpf device cgroup program");
 
diff --git a/src/lxc/cgroups/isulad_cgfsng.c b/src/lxc/cgroups/isulad_cgfsng.c
index e16f8a19..c80527d5 100644
--- a/src/lxc/cgroups/isulad_cgfsng.c
+++ b/src/lxc/cgroups/isulad_cgfsng.c
@@ -823,6 +823,9 @@ static bool isulad_cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *
 	char *cgpath, *slash;
 	bool sub_mk_success = false;
 
+	if (is_unified_hierarchy(h))
+		return true;
+
 	if (!string_in_list(h->controllers, "cpuset"))
 		return true;
 
@@ -1288,6 +1291,20 @@ __cgfsng_ops static bool isulad_cgfsng_mount(struct cgroup_ops *ops,
 		ERROR("Failed to create directory: %s", tmpfspath);
 		goto on_error;
 	}
+
+        if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+                if (has_cgns && wants_force_mount) {
+                        /*
+                         * If cgroup namespaces are supported but the container
+                         * will not have CAP_SYS_ADMIN after it has started we
+                         * need to mount the cgroups manually.
+                         */
+                        return cg_mount_in_cgroup_namespace(type, ops->unified, tmpfspath) == 0;
+                }
+
+                return cg_mount_cgroup_full(type, ops->unified, tmpfspath) == 0;
+        }
+
 	ret = safe_mount(NULL, tmpfspath, "tmpfs",
 	                 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
 	                 "size=10240k,mode=755", root, handler->conf->lsm_se_mount_context);
@@ -2196,8 +2213,16 @@ __cgfsng_ops static int isulad_cgfsng_set(struct cgroup_ops *ops,
 	h = get_hierarchy(ops, controller);
 	if (h) {
 		char *fullpath;
-
 		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
+
+		if (strcmp(filename, "io.weight") == 0 || strcmp(filename, "io.bfq.weight") == 0) {
+			if (!file_exists(fullpath)) {
+				free(path);
+				free(fullpath);
+				return 0;
+			}
+		}
+
 		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
 		free(fullpath);
 	}
@@ -2428,6 +2453,9 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits_legacy(struct cgroup_ops *op
 	if (!ops->hierarchies)
 		return ret_set_errno(false, EINVAL);
 
+	if (pure_unified_layout(ops))
+		return true;
+
 	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
 	if (!sorted_cgroup_settings)
 		return false;
@@ -2528,6 +2556,7 @@ static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
 __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops,
 					     struct lxc_handler *handler)
 {
+	__do_free char *path = NULL;
 	struct lxc_list *cgroup_settings, *iterator;
 	struct hierarchy *h;
 	struct lxc_conf *conf;
@@ -2549,6 +2578,9 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops,
 		return true;
 	cgroup_settings = &conf->cgroup2;
 
+	if (!pure_unified_layout(ops))
+		return true;
+
 	if (!ops->unified)
 		return false;
 	h = ops->unified;
@@ -2560,7 +2592,29 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops,
 		if (strncmp("devices", cg->subsystem, 7) == 0) {
 			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
 							cg->value);
+		} else if (strcmp(cg->subsystem, "files.limit") == 0) {
+			long long int setvalue = 0;
+			const char *cgvalue = cg->value;
+
+			if (lxc_safe_long_long(cgvalue, &setvalue) != 0)
+				return log_error(false, "Invalid integer value %s", cgvalue);
+
+			if (setvalue <= 0)
+				cgvalue = "max";
+
+			ret = lxc_write_openat(h->container_full_path,
+					       cg->subsystem, cgvalue,
+					       strlen(cgvalue));
+			if (ret < 0)
+				return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
+						       cg->subsystem, cgvalue);
 		} else {
+			if (strcmp(cg->subsystem, "io.weight") == 0 || strcmp(cg->subsystem, "io.bfq.weight") == 0) {
+				free(path); /* path is function-scoped: release the allocation of an earlier iteration before overwriting it */
+				path = must_make_path(h->container_full_path, cg->subsystem, NULL);
+				if (!file_exists(path))
+					continue; /* the weight file is absent in this cgroup: skip the setting */
+			}
 			ret = lxc_write_openat(h->container_full_path,
 					       cg->subsystem, cg->value,
 					       strlen(cg->value));
diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
index 06552ce5..5769b251 100644
--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -87,6 +87,9 @@
 
 lxc_log_define(lxccontainer, lxc);
 
+typedef bool (*func_is_io_stat_read)(const char *value);  /* predicate on an io.stat key: counted into the read total when true */
+typedef bool (*func_is_io_stat_write)(const char *value); /* predicate on an io.stat key: counted into the write total when true */
+
 static bool do_lxcapi_destroy(struct lxc_container *c);
 static const char *lxcapi_get_config_path(struct lxc_container *c);
 #define do_lxcapi_get_config_path(c) lxcapi_get_config_path(c)
@@ -5768,6 +5771,26 @@ static uint64_t metrics_get_ull(struct lxc_container *c, struct cgroup_ops *cgro
     return val;
 }
 
+/* Read a numeric cgroup item; the cgroup v2 keyword "max" maps to UINT64_MAX, a failed read to 0. */
+static uint64_t metrics_get_ull_with_max(struct lxc_container *c, struct cgroup_ops *cgroup_ops, const char *item)
+{
+    char buf[80] = {0};
+    int len = 0;
+
+    /* One byte is held back so buf stays NUL-terminated however much the getter stores. */
+    len = cgroup_ops->get(cgroup_ops, item, buf, sizeof(buf) - 1, c->name, c->config_path);
+    if (len <= 0) {
+        DEBUG("unable to read cgroup item %s", item);
+        return 0;
+    }
+
+    /* Accept "max" with or without the trailing newline of the cgroup file. */
+    if (strncmp(buf, "max", 3) == 0 && (buf[3] == '\0' || buf[3] == '\n')) {
+        return UINT64_MAX; /* full uint64_t range even where long is 32 bits wide */
+    }
+    return strtoull(buf, NULL, 0);
+}
+
 static inline bool is_blk_metrics_read(const char *value)
 {
     return strcmp(value, "Read") == 0;
@@ -5826,6 +5849,60 @@ err_out:
     return;
 }
 
+/* Sum the "key=value" counters of every line of a cgroup v2 io.stat-style item into stats->read/write/total, using the two key predicates. */
+static void metrics_get_io_stats_v2(struct lxc_container *c, struct cgroup_ops *cgroup_ops, const char *item, struct lxc_blkio_metrics *stats, func_is_io_stat_read is_io_stat_read, func_is_io_stat_write is_io_stat_write)
+{
+#define BUFSIZE 4096
+    char buf[BUFSIZE] = {0};
+    int i = 0, j = 0, len = 0;
+    char **lines = NULL, **cols = NULL, **kv = NULL;
+
+    /* One byte is held back so buf stays NUL-terminated however much the getter stores. */
+    len = cgroup_ops->get(cgroup_ops, item, buf, sizeof(buf) - 1, c->name, c->config_path);
+    if (len <= 0) {
+        DEBUG("unable to read cgroup item %s", item);
+        return;
+    }
+
+    lines = lxc_string_split_and_trim(buf, '\n');
+    if (lines == NULL) {
+        return;
+    }
+
+    (void)memset(stats, 0, sizeof(struct lxc_blkio_metrics));
+    // line example:
+    // 259:0 rbytes=0 wbytes=12288 rios=0 wios=4 dbytes=0 dios=0
+    for (i = 0; lines[i]; i++) {
+        cols = lxc_string_split_and_trim(lines[i], ' ');
+        if (cols == NULL || lxc_array_len((void **)cols) < 2) {
+            /* cols can be an allocated array with too few columns: release it before bailing out */
+            lxc_free_array((void **)cols, free);
+            goto err_out;
+        }
+        len = lxc_array_len((void **)cols);
+        /* the first column ("259:0" in the example above) is skipped; counters start at index 1 */
+        for (j = 1; j < len; j++) {
+            kv = lxc_string_split(cols[j], '=');
+            if (kv == NULL || lxc_array_len((void **)kv) != 2) {
+                lxc_free_array((void **)kv, free);
+                continue;
+            }
+            if (is_io_stat_read(kv[0])) {
+                stats->read += strtoull(kv[1], NULL, 0);
+            } else if (is_io_stat_write(kv[0])) {
+                stats->write += strtoull(kv[1], NULL, 0);
+            }
+            lxc_free_array((void **)kv, free);
+        }
+        lxc_free_array((void **)cols, free);
+    }
+
+    stats->total = stats->read + stats->write;
+
+err_out:
+    lxc_free_array((void **)lines, free);
+}
+
 static uint64_t metrics_match_get_ull(struct lxc_container *c, struct cgroup_ops *cgroup_ops, const char *item, const char *match, int column)
 {
 #define BUFSIZE 4096
@@ -5874,6 +5951,54 @@ err_out:
     return val;
 }
 
+static bool is_io_stat_rbytes(const char *value)
+{
+    return !strcmp(value, "rbytes"); /* io.stat key used for bytes read */
+}
+
+static bool is_io_stat_wbytes(const char *value)
+{
+    return !strcmp(value, "wbytes"); /* io.stat key used for bytes written */
+}
+
+static bool is_io_stat_rios(const char *value)
+{
+    return !strcmp(value, "rios"); /* io.stat key used for read operations */
+}
+
+static bool is_io_stat_wios(const char *value)
+{
+    return !strcmp(value, "wios"); /* io.stat key used for write operations */
+}
+/* Fill metrics from the cgroup v2 items cpu.stat, io.stat, memory.current/max/stat and pids.current; always returns true. */
+static bool unified_metrics_get(struct lxc_container *c, struct cgroup_ops *cgroup_ops, struct lxc_container_metrics *metrics)
+{
+	const uint64_t nsec_per_usec = 1000;
+	// cpu: the *_usec fields of cpu.stat, scaled by 1000
+	metrics->cpu_use_nanos = nsec_per_usec * metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "usage_usec", 1);
+	metrics->cpu_use_user = nsec_per_usec * metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "user_usec", 1);
+	metrics->cpu_use_sys = nsec_per_usec * metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "system_usec", 1);
+
+	// io: byte counters first, then operation counters, both summed from io.stat
+	metrics_get_io_stats_v2(c, cgroup_ops, "io.stat", &metrics->io_service_bytes, is_io_stat_rbytes, is_io_stat_wbytes);
+	metrics_get_io_stats_v2(c, cgroup_ops, "io.stat", &metrics->io_serviced, is_io_stat_rios, is_io_stat_wios);
+
+	// memory: usage, limit ("max" handled by the helper) and two memory.stat fields
+	metrics->mem_used = metrics_get_ull(c, cgroup_ops, "memory.current");
+	metrics->mem_limit = metrics_get_ull_with_max(c, cgroup_ops, "memory.max");
+	metrics->inactive_file_total = metrics_match_get_ull(c, cgroup_ops, "memory.stat", "inactive_file", 1);
+	metrics->cache = metrics_match_get_ull(c, cgroup_ops, "memory.stat", "file", 1);
+	metrics->cache_total = metrics->cache;
+
+	// cgroup v2 does not support kernel memory
+	metrics->kmem_used = 0;
+	metrics->kmem_limit = 0;
+
+	// pids
+	metrics->pids_current = metrics_get_ull(c, cgroup_ops, "pids.current");
+	return true;
+}
+
 /* isulad add get container metrics */
 static bool do_lxcapi_get_container_metrics(struct lxc_container *c,  struct lxc_container_metrics *metrics)
 {
@@ -5897,6 +6022,10 @@ static bool do_lxcapi_get_container_metrics(struct lxc_container *c,  struct lxc
 		return false;
 	}
 
+	if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+		return unified_metrics_get(c, cgroup_ops, metrics);
+	}
+
 	metrics->cpu_use_nanos = metrics_get_ull(c, cgroup_ops, "cpuacct.usage");
 	metrics->pids_current = metrics_get_ull(c, cgroup_ops, "pids.current");
 
-- 
2.20.1