From 0eac597a6f853c4eb41d7ebe58398c117798542c Mon Sep 17 00:00:00 2001 From: WangFengTu Date: Fri, 29 Jan 2021 16:55:13 +0800 Subject: [PATCH] support cgroup v2 Signed-off-by: WangFengTu --- src/lxc/cgroups/cgroup2_devices.c | 126 +++++++++++++++++++---------- src/lxc/cgroups/isulad_cgfsng.c | 56 ++++++++++++- src/lxc/lxccontainer.c | 129 ++++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+), 42 deletions(-) diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c index 4efb28fb..05613c51 100644 --- a/src/lxc/cgroups/cgroup2_devices.c +++ b/src/lxc/cgroups/cgroup2_devices.c @@ -25,6 +25,19 @@ #include #include +#define BPF_LOG_BUF_SIZE (1 << 23) /* 8MB */ +#ifndef BPF_LOG_LEVEL1 +#define BPF_LOG_LEVEL1 1 +#endif + +#ifndef BPF_LOG_LEVEL2 +#define BPF_LOG_LEVEL2 2 +#endif + +#ifndef BPF_LOG_LEVEL +#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +#endif + lxc_log_define(cgroup2_devices, cgroup); static int bpf_program_add_instructions(struct bpf_program *prog, @@ -42,6 +55,8 @@ static int bpf_program_add_instructions(struct bpf_program *prog, return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program"); prog->instructions = new_insn; + memset(prog->instructions + prog->n_instructions, 0, + sizeof(struct bpf_insn) * count); memcpy(prog->instructions + prog->n_instructions, instructions, sizeof(struct bpf_insn) * count); prog->n_instructions += count; @@ -118,29 +133,27 @@ void bpf_program_free(struct bpf_program *prog) .off = 0, \ .imm = 0}) -static int bpf_access_mask(const char *acc) +static int bpf_access_mask(const char *acc, __u32 *mask) { - int mask = 0; - if (!acc) - return mask; + return 0; for (; *acc; acc++) switch (*acc) { case 'r': - mask |= BPF_DEVCG_ACC_READ; + *mask |= BPF_DEVCG_ACC_READ; break; case 'w': - mask |= BPF_DEVCG_ACC_WRITE; + *mask |= BPF_DEVCG_ACC_WRITE; break; case 'm': - mask |= BPF_DEVCG_ACC_MKNOD; + *mask |= BPF_DEVCG_ACC_MKNOD; break; default: return -EINVAL; } - return 
mask; + return 0; } static int bpf_device_type(char type) @@ -157,19 +170,18 @@ static int bpf_device_type(char type) return -1; } -static inline bool bpf_device_all_access(int access_mask) +static inline bool bpf_device_all_access(__u32 access_mask) { - return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | - BPF_DEVCG_ACC_MKNOD)); + return access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD); } struct bpf_program *bpf_program_new(uint32_t prog_type) { __do_free struct bpf_program *prog = NULL; - prog = calloc(1, sizeof(struct bpf_program)); + prog = zalloc(sizeof(struct bpf_program)); if (!prog) - return NULL; + return ret_set_errno(NULL, ENOMEM); prog->prog_type = prog_type; prog->kernel_fd = -EBADF; @@ -209,12 +221,10 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi { int ret; int jump_nr = 1; - struct bpf_insn bpf_access_decision[] = { - BPF_MOV64_IMM(BPF_REG_0, device->allow), - BPF_EXIT_INSN(), - }; - int access_mask; + __u32 access_mask = 0; int device_type; + struct bpf_insn bpf_access_decision[2]; + bool add_exist = false; if (!prog || !device) return ret_set_errno(-1, EINVAL); @@ -225,6 +235,13 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi return 0; } + ret = bpf_access_mask(device->access, &access_mask); + if (ret < 0) + return log_error_errno(ret, -ret, "Invalid access mask specified %s", device->access); + + if (!bpf_device_all_access(access_mask)) + jump_nr += 3; + device_type = bpf_device_type(device->type); if (device_type < 0) return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device type %c", device->type); @@ -232,63 +249,67 @@ int bpf_program_append_device(struct bpf_program *prog, struct device_item *devi if (device_type > 0) jump_nr++; - access_mask = bpf_access_mask(device->access); - if (!bpf_device_all_access(access_mask)) - jump_nr += 3; - - if (device->major != -1) + if (device->major >= 0) jump_nr++; - if (device->minor != 
-1) + if (device->minor >= 0) jump_nr++; if (device_type > 0) { struct bpf_insn ins[] = { - BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--), + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--), }; ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); if (ret) return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + add_exist = true; } if (!bpf_device_all_access(access_mask)) { struct bpf_insn ins[] = { - BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), - BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask), - BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr), + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr-2), }; jump_nr -= 3; ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); if (ret) return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + add_exist = true; } if (device->major >= 0) { struct bpf_insn ins[] = { - BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--), + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--), }; ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); if (ret) return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + add_exist = true; } if (device->minor >= 0) { struct bpf_insn ins[] = { - BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--), + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--), }; ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); if (ret) return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + add_exist = true; } - ret = bpf_program_add_instructions(prog, bpf_access_decision, - ARRAY_SIZE(bpf_access_decision)); - if (ret) - return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + if (add_exist) { + bpf_access_decision[0] = BPF_MOV64_IMM(BPF_REG_0, device->allow); + bpf_access_decision[1] = BPF_EXIT_INSN(); 
+ ret = bpf_program_add_instructions(prog, bpf_access_decision, + ARRAY_SIZE(bpf_access_decision)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + } return 0; } @@ -310,30 +331,49 @@ int bpf_program_finalize(struct bpf_program *prog) return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); } -static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf, - size_t log_size) +static int bpf_program_load_kernel(struct bpf_program *prog) { + __do_free char *log_buf = NULL; + __u32 log_level = 0; + __u32 log_size = 0; union bpf_attr attr; + struct rlimit limit = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; if (prog->kernel_fd >= 0) { - memset(log_buf, 0, log_size); return 0; } + if (lxc_log_get_level() <= LXC_LOG_LEVEL_DEBUG) { + log_buf = zalloc(BPF_LOG_BUF_SIZE); + if (!log_buf) { + WARN("Failed to allocate bpf log buffer"); + } else { + log_level = BPF_LOG_LEVEL; + log_size = BPF_LOG_BUF_SIZE; + } + } + + if (setrlimit(RLIMIT_MEMLOCK, &limit) < 0) + return log_error_errno(-1, errno, "Failed to set rlimit memlock to unlimited"); + attr = (union bpf_attr){ .prog_type = prog->prog_type, .insns = PTR_TO_UINT64(prog->instructions), .insn_cnt = prog->n_instructions, .license = PTR_TO_UINT64("GPL"), .log_buf = PTR_TO_UINT64(log_buf), - .log_level = !!log_buf, + .log_level = log_level, .log_size = log_size, }; prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); if (prog->kernel_fd < 0) - return log_error_errno(-1, errno, "Failed to load bpf program"); + return log_error_errno(-1, errno, "Failed to load bpf program: %s", log_buf ?: "(null)"); /* log_buf stays NULL unless debug logging allocated it; never hand NULL to %s */ + TRACE("Loaded bpf program: %s", log_buf ?: "(null)"); return 0; } @@ -362,7 +402,7 @@ int bpf_program_cgroup_attach(struct bpf_program *prog, int type, return true; } - ret = bpf_program_load_kernel(prog, NULL, 0); + ret = bpf_program_load_kernel(prog); if (ret < 0) return log_error_errno(-1, ret, "Failed to load bpf program"); @@ -518,11 +558,15
@@ bool bpf_devices_cgroup_supported(void) if (prog < 0) return log_trace(false, "Failed to allocate new bpf device cgroup program"); + ret = bpf_program_init(prog); + if (ret) + return log_error_errno(false, ENOMEM, "Failed to initialize bpf program"); + ret = bpf_program_add_instructions(prog, dummy, ARRAY_SIZE(dummy)); if (ret < 0) return log_trace(false, "Failed to add new instructions to bpf device cgroup program"); - ret = bpf_program_load_kernel(prog, NULL, 0); + ret = bpf_program_load_kernel(prog); if (ret < 0) return log_trace(false, "Failed to load new bpf device cgroup program"); diff --git a/src/lxc/cgroups/isulad_cgfsng.c b/src/lxc/cgroups/isulad_cgfsng.c index e16f8a19..c80527d5 100644 --- a/src/lxc/cgroups/isulad_cgfsng.c +++ b/src/lxc/cgroups/isulad_cgfsng.c @@ -823,6 +823,9 @@ static bool isulad_cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char * char *cgpath, *slash; bool sub_mk_success = false; + if (is_unified_hierarchy(h)) + return true; + if (!string_in_list(h->controllers, "cpuset")) return true; @@ -1288,6 +1291,20 @@ __cgfsng_ops static bool isulad_cgfsng_mount(struct cgroup_ops *ops, ERROR("Failed to create directory: %s", tmpfspath); goto on_error; } + + if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) { + if (has_cgns && wants_force_mount) { + /* + * If cgroup namespaces are supported but the container + * will not have CAP_SYS_ADMIN after it has started we + * need to mount the cgroups manually. 
+ */ + return cg_mount_in_cgroup_namespace(type, ops->unified, tmpfspath) == 0; + } + + return cg_mount_cgroup_full(type, ops->unified, tmpfspath) == 0; + } + ret = safe_mount(NULL, tmpfspath, "tmpfs", MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, "size=10240k,mode=755", root, handler->conf->lsm_se_mount_context); @@ -2196,8 +2213,16 @@ __cgfsng_ops static int isulad_cgfsng_set(struct cgroup_ops *ops, h = get_hierarchy(ops, controller); if (h) { char *fullpath; - fullpath = build_full_cgpath_from_monitorpath(h, path, filename); + + if (strcmp(filename, "io.weight") == 0 || strcmp(filename, "io.bfq.weight") == 0) { + if (!file_exists(fullpath)) { + free(path); + free(fullpath); + return 0; + } + } + ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); free(fullpath); } @@ -2428,6 +2453,9 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits_legacy(struct cgroup_ops *op if (!ops->hierarchies) return ret_set_errno(false, EINVAL); + if (pure_unified_layout(ops)) + return true; + sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings); if (!sorted_cgroup_settings) return false; @@ -2528,6 +2556,7 @@ static int bpf_device_cgroup_prepare(struct cgroup_ops *ops, __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_handler *handler) { + __do_free char *path = NULL; struct lxc_list *cgroup_settings, *iterator; struct hierarchy *h; struct lxc_conf *conf; @@ -2549,6 +2578,9 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops, return true; cgroup_settings = &conf->cgroup2; + if (!pure_unified_layout(ops)) + return true; + if (!ops->unified) return false; h = ops->unified; @@ -2560,7 +2592,29 @@ __cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops, if (strncmp("devices", cg->subsystem, 7) == 0) { ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value); + } else if (strcmp(cg->subsystem, "files.limit") == 0) { + long long int setvalue = 0; + const char 
*cgvalue = cg->value; + + if (lxc_safe_long_long(cgvalue, &setvalue) != 0) + return log_error(false, "Invalid integer value %s", cgvalue); + + if (setvalue <= 0) + cgvalue = "max"; + + ret = lxc_write_openat(h->container_full_path, + cg->subsystem, cgvalue, + strlen(cgvalue)); + if (ret < 0) + return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", + cg->subsystem, cgvalue); } else { + if (strcmp(cg->subsystem, "io.weight") == 0 || strcmp(cg->subsystem, "io.bfq.weight") == 0) { + free(path); /* release the path built for an earlier setting before overwriting it */ + path = must_make_path(h->container_full_path, cg->subsystem, NULL); + if (!file_exists(path)) + continue; + } ret = lxc_write_openat(h->container_full_path, cg->subsystem, cg->value, strlen(cg->value)); diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c index 06552ce5..5769b251 100644 --- a/src/lxc/lxccontainer.c +++ b/src/lxc/lxccontainer.c @@ -87,6 +87,9 @@ lxc_log_define(lxccontainer, lxc); +typedef bool (*func_is_io_stat_read)(const char *value); +typedef bool (*func_is_io_stat_write)(const char *value); + static bool do_lxcapi_destroy(struct lxc_container *c); static const char *lxcapi_get_config_path(struct lxc_container *c); #define do_lxcapi_get_config_path(c) lxcapi_get_config_path(c) @@ -5768,6 +5771,26 @@ static uint64_t metrics_get_ull(struct lxc_container *c, struct cgroup_ops *cgro return val; } +static uint64_t metrics_get_ull_with_max(struct lxc_container *c, struct cgroup_ops *cgroup_ops, const char *item) +{ + char buf[80] = {0}; + int len = 0; + uint64_t val = 0; + + len = cgroup_ops->get(cgroup_ops, item, buf, sizeof(buf) - 1, c->name, c->config_path); /* keep the last zeroed byte as NUL terminator */ + if (len <= 0) { + DEBUG("unable to read cgroup item %s", item); + return 0; + } + + if (strncmp(buf, "max", 3) == 0) { /* prefix match: the value read may carry a trailing newline - TODO confirm against get() */ + return UINT64_MAX; /* saturate to the return type's maximum, not unsigned long's */ + } + + val = strtoull(buf, NULL, 0); + return val; +} + static inline bool is_blk_metrics_read(const char *value) { return strcmp(value, "Read") == 0; @@ -5826,6 +5849,60 @@ err_out: return; } +static void metrics_get_io_stats_v2(struct lxc_container *c,
struct cgroup_ops *cgroup_ops, const char *item, struct lxc_blkio_metrics *stats, func_is_io_stat_read is_io_stat_read, func_is_io_stat_write is_io_stat_write) +{ +#define BUFSIZE 4096 + char buf[BUFSIZE] = {0}; + int i = 0; + int j = 0; + int len = 0; + char **lines = NULL; + char **cols = NULL; + char **kv = NULL; + + len = cgroup_ops->get(cgroup_ops, item, buf, sizeof(buf) - 1, c->name, c->config_path); /* keep the last zeroed byte as NUL terminator */ + if (len <= 0) { + DEBUG("unable to read cgroup item %s", item); + return; + } + + lines = lxc_string_split_and_trim(buf, '\n'); + if (lines == NULL) { + return; + } + + (void)memset(stats, 0, sizeof(struct lxc_blkio_metrics)); + // line example: + // 259:0 rbytes=0 wbytes=12288 rios=0 wios=4 dbytes=0 dios=0 + for (i = 0; lines[i]; i++) { + cols = lxc_string_split_and_trim(lines[i], ' '); + if (cols == NULL || lxc_array_len((void **)cols) < 2) { + lxc_free_array((void **)cols, free); /* a row with fewer than two columns must not leak its array */ + goto err_out; + } + len = lxc_array_len((void **)cols); + for (j = 1; j < len; j++) { + kv = lxc_string_split(cols[j], '='); + if (kv == NULL || lxc_array_len((void **)kv) != 2) { + lxc_free_array((void **)kv, free); + continue; + } + if (is_io_stat_read(kv[0])) { + stats->read += strtoull(kv[1], NULL, 0); + } else if (is_io_stat_write(kv[0])) { + stats->write += strtoull(kv[1], NULL, 0); + } + lxc_free_array((void **)kv, free); + } + lxc_free_array((void **)cols, free); + } + + stats->total = stats->read + stats->write; + +err_out: + lxc_free_array((void **)lines, free); +} + static uint64_t metrics_match_get_ull(struct lxc_container *c, struct cgroup_ops *cgroup_ops, const char *item, const char *match, int column) { #define BUFSIZE 4096 @@ -5874,6 +5951,54 @@ err_out: return val; } +static bool is_io_stat_rbytes(const char *value) +{ + return strcmp(value, "rbytes") == 0; +} + +static bool is_io_stat_wbytes(const char *value) +{ + return strcmp(value, "wbytes") == 0; +} + +static bool is_io_stat_rios(const char *value) +{ + return strcmp(value, "rios") == 0; +} + +static bool is_io_stat_wios(const char *value)
+{ + return strcmp(value, "wios") == 0; +} + +static bool unified_metrics_get(struct lxc_container *c, struct cgroup_ops *cgroup_ops, struct lxc_container_metrics *metrics) +{ + // cpu + metrics->cpu_use_nanos = metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "usage_usec", 1) * 1000; + metrics->cpu_use_user = metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "user_usec", 1) * 1000; + metrics->cpu_use_sys = metrics_match_get_ull(c, cgroup_ops, "cpu.stat", "system_usec", 1) * 1000; + + // io + metrics_get_io_stats_v2(c, cgroup_ops, "io.stat", &metrics->io_service_bytes, is_io_stat_rbytes, is_io_stat_wbytes); + metrics_get_io_stats_v2(c, cgroup_ops, "io.stat", &metrics->io_serviced, is_io_stat_rios, is_io_stat_wios); + + // memory + metrics->mem_used = metrics_get_ull(c, cgroup_ops, "memory.current"); + metrics->mem_limit = metrics_get_ull_with_max(c, cgroup_ops, "memory.max"); + metrics->inactive_file_total = metrics_match_get_ull(c, cgroup_ops, "memory.stat", "inactive_file", 1); + metrics->cache = metrics_match_get_ull(c, cgroup_ops, "memory.stat", "file", 1); + metrics->cache_total = metrics->cache; + + // cgroup v2 does not support kernel memory + metrics->kmem_used = 0; + metrics->kmem_limit = 0; + + // pids + metrics->pids_current = metrics_get_ull(c, cgroup_ops, "pids.current"); + + return true; +} + /* isulad add get container metrics */ static bool do_lxcapi_get_container_metrics(struct lxc_container *c, struct lxc_container_metrics *metrics) { @@ -5897,6 +6022,10 @@ static bool do_lxcapi_get_container_metrics(struct lxc_container *c, struct lxc return false; } + if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) { + return unified_metrics_get(c, cgroup_ops, metrics); + } + metrics->cpu_use_nanos = metrics_get_ull(c, cgroup_ops, "cpuacct.usage"); metrics->pids_current = metrics_get_ull(c, cgroup_ops, "pids.current"); -- 2.20.1