341 lines
11 KiB
Diff
341 lines
11 KiB
Diff
From 448eb95b45b0cf6ecc7cf1a3e24056a2fdae85bd Mon Sep 17 00:00:00 2001
|
|
From: Yicong Yang <yangyicong@hisilicon.com>
|
|
Date: Fri, 13 Oct 2023 15:21:11 +0800
|
|
Subject: [PATCH] Support initializing HBW nodes from memory_locality
|
|
|
|
In current implementation we mainly infer the HBW nodes from the
|
|
HMAT/SLIT, which may not describe all the cases. For example
|
|
the HMAT/SLIT cannot describe the topology below:
|
|
|
|
[ Node 0 ]
|
|
[ CPU 0-3 ][ CPU 4-7 ]
|
|
| |
|
|
[ HBM 0 ][ HBM 1 ]
|
|
[ Node 1 ][ Node 2 ]
|
|
|
|
CPU 0-7 are in one NUMA node, but CPU 0-3 is closest to HBM 0 while
|
|
CPU 4-7 is closest to HBM 1. Current HMAT/SLIT cannot support this
|
|
case.
|
|
|
|
In order to support this, openeuler has merged a HBM device driver
|
|
to export the topology by sysfs[1]. The description of above topology
|
|
will be like:
|
|
$ cat /sys/kernel/hbm_memory/memory_topo/memory_locality
|
|
1 0-3
|
|
1 4-7
|
|
|
|
This patch cooperates with the HBM device driver to support initializing
|
|
the HBW nodes from memory_locality for memkind. It will try to obtain
|
|
the HBW nodes by parsing the memory_locality first; on failure, or if there
|
|
is no memory_locality on the system, it will fall back to HMAT/SLIT. The user
|
|
can disable this behavior by setting MEMKIND_DISABLE_MEMORY_LOCALITY=1 as well.
|
|
|
|
[1] https://gitee.com/openeuler/kernel/pulls/451
|
|
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
|
|
---
|
|
include/memkind/internal/memkind_bitmask.h | 2 +
|
|
src/memkind_bitmask.c | 185 +++++++++++++++++++++
|
|
src/memkind_hbw.c | 42 +++++
|
|
3 files changed, 229 insertions(+)
|
|
|
|
diff --git a/include/memkind/internal/memkind_bitmask.h b/include/memkind/internal/memkind_bitmask.h
|
|
index 5c5b8434..6b0c3f64 100644
|
|
--- a/include/memkind/internal/memkind_bitmask.h
|
|
+++ b/include/memkind/internal/memkind_bitmask.h
|
|
@@ -12,6 +12,8 @@ extern "C" {
|
|
|
|
typedef int (*get_node_bitmask)(struct bitmask **);
|
|
|
|
+int set_numanode_from_memory_locality(void **numanode,
|
|
+ memkind_node_variant_t node_variant);
|
|
int set_closest_numanode(get_node_bitmask get_bitmask, void **numanode,
|
|
memkind_node_variant_t node_variant);
|
|
int set_bitmask_for_current_numanode(unsigned long *nodemask,
|
|
diff --git a/src/memkind_bitmask.c b/src/memkind_bitmask.c
|
|
index 4f6d9f00..84300395 100644
|
|
--- a/src/memkind_bitmask.c
|
|
+++ b/src/memkind_bitmask.c
|
|
@@ -1,9 +1,11 @@
|
|
// SPDX-License-Identifier: BSD-2-Clause
|
|
/* Copyright (C) 2019 - 2021 Intel Corporation. */
|
|
|
|
+#include <ctype.h>
|
|
#include <errno.h>
|
|
#include <limits.h>
|
|
#include <stdint.h>
|
|
+#include <stdio.h>
|
|
|
|
#include <memkind/internal/memkind_bitmask.h>
|
|
#include <memkind/internal/memkind_log.h>
|
|
@@ -12,6 +14,89 @@
|
|
// Vector of CPUs with memory NUMA Node id(s)
|
|
VEC(vec_cpu_node, int);
|
|
|
|
/*
 * Parse /sys/kernel/hbm_memory/memory_topo/memory_locality and record, for
 * each HBW node, the set of CPUs closest to it.
 *
 * NOTE: "closet" in the function name is a historical typo for "closest";
 * it is kept because callers reference this name.
 *
 * cpunode_mask - array of num_nodes cpu_set_t pointers, each sized for
 *                num_cpu CPUs and already zeroed by the caller; on return
 *                cpunode_mask[n] has a bit set for every CPU listed as
 *                closest to node n.
 * num_cpu      - number of configured CPUs (upper bound for CPU ids).
 * num_nodes    - number of entries in cpunode_mask (upper bound for node ids).
 *
 * If the sysfs file is absent or a line is malformed, parsing stops silently;
 * the caller is expected to validate the resulting masks.
 */
void init_node_closet_cpu(cpu_set_t **cpunode_mask, int num_cpu, int num_nodes)
{
    char *line = NULL;
    size_t len = 0;
    FILE *f;

    /*
     * The content of /sys/kernel/hbm_memory/memory_topo/memory_locality
     * should look like:
     *   2 0-3
     *   3 4-7
     *   4 8-11
     *   ...
     * The 1st column is the HBW node number and the 2nd column is the list
     * of CPUs closest to that HBW node.
     */
    f = fopen("/sys/kernel/hbm_memory/memory_topo/memory_locality", "r");
    if (!f)
        return;

    while (getline(&line, &len, f) != -1) {
        long node, begin_cpu, end_cpu;
        char *begin, *end;

        /* Get the node number first; errno detects strtol range errors */
        errno = 0;
        node = strtol(line, &end, 0);

        /* Either the node number is invalid or the whole line is invalid */
        if (line == end || errno == ERANGE)
            break;

        /* Out-of-range ids would index past cpunode_mask — reject them */
        if (node < 0 || node >= num_nodes) {
            log_err("Invalid node number provided by memory_locality.");
            break;
        }

        /*
         * Find the beginning of the CPU list string. getline() NUL-terminates
         * the buffer, so scanning stops at end of line at the latest.
         */
        while (*end == ' ')
            end++;

        if (!isdigit((unsigned char)*end))
            break;

        begin = end;
        do {
            errno = 0;
            begin_cpu = strtol(begin, &end, 0);
            if (begin == end || errno == ERANGE)
                break;

            /* CPU ids must stay inside the allocated cpu_set_t */
            if (begin_cpu < 0 || begin_cpu >= num_cpu)
                break;

            if (*end == '\0' || *end == '\n') {
                /* End of the line: single trailing CPU */
                CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu),
                          cpunode_mask[node]);
                break;
            } else if (*end == ',') {
                /* Comma-separated single CPU */
                CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu),
                          cpunode_mask[node]);
            } else if (*end == '-' && isdigit((unsigned char)*(++end))) {
                /* CPU range "a-b": set every CPU in [a, b] */
                begin = end;
                errno = 0;
                end_cpu = strtol(begin, &end, 0);
                if (begin == end || errno == ERANGE || end_cpu >= num_cpu)
                    break;

                while (begin_cpu <= end_cpu) {
                    CPU_SET_S(begin_cpu, CPU_ALLOC_SIZE(num_cpu),
                              cpunode_mask[node]);
                    ++begin_cpu;
                }
            } else {
                /* Unexpected separator — abandon this line */
                break;
            }

            begin = end + 1;
        } while (*end != '\0' && *end != '\n');
    }

    free(line);
    fclose(f);
}
|
|
+
|
|
int memkind_env_get_nodemask(char *nodes_env, struct bitmask **bm)
|
|
{
|
|
*bm = numa_parse_nodestring(nodes_env);
|
|
@@ -22,6 +107,106 @@ int memkind_env_get_nodemask(char *nodes_env, struct bitmask **bm)
|
|
return MEMKIND_SUCCESS;
|
|
}
|
|
|
|
+int set_numanode_from_memory_locality(void **numanode,
|
|
+ memkind_node_variant_t node_variant)
|
|
+{
|
|
+ int num_cpu = numa_num_configured_cpus();
|
|
+ int cpuset_size = CPU_ALLOC_SIZE(num_cpu);
|
|
+ int max_node_id = numa_max_node();
|
|
+ cpu_set_t **cpunode_mask;
|
|
+ int init_node, cpu_id;
|
|
+ int status;
|
|
+
|
|
+ cpunode_mask = calloc(max_node_id + 1, sizeof(*cpunode_mask));
|
|
+ if (!cpunode_mask) {
|
|
+ status = MEMKIND_ERROR_MALLOC;
|
|
+ log_err("calloc() failed.");
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ for (init_node = 0; init_node <= max_node_id; init_node++) {
|
|
+ cpunode_mask[init_node] = CPU_ALLOC(num_cpu);
|
|
+ if (!cpunode_mask[init_node]) {
|
|
+ while (init_node >= 0) {
|
|
+ CPU_FREE(cpunode_mask[init_node]);
|
|
+ init_node--;
|
|
+ }
|
|
+
|
|
+ status = MEMKIND_ERROR_MALLOC;
|
|
+ log_err("CPU_ALLOC_SIZE() failed.");
|
|
+ goto free_cpunode_mask;
|
|
+ }
|
|
+
|
|
+ CPU_ZERO_S(cpuset_size, cpunode_mask[init_node]);
|
|
+ }
|
|
+
|
|
+ init_node_closet_cpu(cpunode_mask, num_cpu, max_node_id + 1);
|
|
+
|
|
+ struct vec_cpu_node *node_arr =
|
|
+ (struct vec_cpu_node *)calloc(num_cpu, sizeof(struct vec_cpu_node));
|
|
+ if (!node_arr) {
|
|
+ status = MEMKIND_ERROR_MALLOC;
|
|
+ log_err("calloc() failed.");
|
|
+ goto free_cpunode_mask_array;
|
|
+ }
|
|
+
|
|
+ /* Scan CPUs once. Assuming the CPU number are much more bigger than NUMA Nodes */
|
|
+ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) {
|
|
+ for (init_node = 0; init_node <= max_node_id; init_node++) {
|
|
+ if (CPU_ISSET_S(cpu_id, cpuset_size, cpunode_mask[init_node])) {
|
|
+ VEC_PUSH_BACK(&node_arr[cpu_id], init_node);
|
|
+
|
|
+ /*
|
|
+ * A cpu should always have one closet node, log error if
|
|
+ * violate this.
|
|
+ */
|
|
+ if (node_variant == NODE_VARIANT_SINGLE &&
|
|
+ VEC_SIZE(&node_arr[cpu_id]) > 1) {
|
|
+ log_err("CPU%d has more than one closet node.", cpu_id);
|
|
+ status = MEMKIND_ERROR_RUNTIME;
|
|
+ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) {
|
|
+ if (VEC_CAPACITY(&node_arr[cpu_id]))
|
|
+ VEC_DELETE(&node_arr[cpu_id]);
|
|
+ }
|
|
+
|
|
+ goto free_node_arr;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Sanity Check each node_arr */
|
|
+ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) {
|
|
+ if (VEC_SIZE(&node_arr[cpu_id]) == 0) {
|
|
+ log_err("CPU%d's nodemask is not initialized.", cpu_id);
|
|
+ status = MEMKIND_ERROR_RUNTIME;
|
|
+ for (cpu_id = 0; cpu_id < num_cpu; cpu_id++) {
|
|
+ if (VEC_CAPACITY(&node_arr[cpu_id]))
|
|
+ VEC_DELETE(&node_arr[cpu_id]);
|
|
+ }
|
|
+
|
|
+ goto free_node_arr;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ *numanode = node_arr;
|
|
+ status = MEMKIND_SUCCESS;
|
|
+ goto free_cpunode_mask_array;
|
|
+
|
|
+free_node_arr:
|
|
+ free(node_arr);
|
|
+
|
|
+free_cpunode_mask_array:
|
|
+ for (init_node = 0; init_node <= max_node_id; init_node++)
|
|
+ CPU_FREE(cpunode_mask[init_node]);
|
|
+
|
|
+free_cpunode_mask:
|
|
+ free(cpunode_mask);
|
|
+
|
|
+out:
|
|
+ return status;
|
|
+}
|
|
+
|
|
int set_closest_numanode(get_node_bitmask get_bitmask, void **numanode,
|
|
memkind_node_variant_t node_variant)
|
|
{
|
|
diff --git a/src/memkind_hbw.c b/src/memkind_hbw.c
|
|
index 077660ab..e9948593 100644
|
|
--- a/src/memkind_hbw.c
|
|
+++ b/src/memkind_hbw.c
|
|
@@ -363,10 +363,36 @@ static bool is_hmat_supported(void)
|
|
return true;
|
|
}
|
|
|
|
+/*
|
|
+ * OS may provide further information of HBW topology in
|
|
+ * /sys/kernel/hbm_memory/memory_topo/memory_locality. Use it unless user
|
|
+ * specified HBW nodes or disabled using of memory_locality.
|
|
+ */
|
|
/*
 * Decide whether HBW nodes should be initialized from
 * /sys/kernel/hbm_memory/memory_topo/memory_locality. That source is used
 * unless the user has explicitly listed HBW nodes (MEMKIND_HBW_NODES) or
 * disabled it via MEMKIND_DISABLE_MEMORY_LOCALITY=1.
 */
static bool use_memory_locality(void)
{
    const char *disabled = memkind_get_env("MEMKIND_DISABLE_MEMORY_LOCALITY");

    if (disabled && disabled[0] == '1')
        return false;

    /* An explicit node list from the user always takes precedence */
    return memkind_get_env("MEMKIND_HBW_NODES") == NULL;
}
|
|
+
|
|
static void memkind_hbw_closest_numanode_init(void)
|
|
{
|
|
struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_MULTIPLE];
|
|
g->numanode = NULL;
|
|
+
|
|
+ if (use_memory_locality()) {
|
|
+ g->init_err = set_numanode_from_memory_locality(&g->numanode,
|
|
+ NODE_VARIANT_MULTIPLE);
|
|
+ if (!g->init_err)
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (!is_hmat_supported()) {
|
|
g->init_err = set_closest_numanode(memkind_hbw_get_nodemask,
|
|
&g->numanode, NODE_VARIANT_MULTIPLE);
|
|
@@ -380,6 +406,14 @@ static void memkind_hbw_closest_preferred_numanode_init(void)
|
|
{
|
|
struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_SINGLE];
|
|
g->numanode = NULL;
|
|
+
|
|
+ if (use_memory_locality()) {
|
|
+ g->init_err = set_numanode_from_memory_locality(&g->numanode,
|
|
+ NODE_VARIANT_SINGLE);
|
|
+ if (!g->init_err)
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (!is_hmat_supported()) {
|
|
g->init_err = set_closest_numanode(memkind_hbw_get_nodemask,
|
|
&g->numanode, NODE_VARIANT_SINGLE);
|
|
@@ -393,6 +427,14 @@ static void memkind_hbw_all_numanode_init(void)
|
|
{
|
|
struct hbw_numanode_t *g = &memkind_hbw_numanode_g[NODE_VARIANT_ALL];
|
|
g->numanode = NULL;
|
|
+
|
|
+ if (use_memory_locality()) {
|
|
+ g->init_err = set_numanode_from_memory_locality(&g->numanode,
|
|
+ NODE_VARIANT_ALL);
|
|
+ if (!g->init_err)
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (!is_hmat_supported()) {
|
|
g->init_err = set_closest_numanode(memkind_hbw_get_nodemask,
|
|
&g->numanode, NODE_VARIANT_ALL);
|
|
--
|
|
2.24.0
|
|
|