iSulad/0025-fix-container-exit-health-check-residue-and-multiple.patch
haozi007 e72b756384 iSulad: sync with upstream iSulad
Signed-off-by: haozi007 <liuhao27@huawei.com>
2021-03-23 09:50:40 +08:00

From 012b3f94279b0c6d193d510aa211b977a38e7c24 Mon Sep 17 00:00:00 2001
From: wujing <jing.woo@outlook.com>
Date: Fri, 22 Jan 2021 17:13:16 +0800
Subject: [PATCH 25/53] fix container exit health check residue and multiple
health checks
Signed-off-by: wujing <wujing50@huawei.com>
---
.../executor/container_cb/execution_extend.c | 7 +-
src/daemon/modules/api/container_api.h | 2 +
.../container/container_events_handler.c | 1 -
.../container/health_check/health_check.c | 274 +++++++++++++-----
.../modules/service/service_container.c | 3 +-
test/mocks/health_check_mock.cc | 8 +
test/mocks/health_check_mock.h | 1 +
.../execution_extend/execution_extend_ut.cc | 6 +
8 files changed, 220 insertions(+), 82 deletions(-)
diff --git a/src/daemon/executor/container_cb/execution_extend.c b/src/daemon/executor/container_cb/execution_extend.c
index 40f24d29..01a0466f 100644
--- a/src/daemon/executor/container_cb/execution_extend.c
+++ b/src/daemon/executor/container_cb/execution_extend.c
@@ -532,6 +532,7 @@ static int do_resume_container(container_t *cont)
params.rootpath = cont->root_path;
params.state = cont->state_path;
+
if (runtime_resume(id, cont->runtime, &params)) {
ERROR("Failed to resume container:%s", id);
ret = -1;
@@ -716,7 +717,11 @@ static int do_pause_container(container_t *cont)
params.rootpath = cont->root_path;
params.state = cont->state_path;
+
+ container_stop_health_checks(cont->common_config->id);
+
if (runtime_pause(id, cont->runtime, &params)) {
+ container_update_health_monitor(cont->common_config->id);
ERROR("Failed to pause container:%s", id);
ret = -1;
goto out;
@@ -724,8 +729,6 @@ static int do_pause_container(container_t *cont)
container_state_set_paused(cont->state);
- container_update_health_monitor(cont->common_config->id);
-
if (container_state_to_disk(cont)) {
ERROR("Failed to save container \"%s\" to disk", id);
ret = -1;
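
Note: the net effect of the do_pause_container() hunks above is an ordering change: health checks are stopped before runtime_pause() is called, and the monitor is re-armed only when the pause fails. A minimal, compilable sketch of that ordering follows; the *_stub helpers are hypothetical stand-ins for the iSulad APIs, not their real signatures.

#include <stdio.h>

/* Hypothetical stand-ins for the iSulad calls touched by this hunk. */
static void container_stop_health_checks_stub(const char *id)
{
    printf("stop health checks: %s\n", id);
}

static void container_update_health_monitor_stub(const char *id)
{
    printf("re-arm health monitor: %s\n", id);
}

static int runtime_pause_stub(const char *id)
{
    printf("runtime pause: %s\n", id);
    return 0;
}

/* Ordering introduced by the patch: stop the checker first, and only
 * re-arm the monitor when the pause itself fails. */
static int do_pause_sketch(const char *id)
{
    container_stop_health_checks_stub(id);

    if (runtime_pause_stub(id) != 0) {
        /* the pause failed, so the container keeps running and the
         * health monitor has to come back */
        container_update_health_monitor_stub(id);
        fprintf(stderr, "Failed to pause container:%s\n", id);
        return -1;
    }

    return 0;
}

int main(void)
{
    return do_pause_sketch("demo-container");
}
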
diff --git a/src/daemon/modules/api/container_api.h b/src/daemon/modules/api/container_api.h
index 83216cf3..3b7f2889 100644
--- a/src/daemon/modules/api/container_api.h
+++ b/src/daemon/modules/api/container_api.h
@@ -49,6 +49,8 @@ typedef struct health_check_manager {
pthread_mutex_t mutex;
bool init_mutex;
health_check_monitor_status_t monitor_status;
+ // Used to wait for the health check monitor thread to close
+ bool monitor_exist;
} health_check_manager_t;
typedef struct _container_state_t_ {
diff --git a/src/daemon/modules/container/container_events_handler.c b/src/daemon/modules/container/container_events_handler.c
index 1283f99c..994c11cc 100644
--- a/src/daemon/modules/container/container_events_handler.c
+++ b/src/daemon/modules/container/container_events_handler.c
@@ -156,7 +156,6 @@ static int container_state_changed(container_t *cont, const struct isulad_events
container_state_set_stopped(cont->state, (int)events->exit_status);
container_wait_stop_cond_broadcast(cont);
plugin_event_container_post_stop(cont);
- container_stop_health_checks(cont->common_config->id);
}
auto_remove = !should_restart && cont->hostconfig != NULL && cont->hostconfig->auto_remove;
diff --git a/src/daemon/modules/container/health_check/health_check.c b/src/daemon/modules/container/health_check/health_check.c
index 04467938..c6ccbbf2 100644
--- a/src/daemon/modules/container/health_check/health_check.c
+++ b/src/daemon/modules/container/health_check/health_check.c
@@ -73,24 +73,47 @@ static char *get_health_status(container_state_t *s)
return status;
}
-static void set_health_status(container_state_t *s, const char *new)
+static void set_health_status(container_t *cont, const char *new)
{
- if (s == NULL || new == NULL) {
+ if (cont->state == NULL || new == NULL) {
return;
}
- container_state_lock(s);
- free(s->state->health->status);
- s->state->health->status = util_strdup_s(new);
- container_state_unlock(s);
+
+ container_state_lock(cont->state);
+ free(cont->state->state->health->status);
+ cont->state->state->health->status = util_strdup_s(new);
+ container_state_unlock(cont->state);
+
+ if (container_state_to_disk(cont)) {
+ WARN("Failed to save container \"%s\" to disk", cont->common_config->id);
+ }
}
-static void set_monitor_idle_status(health_check_manager_t *health)
+static void init_monitor_idle_status(health_check_manager_t *health)
{
container_health_check_lock(health);
health->monitor_status = MONITOR_IDLE;
container_health_check_unlock(health);
}
+static int transfer_monitor_idle_status(health_check_manager_t *health)
+{
+ int ret = 0;
+
+ container_health_check_lock(health);
+ // When the monitor status is MONITOR_STOP, it cannot be changed to MONITOR_IDLE
+ if (health->monitor_status == MONITOR_STOP) {
+ ret = -1;
+ goto out;
+ }
+
+ health->monitor_status = MONITOR_IDLE;
+
+out:
+ container_health_check_unlock(health);
+ return ret;
+}
+
static void set_monitor_stop_status(health_check_manager_t *health)
{
container_health_check_lock(health);
@@ -98,11 +121,21 @@ static void set_monitor_stop_status(health_check_manager_t *health)
container_health_check_unlock(health);
}
-static void set_monitor_interval_timeout_status(health_check_manager_t *health)
+static int transfer_monitor_interval_timeout_status(health_check_manager_t *health)
{
+ int ret = 0;
+
container_health_check_lock(health);
+ // When the monitor status is MONITOR_STOP, it cannot be changed to MONITOR_INTERVAL
+ if (health->monitor_status == MONITOR_STOP) {
+ ret = -1;
+ goto out;
+ }
health->monitor_status = MONITOR_INTERVAL;
+
+out:
container_health_check_unlock(health);
+ return ret;
}
static health_check_monitor_status_t get_health_check_monitor_state(health_check_manager_t *health)
@@ -116,18 +149,35 @@ static health_check_monitor_status_t get_health_check_monitor_state(health_check
return ret;
}
-static void close_health_check_monitor(const container_t *cont)
+static void set_monitor_exist_flag(health_check_manager_t *health, bool closed)
+{
+ container_health_check_lock(health);
+ health->monitor_exist = closed;
+ container_health_check_unlock(health);
+}
+
+static bool get_monitor_exist_flag(health_check_manager_t *health)
+{
+ bool ret;
+
+ container_health_check_lock(health);
+ ret = health->monitor_exist;
+ container_health_check_unlock(health);
+
+ return ret;
+}
+
+static void close_health_check_monitor(container_t *cont)
{
if (cont == NULL || cont->health_check == NULL) {
return;
}
- set_monitor_stop_status(cont->health_check);
- set_health_status(cont->state, UNHEALTHY);
-}
-static void open_health_check_monitor(health_check_manager_t *health)
-{
- set_monitor_interval_timeout_status(health);
+ set_monitor_stop_status(cont->health_check);
+ // ensure that the monitor thread has exited
+ while (get_monitor_exist_flag(cont->health_check)) {
+ util_usleep_nointerupt(500);
+ }
}
// Called when the container is being stopped (whether because the health check is
@@ -160,6 +210,7 @@ void health_check_manager_free(health_check_manager_t *health_check)
if (health_check->init_mutex) {
pthread_mutex_destroy(&health_check->mutex);
}
+
free(health_check);
}
@@ -183,6 +234,8 @@ static health_check_manager_t *health_check_manager_new()
health_check->monitor_status = MONITOR_IDLE;
+ health_check->monitor_exist = false;
+
return health_check;
cleanup:
health_check_manager_free(health_check);
@@ -320,6 +373,43 @@ out:
return ret;
}
+static void *stop_container_on_unhealthy(void *arg)
+{
+ int ret = 0;
+ char *container_id = NULL;
+ container_t *cont = NULL;
+
+ if (arg == NULL) {
+ ERROR("Invalid input arguments");
+ return NULL;
+ }
+ container_id = (char *)arg;
+
+ ret = pthread_detach(pthread_self());
+ if (ret != 0) {
+ CRIT("Set thread detach fail");
+ }
+
+ prctl(PR_SET_NAME, "ExitOnUnhealthy");
+
+ cont = containers_store_get(container_id);
+ if (cont == NULL) {
+ ERROR("Failed to get container info");
+ goto out;
+ }
+
+ // kill container when exit on unhealthy flag is set
+ ret = stop_container(cont, 3, true, false);
+ if (ret != 0) {
+ ERROR("Could not stop running container %s, cannot remove", cont->common_config->id);
+ }
+
+out:
+ free(container_id);
+ container_unref(cont);
+ return NULL;
+}
+
static int handle_increment_streak(container_t *cont, int retries)
{
int ret = 0;
@@ -328,18 +418,19 @@ static int handle_increment_streak(container_t *cont, int retries)
health = cont->state->state->health;
health->failing_streak++;
if (health->failing_streak >= retries) {
- set_health_status(cont->state, UNHEALTHY);
+ set_health_status(cont, UNHEALTHY);
if (cont->common_config->config->healthcheck->exit_on_unhealthy) {
- // kill container when exit on unhealthy flag is set
- ret = stop_container(cont, 3, true, false);
- if (ret != 0) {
- isulad_try_set_error_message("Could not stop running container %s, cannot remove",
- cont->common_config->id);
- ERROR("Could not stop running container %s, cannot remove", cont->common_config->id);
+ pthread_t stop_container_tid = { 0 };
+ char *container_id = util_strdup_s(cont->common_config->id);
+ if (pthread_create(&stop_container_tid, NULL, stop_container_on_unhealthy,
+ (void *)container_id)) {
+ free(container_id);
+ ERROR("Failed to create thread to exec health check");
ret = -1;
}
}
}
+
return ret;
}
@@ -442,7 +533,7 @@ static int handle_probe_result(const char *container_id, const defs_health_log_e
if (result->exit_code == EXIT_STATUS_HEALTHY) {
health->failing_streak = 0;
- set_health_status(cont->state, HEALTHY);
+ set_health_status(cont, HEALTHY);
} else {
if (handle_unhealthy_case(cont, result, retries)) {
ERROR("failed to handle unhealthy case");
@@ -457,10 +548,7 @@ static int handle_probe_result(const char *container_id, const defs_health_log_e
// note: event
EVENT("EVENT: {Object: %s, health_status: %s}", cont->common_config->id, current);
}
- if (container_state_to_disk(cont)) {
- ERROR("Failed to save container \"%s\" to disk", cont->common_config->id);
- ret = -1;
- }
+
out:
free(old_state);
free(current);
@@ -499,10 +587,9 @@ static void health_check_exec_success_handle(const container_exec_response *cont
// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
-void *health_check_run(void *arg)
+static void health_check_run(const char *container_id)
{
int ret = 0;
- char *container_id = NULL;
char **cmd_slice = NULL;
char output[REV_BUF_SIZE] = { 0 };
char timebuffer[TIME_STR_SIZE] = { 0 };
@@ -514,13 +601,6 @@ void *health_check_run(void *arg)
defs_health_log_element *result = NULL;
container_config *config = NULL;
- if (arg == NULL) {
- ERROR("Invalid input arguments");
- return NULL;
- }
-
- container_id = util_strdup_s((char *)arg);
-
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
@@ -590,14 +670,10 @@ void *health_check_run(void *arg)
out:
util_free_array(cmd_slice);
- free(container_id);
- container_id = NULL;
free_defs_health_log_element(result);
free_container_exec_request(container_req);
free_container_exec_response(container_res);
container_unref(cont);
- DAEMON_CLEAR_ERRMSG();
- return NULL;
}
// Get a suitable probe implementation for the container's healthcheck configuration.
@@ -623,41 +699,81 @@ static health_probe_t get_probe(const container_t *cont)
}
}
+static bool valid_container_status_for_health_check(const char *container_id)
+{
+ bool bret = true;
+ const char *id = NULL;
+ container_t *cont = NULL;
+
+ cont = containers_store_get(container_id);
+ if (cont == NULL) {
+ ERROR("No such container:%s", container_id);
+ bret = false;
+ goto out;
+ }
+
+ id = cont->common_config->id;
+
+ if (!container_is_running(cont->state)) {
+ ERROR("Container %s is not running.", id);
+ bret = false;
+ goto out;
+ }
+
+ if (container_is_paused(cont->state)) {
+ ERROR("Container %s is paused.", id);
+ bret = false;
+ goto out;
+ }
+
+ if (container_is_restarting(cont->state)) {
+ ERROR("Container %s is restarting.", id);
+ bret = false;
+ goto out;
+ }
+
+out:
+ container_unref(cont);
+ return bret;
+}
+
static int do_monitor_interval(const char *container_id, health_check_manager_t *health_check,
types_timestamp_t *start_timestamp)
{
int ret = 0;
- pthread_t exec_tid = { 0 };
- if (pthread_create(&exec_tid, NULL, health_check_run, (void *)container_id)) {
- ERROR("Failed to create thread to exec health check");
+ if (!valid_container_status_for_health_check(container_id)) {
+ ERROR("Invalid container status for health check");
ret = -1;
goto out;
}
- if (pthread_join(exec_tid, NULL) != 0) {
- ERROR("Failed to run health check thread");
+
+ health_check_run(container_id);
+
+ if (transfer_monitor_idle_status(health_check) != 0) {
ret = -1;
goto out;
}
- if (get_health_check_monitor_state(health_check) == MONITOR_STOP) {
- ret = 0;
- goto out;
- }
- set_monitor_idle_status(health_check);
if (util_get_now_time_stamp(start_timestamp) == false) {
ERROR("Failed to get time stamp");
ret = -1;
goto out;
}
+
out:
return ret;
}
-static int do_monitor_default(int64_t probe_interval, health_check_manager_t *health_check,
+static int do_monitor_default(const char *container_id, int64_t probe_interval, health_check_manager_t *health_check,
const types_timestamp_t *start_timestamp, types_timestamp_t *last_timestamp)
{
int64_t time_interval = 0;
+ if (!valid_container_status_for_health_check(container_id)) {
+ ERROR("Invalid container status for health check");
+ return -1;
+ }
+
if (util_get_now_time_stamp(last_timestamp) == false) {
ERROR("Failed to get time stamp");
return -1;
@@ -668,13 +784,14 @@ static int do_monitor_default(int64_t probe_interval, health_check_manager_t *he
return -1;
}
- if (time_interval >= probe_interval) {
- set_monitor_interval_timeout_status(health_check);
+ if (time_interval >= probe_interval && transfer_monitor_interval_timeout_status(health_check) != 0) {
+ return -1;
}
util_usleep_nointerupt(500);
return 0;
}
+
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
static void *health_check_monitor(void *arg)
@@ -689,14 +806,17 @@ static void *health_check_monitor(void *arg)
ERROR("Container id is empty");
return NULL;
}
- container_id = util_strdup_s((char *)arg);
+
+ container_id = (char *)arg;
+
+ prctl(PR_SET_NAME, "HealthCheck");
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
goto out;
}
-
+ set_monitor_exist_flag(cont->health_check, true);
if (util_get_now_time_stamp(&start_timestamp) == false) {
ERROR("Failed to monitor start time stamp");
goto out;
@@ -704,7 +824,7 @@ static void *health_check_monitor(void *arg)
probe_interval = (cont->common_config->config->healthcheck->interval == 0) ?
DEFAULT_PROBE_INTERVAL :
cont->common_config->config->healthcheck->interval;
- set_monitor_idle_status(cont->health_check);
+
while (true) {
switch (get_health_check_monitor_state(cont->health_check)) {
case MONITOR_STOP:
@@ -712,30 +832,35 @@ static void *health_check_monitor(void *arg)
goto out;
/* fall-through */
case MONITOR_INTERVAL:
- if (do_monitor_interval(container_id, cont->health_check, &start_timestamp)) {
+ if (do_monitor_interval(container_id, cont->health_check, &start_timestamp) != 0) {
goto out;
}
break;
case MONITOR_IDLE:
/* fall-through */
default:
- if (do_monitor_default(probe_interval, cont->health_check, &start_timestamp, &last_timestamp)) {
+ if (do_monitor_default(container_id, probe_interval, cont->health_check,
+ &start_timestamp, &last_timestamp) != 0) {
goto out;
}
break;
}
}
+
out:
free(container_id);
container_id = NULL;
+ // mark the container unhealthy when the monitor stops, for compatibility reasons
+ set_health_status(cont, UNHEALTHY);
+ // clear the flag, indicating that the monitor thread has exited
+ set_monitor_exist_flag(cont->health_check, false);
container_unref(cont);
DAEMON_CLEAR_ERRMSG();
return NULL;
}
// Ensure the health-check monitor is running or not, depending on the current
-// state of the container.
-// Called from monitor.go, with c locked.
+// state of the container. Called from monitor, with c locked.
void container_update_health_monitor(const char *container_id)
{
bool want_running = false;
@@ -746,6 +871,7 @@ void container_update_health_monitor(const char *container_id)
if (container_id == NULL) {
return;
}
+
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
@@ -756,13 +882,18 @@ void container_update_health_monitor(const char *container_id)
if (health == NULL) {
goto out;
}
+
probe = get_probe(cont);
- want_running = cont->state->state->running && !cont->state->state->paused && probe != HEALTH_NONE;
+ want_running = container_is_running(cont->state) && !container_is_paused(cont->state) && probe != HEALTH_NONE;
if (want_running) {
- open_health_check_monitor(cont->health_check);
pthread_t monitor_tid = { 0 };
- if (pthread_create(&monitor_tid, NULL, health_check_monitor, (void *)container_id)) {
+ char *cid = util_strdup_s(container_id);
+ // ensure that the previous health check monitor thread has stopped
+ close_health_check_monitor(cont);
+ init_monitor_idle_status(cont->health_check);
+ if (pthread_create(&monitor_tid, NULL, health_check_monitor, (void *)cid)) {
+ free(cid);
ERROR("Failed to create thread to monitor health check...");
goto out;
}
@@ -779,8 +910,7 @@ out:
}
// Reset the health state for a newly-started, restarted or restored container.
-// initHealthMonitor is called from monitor.go and we should never be running
-// two instances at once.
+// initHealthMonitor is called from monitor and we should never be running two instances at once.
// Note: Called with container locked.
void container_init_health_monitor(const char *id)
{
@@ -809,14 +939,9 @@ void container_init_health_monitor(const char *id)
if (get_probe(cont) == HEALTH_NONE) {
goto out;
}
- // This is needed in case we're auto-restarting
- container_stop_health_checks(cont->common_config->id);
- if (cont->state == NULL || cont->state->state == NULL) {
- goto out;
- }
if (cont->state->state->health != NULL) {
- set_health_status(cont->state, HEALTH_STARTING);
+ set_health_status(cont, HEALTH_STARTING);
cont->state->state->health->failing_streak = 0;
} else {
cont->state->state->health = util_common_calloc_s(sizeof(defs_health));
@@ -824,12 +949,7 @@ void container_init_health_monitor(const char *id)
ERROR("out of memory");
goto out;
}
- set_health_status(cont->state, HEALTH_STARTING);
- }
-
- if (container_state_to_disk(cont)) {
- ERROR("Failed to save container \"%s\" to disk", id);
- goto out;
+ set_health_status(cont, HEALTH_STARTING);
}
container_update_health_monitor(id);
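
Note: the core synchronization change in health_check.c is the pairing of the MONITOR_STOP guard in the transfer_* helpers with the new monitor_exist flag: the monitor thread raises the flag when it starts and clears it on exit, and close_health_check_monitor() sets MONITOR_STOP and then waits for the flag to drop, so container_update_health_monitor() never ends up with two monitors for one container. Below is a stripped-down sketch of that handshake using plain pthreads; the types and helper names are simplified stand-ins, not the iSulad ones.

#include <pthread.h>
#include <stdbool.h>
#include <unistd.h>

/* Trimmed stand-in for health_check_manager_t: one mutex, the monitor
 * status, and the new monitor_exist flag. */
typedef enum { MONITOR_IDLE, MONITOR_INTERVAL, MONITOR_STOP } monitor_status_t;

typedef struct {
    pthread_mutex_t mutex;
    monitor_status_t status;
    bool monitor_exist; /* true while the monitor thread is alive */
} health_manager_t;

/* Mirrors transfer_monitor_idle_status(): refuse to leave MONITOR_STOP. */
static int transfer_idle(health_manager_t *h)
{
    int ret = 0;

    pthread_mutex_lock(&h->mutex);
    if (h->status == MONITOR_STOP) {
        ret = -1;
    } else {
        h->status = MONITOR_IDLE;
    }
    pthread_mutex_unlock(&h->mutex);
    return ret;
}

static void set_exist(health_manager_t *h, bool alive)
{
    pthread_mutex_lock(&h->mutex);
    h->monitor_exist = alive;
    pthread_mutex_unlock(&h->mutex);
}

static bool get_exist(health_manager_t *h)
{
    pthread_mutex_lock(&h->mutex);
    bool alive = h->monitor_exist;
    pthread_mutex_unlock(&h->mutex);
    return alive;
}

static void *monitor_thread(void *arg)
{
    health_manager_t *h = arg;

    set_exist(h, true);  /* announce: monitor is running */
    /* A failed transition back to MONITOR_IDLE means a stop was
     * requested, so the loop exits, as in do_monitor_interval(). */
    while (transfer_idle(h) == 0) {
        /* the health probe would run here */
        usleep(500);
    }
    set_exist(h, false); /* announce: monitor has exited */
    return NULL;
}

/* Mirrors close_health_check_monitor(): request a stop, then wait
 * until the monitor thread clears monitor_exist. */
static void close_monitor(health_manager_t *h)
{
    pthread_mutex_lock(&h->mutex);
    h->status = MONITOR_STOP;
    pthread_mutex_unlock(&h->mutex);

    while (get_exist(h)) {
        usleep(500);
    }
}

int main(void)
{
    health_manager_t h = { .mutex = PTHREAD_MUTEX_INITIALIZER, .status = MONITOR_IDLE, .monitor_exist = false };
    pthread_t tid;

    pthread_create(&tid, NULL, monitor_thread, &h);
    usleep(2000);       /* let the monitor start and raise the flag */
    close_monitor(&h);  /* returns only after the flag has been cleared */
    pthread_join(tid, NULL);
    return 0;
}
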
diff --git a/src/daemon/modules/service/service_container.c b/src/daemon/modules/service/service_container.c
index a4a2414c..e96a94d0 100644
--- a/src/daemon/modules/service/service_container.c
+++ b/src/daemon/modules/service/service_container.c
@@ -1247,8 +1247,6 @@ static int kill_with_signal(container_t *cont, uint32_t signal)
goto out;
}
- container_stop_health_checks(id);
-
ret = send_signal_to_process(cont->state->state->pid, cont->state->state->start_time, stop_signal, signal);
if (ret != 0) {
ERROR("Failed to send signal to container %s with signal %u", id, signal);
@@ -1353,6 +1351,7 @@ int stop_container(container_t *cont, int timeout, bool force, bool restart)
goto out;
}
}
+
out:
if (restart) {
cont->hostconfig->auto_remove = cont->hostconfig->auto_remove_bak;
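
Note: together with the kill_with_signal() change above, stopping a container that crosses the unhealthy threshold now happens on a detached worker thread (stop_container_on_unhealthy) rather than inline in the probe-result path. A minimal sketch of that hand-off follows; do_stop() is a hypothetical stand-in for iSulad's stop_container().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <unistd.h>

/* Hypothetical stand-in for iSulad's stop_container(). */
static int do_stop(const char *id)
{
    printf("stopping unhealthy container %s\n", id);
    return 0;
}

/* Same shape as stop_container_on_unhealthy(): detach, set a thread
 * name, stop the container, free the duplicated id. */
static void *stop_on_unhealthy(void *arg)
{
    char *container_id = arg;

    if (pthread_detach(pthread_self()) != 0) {
        fprintf(stderr, "Set thread detach fail\n");
    }
    prctl(PR_SET_NAME, "ExitOnUnhealthy");

    if (do_stop(container_id) != 0) {
        fprintf(stderr, "Could not stop running container %s\n", container_id);
    }

    free(container_id);
    return NULL;
}

/* Caller side, as in handle_increment_streak(): duplicate the id so the
 * worker owns it, and free the copy if thread creation fails. */
static int kill_unhealthy(const char *id)
{
    pthread_t tid;
    char *copy = strdup(id);

    if (copy == NULL) {
        return -1;
    }
    if (pthread_create(&tid, NULL, stop_on_unhealthy, copy) != 0) {
        free(copy);
        return -1;
    }
    return 0;
}

int main(void)
{
    int ret = kill_unhealthy("demo-container");

    usleep(10000); /* give the detached worker time to run before exit */
    return ret;
}
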
diff --git a/test/mocks/health_check_mock.cc b/test/mocks/health_check_mock.cc
index 879e4d9c..4347a04e 100644
--- a/test/mocks/health_check_mock.cc
+++ b/test/mocks/health_check_mock.cc
@@ -31,3 +31,11 @@ void container_update_health_monitor(const char *container_id)
}
return;
}
+
+void container_stop_health_checks(const char *container_id)
+{
+ if (g_health_check_mock != nullptr) {
+ return g_health_check_mock->ContainerStopHealthCheck(container_id);
+ }
+ return;
+}
\ No newline at end of file
diff --git a/test/mocks/health_check_mock.h b/test/mocks/health_check_mock.h
index 7891f53c..ab8e20b0 100644
--- a/test/mocks/health_check_mock.h
+++ b/test/mocks/health_check_mock.h
@@ -22,6 +22,7 @@
class MockHealthCheck {
public:
MOCK_METHOD1(UpdateHealthMonitor, void(const char *container_id));
+ MOCK_METHOD1(ContainerStopHealthCheck, void(const char *container_id));
};
void MockHealthCheck_SetMock(MockHealthCheck* mock);
diff --git a/test/services/execution/execute/execution_extend/execution_extend_ut.cc b/test/services/execution/execute/execution_extend/execution_extend_ut.cc
index 2dc67814..03872340 100644
--- a/test/services/execution/execute/execution_extend/execution_extend_ut.cc
+++ b/test/services/execution/execute/execution_extend/execution_extend_ut.cc
@@ -204,6 +204,11 @@ void invokeStateSetPaused(container_state_t *s)
return;
}
+void invokeContainerStopHealthCheck(const char *container_id)
+{
+ return;
+}
+
TEST_F(ExecutionExtendUnitTest, test_container_extend_callback_init_pause)
{
service_container_callback_t cb;
@@ -220,6 +225,7 @@ TEST_F(ExecutionExtendUnitTest, test_container_extend_callback_init_pause)
EXPECT_CALL(m_containerState, IsRestarting(_)).WillRepeatedly(Invoke(invokeIsRestarting));
EXPECT_CALL(m_containerUnix, ContainerToDisk(_)).WillRepeatedly(Invoke(invokeContainerToDisk));
EXPECT_CALL(m_containerUnix, ContainerStateToDisk(_)).WillRepeatedly(Invoke(invokeContainerStateToDisk));
+ EXPECT_CALL(m_healthCheck, ContainerStopHealthCheck(_)).WillRepeatedly(Invoke(invokeContainerStopHealthCheck));
container_extend_callback_init(&cb);
ASSERT_EQ(cb.pause(request, &response), 0);
testing::Mock::VerifyAndClearExpectations(&m_runtime);
--
2.25.1