iSulad/0025-fix-container-exit-health-check-residue-and-multiple.patch
haozi007 e72b756384 iSulad: sync with upstream iSulad
Signed-off-by: haozi007 <liuhao27@huawei.com>
2021-03-23 09:50:40 +08:00

From 012b3f94279b0c6d193d510aa211b977a38e7c24 Mon Sep 17 00:00:00 2001
From: wujing <jing.woo@outlook.com>
Date: Fri, 22 Jan 2021 17:13:16 +0800
Subject: [PATCH 25/53] fix container exit health check residue and multiple
health checks
Signed-off-by: wujing <wujing50@huawei.com>
---
.../executor/container_cb/execution_extend.c | 7 +-
src/daemon/modules/api/container_api.h | 2 +
.../container/container_events_handler.c | 1 -
.../container/health_check/health_check.c | 274 +++++++++++++-----
.../modules/service/service_container.c | 3 +-
test/mocks/health_check_mock.cc | 8 +
test/mocks/health_check_mock.h | 1 +
.../execution_extend/execution_extend_ut.cc | 6 +
8 files changed, 220 insertions(+), 82 deletions(-)
diff --git a/src/daemon/executor/container_cb/execution_extend.c b/src/daemon/executor/container_cb/execution_extend.c
index 40f24d29..01a0466f 100644
--- a/src/daemon/executor/container_cb/execution_extend.c
+++ b/src/daemon/executor/container_cb/execution_extend.c
@@ -532,6 +532,7 @@ static int do_resume_container(container_t *cont)
params.rootpath = cont->root_path;
params.state = cont->state_path;
+
if (runtime_resume(id, cont->runtime, &params)) {
ERROR("Failed to resume container:%s", id);
ret = -1;
@@ -716,7 +717,11 @@ static int do_pause_container(container_t *cont)
params.rootpath = cont->root_path;
params.state = cont->state_path;
+
+ container_stop_health_checks(cont->common_config->id);
+
if (runtime_pause(id, cont->runtime, &params)) {
+ container_update_health_monitor(cont->common_config->id);
ERROR("Failed to pause container:%s", id);
ret = -1;
goto out;
@@ -724,8 +729,6 @@ static int do_pause_container(container_t *cont)
container_state_set_paused(cont->state);
- container_update_health_monitor(cont->common_config->id);
-
if (container_state_to_disk(cont)) {
ERROR("Failed to save container \"%s\" to disk", id);
ret = -1;
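
Note: the net effect of the do_pause_container() hunks above is an ordering change: health checks are stopped before runtime_pause() is called, and the monitor is re-armed only when the pause fails. A minimal, compilable sketch of that ordering follows; the *_stub helpers are hypothetical stand-ins for the iSulad APIs, not their real signatures.

#include <stdio.h>

/* Hypothetical stand-ins for the iSulad calls touched by this hunk. */
static void container_stop_health_checks_stub(const char *id)
{
    printf("stop health checks: %s\n", id);
}

static void container_update_health_monitor_stub(const char *id)
{
    printf("re-arm health monitor: %s\n", id);
}

static int runtime_pause_stub(const char *id)
{
    printf("runtime pause: %s\n", id);
    return 0;
}

/* Ordering introduced by the patch: stop the checker first, and only
 * re-arm the monitor when the pause itself fails. */
static int do_pause_sketch(const char *id)
{
    container_stop_health_checks_stub(id);

    if (runtime_pause_stub(id) != 0) {
        /* the pause failed, so the container keeps running and the
         * health monitor has to come back */
        container_update_health_monitor_stub(id);
        fprintf(stderr, "Failed to pause container:%s\n", id);
        return -1;
    }

    return 0;
}

int main(void)
{
    return do_pause_sketch("demo-container");
}
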
diff --git a/src/daemon/modules/api/container_api.h b/src/daemon/modules/api/container_api.h
index 83216cf3..3b7f2889 100644
--- a/src/daemon/modules/api/container_api.h
+++ b/src/daemon/modules/api/container_api.h
@@ -49,6 +49,8 @@ typedef struct health_check_manager {
pthread_mutex_t mutex;
bool init_mutex;
health_check_monitor_status_t monitor_status;
+ // Used to wait for the health check monitor thread to close
+ bool monitor_exist;
} health_check_manager_t;
typedef struct _container_state_t_ {
diff --git a/src/daemon/modules/container/container_events_handler.c b/src/daemon/modules/container/container_events_handler.c
index 1283f99c..994c11cc 100644
--- a/src/daemon/modules/container/container_events_handler.c
+++ b/src/daemon/modules/container/container_events_handler.c
@@ -156,7 +156,6 @@ static int container_state_changed(container_t *cont, const struct isulad_events
container_state_set_stopped(cont->state, (int)events->exit_status);
container_wait_stop_cond_broadcast(cont);
plugin_event_container_post_stop(cont);
- container_stop_health_checks(cont->common_config->id);
}
auto_remove = !should_restart && cont->hostconfig != NULL && cont->hostconfig->auto_remove;
diff --git a/src/daemon/modules/container/health_check/health_check.c b/src/daemon/modules/container/health_check/health_check.c
index 04467938..c6ccbbf2 100644
--- a/src/daemon/modules/container/health_check/health_check.c
+++ b/src/daemon/modules/container/health_check/health_check.c
@@ -73,24 +73,47 @@ static char *get_health_status(container_state_t *s)
return status;
}
-static void set_health_status(container_state_t *s, const char *new)
+static void set_health_status(container_t *cont, const char *new)
{
- if (s == NULL || new == NULL) {
+ if (cont->state == NULL || new == NULL) {
return;
}
- container_state_lock(s);
- free(s->state->health->status);
- s->state->health->status = util_strdup_s(new);
- container_state_unlock(s);
+
+ container_state_lock(cont->state);
+ free(cont->state->state->health->status);
+ cont->state->state->health->status = util_strdup_s(new);
+ container_state_unlock(cont->state);
+
+ if (container_state_to_disk(cont)) {
+ WARN("Failed to save container \"%s\" to disk", cont->common_config->id);
+ }
}
-static void set_monitor_idle_status(health_check_manager_t *health)
+static void init_monitor_idle_status(health_check_manager_t *health)
{
container_health_check_lock(health);
health->monitor_status = MONITOR_IDLE;
container_health_check_unlock(health);
}
+static int transfer_monitor_idle_status(health_check_manager_t *health)
+{
+ int ret = 0;
+
+ container_health_check_lock(health);
+ // When the monitor status is MONITOR_STOP, it cannot be changed to MONITOR_IDLE
+ if (health->monitor_status == MONITOR_STOP) {
+ ret = -1;
+ goto out;
+ }
+
+ health->monitor_status = MONITOR_IDLE;
+
+out:
+ container_health_check_unlock(health);
+ return ret;
+}
+
static void set_monitor_stop_status(health_check_manager_t *health)
{
container_health_check_lock(health);
@@ -98,11 +121,21 @@ static void set_monitor_stop_status(health_check_manager_t *health)
container_health_check_unlock(health);
}
-static void set_monitor_interval_timeout_status(health_check_manager_t *health)
+static int transfer_monitor_interval_timeout_status(health_check_manager_t *health)
{
+ int ret = 0;
+
container_health_check_lock(health);
+ // When the monitor status is MONITOR_STOP, it cannot be changed to MONITOR_INTERVAL
+ if (health->monitor_status == MONITOR_STOP) {
+ ret = -1;
+ goto out;
+ }
health->monitor_status = MONITOR_INTERVAL;
+
+out:
container_health_check_unlock(health);
+ return ret;
}
static health_check_monitor_status_t get_health_check_monitor_state(health_check_manager_t *health)
@@ -116,18 +149,35 @@ static health_check_monitor_status_t get_health_check_monitor_state(health_check
return ret;
}
-static void close_health_check_monitor(const container_t *cont)
+static void set_monitor_exist_flag(health_check_manager_t *health, bool closed)
+{
+ container_health_check_lock(health);
+ health->monitor_exist = closed;
+ container_health_check_unlock(health);
+}
+
+static bool get_monitor_exist_flag(health_check_manager_t *health)
+{
+ bool ret;
+
+ container_health_check_lock(health);
+ ret = health->monitor_exist;
+ container_health_check_unlock(health);
+
+ return ret;
+}
+
+static void close_health_check_monitor(container_t *cont)
{
if (cont == NULL || cont->health_check == NULL) {
return;
}
- set_monitor_stop_status(cont->health_check);
- set_health_status(cont->state, UNHEALTHY);
-}
-static void open_health_check_monitor(health_check_manager_t *health)
-{
- set_monitor_interval_timeout_status(health);
+ set_monitor_stop_status(cont->health_check);
+ // ensure that the monitor thread has exited
+ while (get_monitor_exist_flag(cont->health_check)) {
+ util_usleep_nointerupt(500);
+ }
}
// Called when the container is being stopped (whether because the health check is
@@ -160,6 +210,7 @@ void health_check_manager_free(health_check_manager_t *health_check)
if (health_check->init_mutex) {
pthread_mutex_destroy(&health_check->mutex);
}
+
free(health_check);
}
@@ -183,6 +234,8 @@ static health_check_manager_t *health_check_manager_new()
health_check->monitor_status = MONITOR_IDLE;
+ health_check->monitor_exist = false;
+
return health_check;
cleanup:
health_check_manager_free(health_check);
@@ -320,6 +373,43 @@ out:
return ret;
}
+static void *stop_container_on_unhealthy(void *arg)
+{
+ int ret = 0;
+ char *container_id = NULL;
+ container_t *cont = NULL;
+
+ if (arg == NULL) {
+ ERROR("Invalid input arguments");
+ return NULL;
+ }
+ container_id = (char *)arg;
+
+ ret = pthread_detach(pthread_self());
+ if (ret != 0) {
+ CRIT("Set thread detach fail");
+ }
+
+ prctl(PR_SET_NAME, "ExitOnUnhealthy");
+
+ cont = containers_store_get(container_id);
+ if (cont == NULL) {
+ ERROR("Failed to get container info");
+ goto out;
+ }
+
+ // kill container when exit on unhealthy flag is set
+ ret = stop_container(cont, 3, true, false);
+ if (ret != 0) {
+ ERROR("Could not stop running container %s, cannot remove", cont->common_config->id);
+ }
+
+out:
+ free(container_id);
+ container_unref(cont);
+ return NULL;
+}
+
static int handle_increment_streak(container_t *cont, int retries)
{
int ret = 0;
@@ -328,18 +418,19 @@ static int handle_increment_streak(container_t *cont, int retries)
health = cont->state->state->health;
health->failing_streak++;
if (health->failing_streak >= retries) {
- set_health_status(cont->state, UNHEALTHY);
+ set_health_status(cont, UNHEALTHY);
if (cont->common_config->config->healthcheck->exit_on_unhealthy) {
- // kill container when exit on unhealthy flag is set
- ret = stop_container(cont, 3, true, false);
- if (ret != 0) {
- isulad_try_set_error_message("Could not stop running container %s, cannot remove",
- cont->common_config->id);
- ERROR("Could not stop running container %s, cannot remove", cont->common_config->id);
+ pthread_t stop_container_tid = { 0 };
+ char *container_id = util_strdup_s(cont->common_config->id);
+ if (pthread_create(&stop_container_tid, NULL, stop_container_on_unhealthy,
+ (void *)container_id)) {
+ free(container_id);
+ ERROR("Failed to create thread to exec health check");
ret = -1;
}
}
}
+
return ret;
}
@@ -442,7 +533,7 @@ static int handle_probe_result(const char *container_id, const defs_health_log_e
if (result->exit_code == EXIT_STATUS_HEALTHY) {
health->failing_streak = 0;
- set_health_status(cont->state, HEALTHY);
+ set_health_status(cont, HEALTHY);
} else {
if (handle_unhealthy_case(cont, result, retries)) {
ERROR("failed to handle unhealthy case");
@@ -457,10 +548,7 @@ static int handle_probe_result(const char *container_id, const defs_health_log_e
// note: event
EVENT("EVENT: {Object: %s, health_status: %s}", cont->common_config->id, current);
}
- if (container_state_to_disk(cont)) {
- ERROR("Failed to save container \"%s\" to disk", cont->common_config->id);
- ret = -1;
- }
+
out:
free(old_state);
free(current);
@@ -499,10 +587,9 @@ static void health_check_exec_success_handle(const container_exec_response *cont
// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
-void *health_check_run(void *arg)
+static void health_check_run(const char *container_id)
{
int ret = 0;
- char *container_id = NULL;
char **cmd_slice = NULL;
char output[REV_BUF_SIZE] = { 0 };
char timebuffer[TIME_STR_SIZE] = { 0 };
@@ -514,13 +601,6 @@ void *health_check_run(void *arg)
defs_health_log_element *result = NULL;
container_config *config = NULL;
- if (arg == NULL) {
- ERROR("Invalid input arguments");
- return NULL;
- }
-
- container_id = util_strdup_s((char *)arg);
-
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
@@ -590,14 +670,10 @@ void *health_check_run(void *arg)
out:
util_free_array(cmd_slice);
- free(container_id);
- container_id = NULL;
free_defs_health_log_element(result);
free_container_exec_request(container_req);
free_container_exec_response(container_res);
container_unref(cont);
- DAEMON_CLEAR_ERRMSG();
- return NULL;
}
// Get a suitable probe implementation for the container's healthcheck configuration.
@@ -623,41 +699,81 @@ static health_probe_t get_probe(const container_t *cont)
}
}
+static bool valid_container_status_for_health_check(const char *container_id)
+{
+ bool bret = true;
+ const char *id = NULL;
+ container_t *cont = NULL;
+
+ cont = containers_store_get(container_id);
+ if (cont == NULL) {
+ ERROR("No such container:%s", container_id);
+ bret = false;
+ goto out;
+ }
+
+ id = cont->common_config->id;
+
+ if (!container_is_running(cont->state)) {
+ ERROR("Container %s is not running.", id);
+ bret = false;
+ goto out;
+ }
+
+ if (container_is_paused(cont->state)) {
+ ERROR("Container %s is paused.", id);
+ bret = false;
+ goto out;
+ }
+
+ if (container_is_restarting(cont->state)) {
+ ERROR("Container %s is restarting.", id);
+ bret = false;
+ goto out;
+ }
+
+out:
+ container_unref(cont);
+ return bret;
+}
+
static int do_monitor_interval(const char *container_id, health_check_manager_t *health_check,
types_timestamp_t *start_timestamp)
{
int ret = 0;
- pthread_t exec_tid = { 0 };
- if (pthread_create(&exec_tid, NULL, health_check_run, (void *)container_id)) {
- ERROR("Failed to create thread to exec health check");
+ if (!valid_container_status_for_health_check(container_id)) {
+ ERROR("Invalid container status for health check");
ret = -1;
goto out;
}
- if (pthread_join(exec_tid, NULL) != 0) {
- ERROR("Failed to run health check thread");
+
+ health_check_run(container_id);
+
+ if (transfer_monitor_idle_status(health_check) != 0) {
ret = -1;
goto out;
}
- if (get_health_check_monitor_state(health_check) == MONITOR_STOP) {
- ret = 0;
- goto out;
- }
- set_monitor_idle_status(health_check);
if (util_get_now_time_stamp(start_timestamp) == false) {
ERROR("Failed to get time stamp");
ret = -1;
goto out;
}
+
out:
return ret;
}
-static int do_monitor_default(int64_t probe_interval, health_check_manager_t *health_check,
+static int do_monitor_default(const char *container_id, int64_t probe_interval, health_check_manager_t *health_check,
const types_timestamp_t *start_timestamp, types_timestamp_t *last_timestamp)
{
int64_t time_interval = 0;
+ if (!valid_container_status_for_health_check(container_id)) {
+ ERROR("Invalid container status for health check");
+ return -1;
+ }
+
if (util_get_now_time_stamp(last_timestamp) == false) {
ERROR("Failed to get time stamp");
return -1;
@@ -668,13 +784,14 @@ static int do_monitor_default(int64_t probe_interval, health_check_manager_t *he
return -1;
}
- if (time_interval >= probe_interval) {
- set_monitor_interval_timeout_status(health_check);
+ if (time_interval >= probe_interval && transfer_monitor_interval_timeout_status(health_check) != 0) {
+ return -1;
}
util_usleep_nointerupt(500);
return 0;
}
+
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
static void *health_check_monitor(void *arg)
@@ -689,14 +806,17 @@ static void *health_check_monitor(void *arg)
ERROR("Container id is empty");
return NULL;
}
- container_id = util_strdup_s((char *)arg);
+
+ container_id = (char *)arg;
+
+ prctl(PR_SET_NAME, "HealthCheck");
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
goto out;
}
-
+ set_monitor_exist_flag(cont->health_check, true);
if (util_get_now_time_stamp(&start_timestamp) == false) {
ERROR("Failed to monitor start time stamp");
goto out;
@@ -704,7 +824,7 @@ static void *health_check_monitor(void *arg)
probe_interval = (cont->common_config->config->healthcheck->interval == 0) ?
DEFAULT_PROBE_INTERVAL :
cont->common_config->config->healthcheck->interval;
- set_monitor_idle_status(cont->health_check);
+
while (true) {
switch (get_health_check_monitor_state(cont->health_check)) {
case MONITOR_STOP:
@@ -712,30 +832,35 @@ static void *health_check_monitor(void *arg)
goto out;
/* fall-through */
case MONITOR_INTERVAL:
- if (do_monitor_interval(container_id, cont->health_check, &start_timestamp)) {
+ if (do_monitor_interval(container_id, cont->health_check, &start_timestamp) != 0) {
goto out;
}
break;
case MONITOR_IDLE:
/* fall-through */
default:
- if (do_monitor_default(probe_interval, cont->health_check, &start_timestamp, &last_timestamp)) {
+ if (do_monitor_default(container_id, probe_interval, cont->health_check,
+ &start_timestamp, &last_timestamp) != 0) {
goto out;
}
break;
}
}
+
out:
free(container_id);
container_id = NULL;
+ // mark the container unhealthy when the monitor stops, for compatibility reasons
+ set_health_status(cont, UNHEALTHY);
+ // clear the flag, indicating that the monitor thread has exited
+ set_monitor_exist_flag(cont->health_check, false);
container_unref(cont);
DAEMON_CLEAR_ERRMSG();
return NULL;
}
// Ensure the health-check monitor is running or not, depending on the current
-// state of the container.
-// Called from monitor.go, with c locked.
+// state of the container. Called from monitor, with c locked.
void container_update_health_monitor(const char *container_id)
{
bool want_running = false;
@@ -746,6 +871,7 @@ void container_update_health_monitor(const char *container_id)
if (container_id == NULL) {
return;
}
+
cont = containers_store_get(container_id);
if (cont == NULL) {
ERROR("Failed to get container info");
@@ -756,13 +882,18 @@ void container_update_health_monitor(const char *container_id)
if (health == NULL) {
goto out;
}
+
probe = get_probe(cont);
- want_running = cont->state->state->running && !cont->state->state->paused && probe != HEALTH_NONE;
+ want_running = container_is_running(cont->state) && !container_is_paused(cont->state) && probe != HEALTH_NONE;
if (want_running) {
- open_health_check_monitor(cont->health_check);
pthread_t monitor_tid = { 0 };
- if (pthread_create(&monitor_tid, NULL, health_check_monitor, (void *)container_id)) {
+ char *cid = util_strdup_s(container_id);
+ // ensure that the previous health check monitor thread has stopped
+ close_health_check_monitor(cont);
+ init_monitor_idle_status(cont->health_check);
+ if (pthread_create(&monitor_tid, NULL, health_check_monitor, (void *)cid)) {
+ free(cid);
ERROR("Failed to create thread to monitor health check...");
goto out;
}
@@ -779,8 +910,7 @@ out:
}
// Reset the health state for a newly-started, restarted or restored container.
-// initHealthMonitor is called from monitor.go and we should never be running
-// two instances at once.
+// initHealthMonitor is called from monitor and we should never be running two instances at once.
// Note: Called with container locked.
void container_init_health_monitor(const char *id)
{
@@ -809,14 +939,9 @@ void container_init_health_monitor(const char *id)
if (get_probe(cont) == HEALTH_NONE) {
goto out;
}
- // This is needed in case we're auto-restarting
- container_stop_health_checks(cont->common_config->id);
- if (cont->state == NULL || cont->state->state == NULL) {
- goto out;
- }
if (cont->state->state->health != NULL) {
- set_health_status(cont->state, HEALTH_STARTING);
+ set_health_status(cont, HEALTH_STARTING);
cont->state->state->health->failing_streak = 0;
} else {
cont->state->state->health = util_common_calloc_s(sizeof(defs_health));
@@ -824,12 +949,7 @@ void container_init_health_monitor(const char *id)
ERROR("out of memory");
goto out;
}
- set_health_status(cont->state, HEALTH_STARTING);
- }
-
- if (container_state_to_disk(cont)) {
- ERROR("Failed to save container \"%s\" to disk", id);
- goto out;
+ set_health_status(cont, HEALTH_STARTING);
}
container_update_health_monitor(id);
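
Note: the core synchronization change in health_check.c is the pairing of the MONITOR_STOP guard in the transfer_* helpers with the new monitor_exist flag: the monitor thread raises the flag when it starts and clears it on exit, and close_health_check_monitor() sets MONITOR_STOP and then waits for the flag to drop, so container_update_health_monitor() never ends up with two monitors for one container. Below is a stripped-down sketch of that handshake using plain pthreads; the types and helper names are simplified stand-ins, not the iSulad ones.

#include <pthread.h>
#include <stdbool.h>
#include <unistd.h>

/* Trimmed stand-in for health_check_manager_t: one mutex, the monitor
 * status, and the new monitor_exist flag. */
typedef enum { MONITOR_IDLE, MONITOR_INTERVAL, MONITOR_STOP } monitor_status_t;

typedef struct {
    pthread_mutex_t mutex;
    monitor_status_t status;
    bool monitor_exist; /* true while the monitor thread is alive */
} health_manager_t;

/* Mirrors transfer_monitor_idle_status(): refuse to leave MONITOR_STOP. */
static int transfer_idle(health_manager_t *h)
{
    int ret = 0;

    pthread_mutex_lock(&h->mutex);
    if (h->status == MONITOR_STOP) {
        ret = -1;
    } else {
        h->status = MONITOR_IDLE;
    }
    pthread_mutex_unlock(&h->mutex);
    return ret;
}

static void set_exist(health_manager_t *h, bool alive)
{
    pthread_mutex_lock(&h->mutex);
    h->monitor_exist = alive;
    pthread_mutex_unlock(&h->mutex);
}

static bool get_exist(health_manager_t *h)
{
    pthread_mutex_lock(&h->mutex);
    bool alive = h->monitor_exist;
    pthread_mutex_unlock(&h->mutex);
    return alive;
}

static void *monitor_thread(void *arg)
{
    health_manager_t *h = arg;

    set_exist(h, true);  /* announce: monitor is running */
    /* A failed transition back to MONITOR_IDLE means a stop was
     * requested, so the loop exits, as in do_monitor_interval(). */
    while (transfer_idle(h) == 0) {
        /* the health probe would run here */
        usleep(500);
    }
    set_exist(h, false); /* announce: monitor has exited */
    return NULL;
}

/* Mirrors close_health_check_monitor(): request a stop, then wait
 * until the monitor thread clears monitor_exist. */
static void close_monitor(health_manager_t *h)
{
    pthread_mutex_lock(&h->mutex);
    h->status = MONITOR_STOP;
    pthread_mutex_unlock(&h->mutex);

    while (get_exist(h)) {
        usleep(500);
    }
}

int main(void)
{
    health_manager_t h = { .mutex = PTHREAD_MUTEX_INITIALIZER, .status = MONITOR_IDLE, .monitor_exist = false };
    pthread_t tid;

    pthread_create(&tid, NULL, monitor_thread, &h);
    usleep(2000);       /* let the monitor start and raise the flag */
    close_monitor(&h);  /* returns only after the flag has been cleared */
    pthread_join(tid, NULL);
    return 0;
}
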
diff --git a/src/daemon/modules/service/service_container.c b/src/daemon/modules/service/service_container.c
index a4a2414c..e96a94d0 100644
--- a/src/daemon/modules/service/service_container.c
+++ b/src/daemon/modules/service/service_container.c
@@ -1247,8 +1247,6 @@ static int kill_with_signal(container_t *cont, uint32_t signal)
goto out;
}
- container_stop_health_checks(id);
-
ret = send_signal_to_process(cont->state->state->pid, cont->state->state->start_time, stop_signal, signal);
if (ret != 0) {
ERROR("Failed to send signal to container %s with signal %u", id, signal);
@@ -1353,6 +1351,7 @@ int stop_container(container_t *cont, int timeout, bool force, bool restart)
goto out;
}
}
+
out:
if (restart) {
cont->hostconfig->auto_remove = cont->hostconfig->auto_remove_bak;
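
Note: together with the kill_with_signal() change above, stopping a container that crosses the unhealthy threshold now happens on a detached worker thread (stop_container_on_unhealthy) rather than inline in the probe-result path. A minimal sketch of that hand-off follows; do_stop() is a hypothetical stand-in for iSulad's stop_container().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <unistd.h>

/* Hypothetical stand-in for iSulad's stop_container(). */
static int do_stop(const char *id)
{
    printf("stopping unhealthy container %s\n", id);
    return 0;
}

/* Same shape as stop_container_on_unhealthy(): detach, set a thread
 * name, stop the container, free the duplicated id. */
static void *stop_on_unhealthy(void *arg)
{
    char *container_id = arg;

    if (pthread_detach(pthread_self()) != 0) {
        fprintf(stderr, "Set thread detach fail\n");
    }
    prctl(PR_SET_NAME, "ExitOnUnhealthy");

    if (do_stop(container_id) != 0) {
        fprintf(stderr, "Could not stop running container %s\n", container_id);
    }

    free(container_id);
    return NULL;
}

/* Caller side, as in handle_increment_streak(): duplicate the id so the
 * worker owns it, and free the copy if thread creation fails. */
static int kill_unhealthy(const char *id)
{
    pthread_t tid;
    char *copy = strdup(id);

    if (copy == NULL) {
        return -1;
    }
    if (pthread_create(&tid, NULL, stop_on_unhealthy, copy) != 0) {
        free(copy);
        return -1;
    }
    return 0;
}

int main(void)
{
    int ret = kill_unhealthy("demo-container");

    usleep(10000); /* give the detached worker time to run before exit */
    return ret;
}
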
diff --git a/test/mocks/health_check_mock.cc b/test/mocks/health_check_mock.cc
index 879e4d9c..4347a04e 100644
--- a/test/mocks/health_check_mock.cc
+++ b/test/mocks/health_check_mock.cc
@@ -31,3 +31,11 @@ void container_update_health_monitor(const char *container_id)
}
return;
}
+
+void container_stop_health_checks(const char *container_id)
+{
+ if (g_health_check_mock != nullptr) {
+ return g_health_check_mock->ContainerStopHealthCheck(container_id);
+ }
+ return;
+}
\ No newline at end of file
diff --git a/test/mocks/health_check_mock.h b/test/mocks/health_check_mock.h
index 7891f53c..ab8e20b0 100644
--- a/test/mocks/health_check_mock.h
+++ b/test/mocks/health_check_mock.h
@@ -22,6 +22,7 @@
class MockHealthCheck {
public:
MOCK_METHOD1(UpdateHealthMonitor, void(const char *container_id));
+ MOCK_METHOD1(ContainerStopHealthCheck, void(const char *container_id));
};
void MockHealthCheck_SetMock(MockHealthCheck* mock);
diff --git a/test/services/execution/execute/execution_extend/execution_extend_ut.cc b/test/services/execution/execute/execution_extend/execution_extend_ut.cc
index 2dc67814..03872340 100644
--- a/test/services/execution/execute/execution_extend/execution_extend_ut.cc
+++ b/test/services/execution/execute/execution_extend/execution_extend_ut.cc
@@ -204,6 +204,11 @@ void invokeStateSetPaused(container_state_t *s)
return;
}
+void invokeContainerStopHealthCheck(const char *container_id)
+{
+ return;
+}
+
TEST_F(ExecutionExtendUnitTest, test_container_extend_callback_init_pause)
{
service_container_callback_t cb;
@@ -220,6 +225,7 @@ TEST_F(ExecutionExtendUnitTest, test_container_extend_callback_init_pause)
EXPECT_CALL(m_containerState, IsRestarting(_)).WillRepeatedly(Invoke(invokeIsRestarting));
EXPECT_CALL(m_containerUnix, ContainerToDisk(_)).WillRepeatedly(Invoke(invokeContainerToDisk));
EXPECT_CALL(m_containerUnix, ContainerStateToDisk(_)).WillRepeatedly(Invoke(invokeContainerStateToDisk));
+ EXPECT_CALL(m_healthCheck, ContainerStopHealthCheck(_)).WillRepeatedly(Invoke(invokeContainerStopHealthCheck));
container_extend_callback_init(&cb);
ASSERT_EQ(cb.pause(request, &response), 0);
testing::Mock::VerifyAndClearExpectations(&m_runtime);
--
2.25.1