From 1d918d78398d92f67e4b36f31d64b3531476fcae Mon Sep 17 00:00:00 2001 From: wu-changsheng Date: Sat, 8 Oct 2022 22:27:12 +0800 Subject: [PATCH] adapt ceph client --- 0086-fix-fd-leak.patch | 80 ++ 0087-fix-del-conn-use-after-free.patch | 29 + 0088-init-g_gazelle_errno-before-use.patch | 24 + 0089-code-format-specification.patch | 39 + ...el-event-thread-affinity-same-with-s.patch | 48 + 0091-have_corelist_arg.patch | 77 + 0092-ltran-update-list.patch | 48 + 0093-remove-get_reg_ring_free_count.patch | 37 + 0094-add-errorno-EISCONN.patch | 25 + 0095-fix-sendmsg-data-write-wrong.patch | 29 + 0096-lstack-restore-pci-bus-after-init.patch | 69 + 0097-fix-malloc-rpc-msg-fail.patch | 133 ++ 0098-support-dpdk-dynamic-memory.patch | 64 + ...x-lwip_send-fail-free-pbuf-miss-data.patch | 87 ++ 0100-merger-wakeup.patch | 984 +++++++++++++ 0101-conenct-support-multi-queues.patch | 113 ++ 0102-merge-sendmsg-write.patch | 275 ++++ 0103-add-thread-select-path.patch | 160 +++ 0104-support-conf-control-app-bind-numa.patch | 99 ++ 0105-fix-epoll_wait-cover-kernel-event.patch | 130 ++ ...ead-stack-data-return-0-when-no-data.patch | 45 + 0107-fix-stack-wakeup-node-del.patch | 79 ++ ...oid-useless-stack-check-wakeup-event.patch | 60 + 0109-fix-mesg-loss.patch | 32 + 0110-add-accept4-and-epoll_create1.patch | 232 +++ 0111-refactor-event-notice.patch | 1243 +++++++++++++++++ gazelle.spec | 32 +- 27 files changed, 4272 insertions(+), 1 deletion(-) create mode 100644 0086-fix-fd-leak.patch create mode 100644 0087-fix-del-conn-use-after-free.patch create mode 100644 0088-init-g_gazelle_errno-before-use.patch create mode 100644 0089-code-format-specification.patch create mode 100644 0090-fix-gazelle-kernel-event-thread-affinity-same-with-s.patch create mode 100644 0091-have_corelist_arg.patch create mode 100644 0092-ltran-update-list.patch create mode 100644 0093-remove-get_reg_ring_free_count.patch create mode 100644 0094-add-errorno-EISCONN.patch create mode 100644 0095-fix-sendmsg-data-write-wrong.patch create mode 100644 0096-lstack-restore-pci-bus-after-init.patch create mode 100644 0097-fix-malloc-rpc-msg-fail.patch create mode 100644 0098-support-dpdk-dynamic-memory.patch create mode 100644 0099-fix-lwip_send-fail-free-pbuf-miss-data.patch create mode 100644 0100-merger-wakeup.patch create mode 100644 0101-conenct-support-multi-queues.patch create mode 100644 0102-merge-sendmsg-write.patch create mode 100644 0103-add-thread-select-path.patch create mode 100644 0104-support-conf-control-app-bind-numa.patch create mode 100644 0105-fix-epoll_wait-cover-kernel-event.patch create mode 100644 0106-fix-read-stack-data-return-0-when-no-data.patch create mode 100644 0107-fix-stack-wakeup-node-del.patch create mode 100644 0108-avoid-useless-stack-check-wakeup-event.patch create mode 100644 0109-fix-mesg-loss.patch create mode 100644 0110-add-accept4-and-epoll_create1.patch create mode 100644 0111-refactor-event-notice.patch diff --git a/0086-fix-fd-leak.patch b/0086-fix-fd-leak.patch new file mode 100644 index 0000000..4801357 --- /dev/null +++ b/0086-fix-fd-leak.patch @@ -0,0 +1,80 @@ +From 0e4e3a3357d5a2ffff06bf9a32bba95e207e8ff0 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Tue, 6 Sep 2022 16:23:58 +0800 +Subject: [PATCH 01/21] fix fd leak + +--- + src/lstack/core/lstack_control_plane.c | 6 +++++- + src/ltran/ltran_monitor.c | 4 +++- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index 8012fa6..7f62254 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -713,6 +713,7 @@ void control_server_thread(void *arg) + + int32_t epfd = init_epoll(listenfd); + if (epfd < 0) { ++ posix_api->close_fn(listenfd); + LSTACK_LOG(ERR, LSTACK, "init_epoll failed\n"); + return; + } +@@ -744,7 +745,9 @@ void control_server_thread(void *arg) + + evt_array.data.fd = connfd; + evt_array.events = EPOLLIN; +- posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, connfd, &evt_array); ++ if (posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, connfd, &evt_array) < 0) { ++ posix_api->close_fn(connfd); ++ } + } else { + if (handle_stat_request(evt_array.data.fd) < 0) { + posix_api->close_fn(evt_array.data.fd); +@@ -761,6 +764,7 @@ void control_client_thread(void *arg) + + epfd = init_epoll(sockfd); + if (epfd < 0) { ++ posix_api->close_fn(sockfd); + LSTACK_LOG(ERR, LSTACK, "control_thread fail\n"); + return; + } +diff --git a/src/ltran/ltran_monitor.c b/src/ltran/ltran_monitor.c +index dfda93f..7da65ea 100644 +--- a/src/ltran/ltran_monitor.c ++++ b/src/ltran/ltran_monitor.c +@@ -188,6 +188,7 @@ static int32_t gazelle_ctl_init(void) + + ret = gazelle_ep_event_init(&event_dfx, GAZELLE_DFX_SERVER_FD, listenfd); + if (ret != GAZELLE_OK) { ++ close(listenfd); + return GAZELLE_ERR; + } + +@@ -207,6 +208,7 @@ static int32_t gazelle_ctl_init(void) + + ret = gazelle_ep_event_init(&event_reg, GAZELLE_REG_SERVER_FD, listenfd); + if (ret != GAZELLE_OK) { ++ close(listenfd); + sockfd_data_free(event_dfx.data.ptr); + return GAZELLE_ERR; + } +@@ -283,7 +285,6 @@ static void dfx_server_msg_proc(uint32_t events, struct sockfd_data *data) + if (ret < 0) { + LTRAN_ERR("epoll_ctl ERROR, errno: %d. ret=%d.\n", errno, ret); + sockfd_data_free(event.data.ptr); +- close(conn_fd); + return; + } + } +@@ -421,6 +422,7 @@ static void reg_server_msg_proc(uint32_t events, struct sockfd_data *data) + event.events = EPOLLIN; + if (event.data.ptr == NULL) { + LTRAN_ERR("alloc sockfd_data ERROR\n"); ++ close(conn_fd); + return; + } + +-- +2.23.0 + diff --git a/0087-fix-del-conn-use-after-free.patch b/0087-fix-del-conn-use-after-free.patch new file mode 100644 index 0000000..8db8d5e --- /dev/null +++ b/0087-fix-del-conn-use-after-free.patch @@ -0,0 +1,29 @@ +From 898e627aad3f0a997fa87a54ee397e999953d695 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Tue, 6 Sep 2022 19:08:25 +0800 +Subject: [PATCH 02/21] fix del conn use after free + +--- + src/ltran/ltran_timer.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/ltran/ltran_timer.c b/src/ltran/ltran_timer.c +index 3ee266c..85ea324 100644 +--- a/src/ltran/ltran_timer.c ++++ b/src/ltran/ltran_timer.c +@@ -101,11 +101,11 @@ void gazelle_detect_conn_logout(struct gazelle_tcp_conn_htable *conn_htable) + hlist_del_init(&conn->conn_node); + conn_htable->cur_conn_num--; + conn_htable->array[i].chain_size--; +- rte_free(conn); + LTRAN_DEBUG("delete the tcp conn htable: tid %u quintuple[%u %u %u %u %u]\n", + conn->tid, conn->quintuple.protocol, + conn->quintuple.src_ip, (uint32_t)ntohs(conn->quintuple.src_port), + conn->quintuple.dst_ip, (uint32_t)ntohs(conn->quintuple.dst_port)); ++ rte_free(conn); + } + } + } +-- +2.23.0 + diff --git a/0088-init-g_gazelle_errno-before-use.patch b/0088-init-g_gazelle_errno-before-use.patch new file mode 100644 index 0000000..ac9ec64 --- /dev/null +++ b/0088-init-g_gazelle_errno-before-use.patch @@ -0,0 +1,24 @@ +From b7fe5b572c7d09a0e5ce58d0102ff76777edc8c4 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Tue, 6 Sep 2022 19:16:00 +0800 +Subject: [PATCH 03/21] init g_gazelle_errno before use + +--- + src/ltran/ltran_param.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/ltran/ltran_param.c b/src/ltran/ltran_param.c +index 75cc2f9..fc45895 100644 +--- a/src/ltran/ltran_param.c ++++ b/src/ltran/ltran_param.c +@@ -574,6 +574,7 @@ int32_t parse_config_file_args(const char *conf_file_path, struct ltran_config * + + int32_t param_num = sizeof(g_param_parse_tbl) / sizeof(g_param_parse_tbl[0]); + for (int32_t i = 0; i < param_num; i++) { ++ gazelle_set_errno(GAZELLE_SUCCESS); + ret = g_param_parse_tbl[i].func(&config, g_param_parse_tbl[i].param_name, ltran_config); + if (ret != GAZELLE_OK) { + config_destroy(&config); +-- +2.23.0 + diff --git a/0089-code-format-specification.patch b/0089-code-format-specification.patch new file mode 100644 index 0000000..37720d3 --- /dev/null +++ b/0089-code-format-specification.patch @@ -0,0 +1,39 @@ +From c414e36e78b616b9d8a34317b2fd473c3c3ab000 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Tue, 6 Sep 2022 20:18:34 +0800 +Subject: [PATCH 04/21] code format specification + +--- + src/lstack/core/lstack_dpdk.c | 2 +- + src/ltran/ltran_monitor.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 366655c..340ae94 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -446,7 +446,7 @@ int32_t dpdk_ethdev_init(void) + rss_setup(port_id, nb_queues); + } + +- return ERR_OK; ++ return 0; + } + + static int32_t dpdk_ethdev_setup(const struct eth_params *eth_params, const struct protocol_stack *stack) +diff --git a/src/ltran/ltran_monitor.c b/src/ltran/ltran_monitor.c +index 7da65ea..d535726 100644 +--- a/src/ltran/ltran_monitor.c ++++ b/src/ltran/ltran_monitor.c +@@ -414,7 +414,7 @@ static void reg_server_msg_proc(uint32_t events, struct sockfd_data *data) + timeout.tv_sec = 60; /* 60: timeout 60S */ + ret = setsockopt(conn_fd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)); + if (ret != 0) { +- LTRAN_ERR("setsockopt ERROR, errno: %d. ret=%d.\n", errno, ret); ++ LTRAN_WARN("setsockopt ERROR, errno: %d. ret=%d.\n", errno, ret); + } + + struct epoll_event event = {0}; +-- +2.23.0 + diff --git a/0090-fix-gazelle-kernel-event-thread-affinity-same-with-s.patch b/0090-fix-gazelle-kernel-event-thread-affinity-same-with-s.patch new file mode 100644 index 0000000..2d275ed --- /dev/null +++ b/0090-fix-gazelle-kernel-event-thread-affinity-same-with-s.patch @@ -0,0 +1,48 @@ +From 7a5a641ad5d56df224fd352b0a8366814bb819d2 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Tue, 13 Sep 2022 19:32:14 +0800 +Subject: [PATCH 06/21] fix gazelle kernel event thread affinity same with + stack thread + +--- + src/lstack/core/lstack_protocol_stack.c | 10 +++------- + 1 file changed, 3 insertions(+), 7 deletions(-) + +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 3009286..1dc6c3f 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -270,11 +270,7 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + + stack_group->stacks[queue_id] = stack; + +- cpu_set_t cpuset; +- CPU_ZERO(&cpuset); +- CPU_SET(stack->cpu_id, &cpuset); +- if (rte_thread_set_affinity(&cpuset) != 0) { +- LSTACK_LOG(ERR, LSTACK, "rte_thread_set_affinity failed\n"); ++ if (thread_affinity_init(stack->cpu_id) != 0) { + return -1; + } + RTE_PER_LCORE(_lcore_id) = stack->cpu_id; +@@ -301,6 +297,8 @@ static void* gazelle_kernel_event(void *arg) + uint16_t queue_id = *(uint16_t *)arg; + struct protocol_stack *stack = get_protocol_stack_group()->stacks[queue_id]; + ++ bind_to_stack_numa(stack); ++ + int32_t epoll_fd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); + if (epoll_fd < 0) { + LSTACK_LOG(ERR, LSTACK, "queue_id=%hu epoll_fd=%d errno=%d\n", queue_id, epoll_fd, errno); +@@ -383,8 +381,6 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + return NULL; + } + +- thread_affinity_init(stack->cpu_id); +- + hugepage_init(); + + tcpip_init(NULL, NULL); +-- +2.23.0 + diff --git a/0091-have_corelist_arg.patch b/0091-have_corelist_arg.patch new file mode 100644 index 0000000..c26c85f --- /dev/null +++ b/0091-have_corelist_arg.patch @@ -0,0 +1,77 @@ +From 37f9a2c0f60cb2757a0f1f64ad140ccbdc120501 Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Wed, 21 Sep 2022 07:57:35 +0000 +Subject: [PATCH 07/21] =?UTF-8?q?=E6=96=B0=E5=A2=9E=20have=5Fcorelist=5Far?= + =?UTF-8?q?g?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +--- + src/lstack/core/lstack_cfg.c | 42 ++++++++++++++++++++++++++++++------ + 1 file changed, 35 insertions(+), 7 deletions(-) + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 5cd326b..b6a517b 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -208,6 +208,33 @@ static int32_t get_param_idx(int32_t argc, char **argv, const char *param) + return -1; + } + ++static bool have_corelist_arg(int32_t argc, char **argv) ++{ ++ for (uint32_t i = 0; i < argc; i++) { ++ if (strncmp(argv[i], OPT_BIND_CORELIST, strlen(OPT_BIND_CORELIST)) == 0) { ++ return true; ++ } ++ ++ if (strncmp(argv[i], "--lcores", strlen("--lcores")) == 0) { ++ return true; ++ } ++ ++ if (strncmp(argv[i], "-c", strlen("-c")) == 0) { ++ return true; ++ } ++ ++ if (strncmp(argv[i], "-s", strlen("-s")) == 0) { ++ return true; ++ } ++ ++ if (strncmp(argv[i], "-S", strlen("-S")) == 0) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ + static int32_t parse_stack_cpu_number(void) + { + const config_setting_t *num_cpus = NULL; +@@ -222,14 +249,15 @@ static int32_t parse_stack_cpu_number(void) + if (args == NULL) { + return -EINVAL; + } ++ if (!have_corelist_arg(g_config_params.dpdk_argc, g_config_params.dpdk_argv)) { ++ int32_t idx = get_param_idx(g_config_params.dpdk_argc, g_config_params.dpdk_argv, OPT_BIND_CORELIST); ++ if (idx < 0) { ++ g_config_params.dpdk_argv[g_config_params.dpdk_argc] = strdup(OPT_BIND_CORELIST); ++ g_config_params.dpdk_argc++; + +- int32_t idx = get_param_idx(g_config_params.dpdk_argc, g_config_params.dpdk_argv, OPT_BIND_CORELIST); +- if (idx < 0) { +- g_config_params.dpdk_argv[g_config_params.dpdk_argc] = strdup(OPT_BIND_CORELIST); +- g_config_params.dpdk_argc++; +- +- g_config_params.dpdk_argv[g_config_params.dpdk_argc] = strdup(args); +- g_config_params.dpdk_argc++; ++ g_config_params.dpdk_argv[g_config_params.dpdk_argc] = strdup(args); ++ g_config_params.dpdk_argc++; ++ } + } + + char *tmp_arg = strdup(args); +-- +2.23.0 + diff --git a/0092-ltran-update-list.patch b/0092-ltran-update-list.patch new file mode 100644 index 0000000..8e8058e --- /dev/null +++ b/0092-ltran-update-list.patch @@ -0,0 +1,48 @@ +From d609d7a16c0ab0e0215e7c84909bc19b8216727f Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Wed, 21 Sep 2022 12:06:21 +0000 +Subject: [PATCH 08/21] ltran update list + +--- + src/ltran/ltran_forward.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/src/ltran/ltran_forward.c b/src/ltran/ltran_forward.c +index 1a92348..9ca04aa 100644 +--- a/src/ltran/ltran_forward.c ++++ b/src/ltran/ltran_forward.c +@@ -432,18 +432,25 @@ static __rte_always_inline void tcp_hash_table_add_conn(struct gazelle_stack *st + struct gazelle_quintuple *transfer_qtuple, uint32_t tid) + { + struct gazelle_tcp_conn *tcp_conn = NULL; ++ struct gazelle_tcp_conn_htable *conn_htable = gazelle_get_tcp_conn_htable(); + +- /* When lstack is the server, conn is created in tcp_handle func. lwip send the connect command after +- receiving syn, and delete conn timeout. */ +- tcp_conn = gazelle_conn_get_by_quintuple(gazelle_get_tcp_conn_htable(), transfer_qtuple); ++ tcp_conn = gazelle_conn_get_by_quintuple(conn_htable, transfer_qtuple); + if (tcp_conn) { +- tcp_conn->conn_timeout = -1; +- return; ++ /* When lstack is the server, conn is created in tcp_handle func. lwip send the connect command after ++ * receiving syn, and delete conn timeout. */ ++ if (tcp_conn->conn_timeout >= 0) { ++ tcp_conn->conn_timeout = -1; ++ return; ++ } else { ++ /* del old invaild conn */ ++ gazelle_conn_del_by_quintuple(conn_htable, transfer_qtuple); ++ printf("del old conn port=%d\n", ntohs(transfer_qtuple->dst_port)); ++ } + } + + /* When lstack is the client, lwip send the connect command while calling connect func. conn is created + without a timeout */ +- tcp_conn = gazelle_conn_add_by_quintuple(gazelle_get_tcp_conn_htable(), transfer_qtuple); ++ tcp_conn = gazelle_conn_add_by_quintuple(conn_htable, transfer_qtuple); + if (tcp_conn == NULL) { + LTRAN_ERR("add tcp conn htable failed\n"); + return; +-- +2.23.0 + diff --git a/0093-remove-get_reg_ring_free_count.patch b/0093-remove-get_reg_ring_free_count.patch new file mode 100644 index 0000000..7ab8c38 --- /dev/null +++ b/0093-remove-get_reg_ring_free_count.patch @@ -0,0 +1,37 @@ +From b2cb29c59dcfbeb2652d57e5f6484c5e1239dd94 Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Wed, 21 Sep 2022 11:56:18 +0000 +Subject: [PATCH 09/21] remove get_reg_ring_free_count + +--- + src/lstack/netif/lstack_vdev.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/src/lstack/netif/lstack_vdev.c b/src/lstack/netif/lstack_vdev.c +index 31c1843..f9fa5a3 100644 +--- a/src/lstack/netif/lstack_vdev.c ++++ b/src/lstack/netif/lstack_vdev.c +@@ -99,11 +99,6 @@ static uint32_t vdev_tx_xmit(struct protocol_stack *stack, struct rte_mbuf **pkt + return sent_pkts; + } + +-static inline uint32_t get_reg_ring_free_count(const struct rte_ring *reg_ring) +-{ +- return (reg_ring->capacity + reg_ring->cons.tail - reg_ring->cons.head); +-} +- + int32_t vdev_reg_xmit(enum reg_ring_type type, struct gazelle_quintuple *qtuple) + { + if (!use_ltran()) { +@@ -132,7 +127,7 @@ int32_t vdev_reg_xmit(enum reg_ring_type type, struct gazelle_quintuple *qtuple) + do { + (void)gazelle_ring_sc_dequeue(stack->reg_ring, free_buf, VDEV_REG_QUEUE_SZ); + +- if (get_reg_ring_free_count(stack->reg_ring) == 0) { ++ if (gazelle_ring_free_count(stack->reg_ring) == 0) { + continue; + } + +-- +2.23.0 + diff --git a/0094-add-errorno-EISCONN.patch b/0094-add-errorno-EISCONN.patch new file mode 100644 index 0000000..c8a3c88 --- /dev/null +++ b/0094-add-errorno-EISCONN.patch @@ -0,0 +1,25 @@ +From 39cbe5776adb427d0e7d9a9bec58302a08202145 Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Wed, 21 Sep 2022 14:31:15 +0000 +Subject: [PATCH 10/21] add errorno EISCONN + +--- + src/lstack/api/lstack_wrap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index e402ce1..296906e 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -185,7 +185,7 @@ static int32_t do_connect(int32_t s, const struct sockaddr *name, socklen_t name + } + + int32_t ret = rpc_call_connect(s, name, namelen); +- if (ret == 0) { ++ if (ret == 0 || errno == EISCONN) { + return ret; + } + +-- +2.23.0 + diff --git a/0095-fix-sendmsg-data-write-wrong.patch b/0095-fix-sendmsg-data-write-wrong.patch new file mode 100644 index 0000000..2f6ad44 --- /dev/null +++ b/0095-fix-sendmsg-data-write-wrong.patch @@ -0,0 +1,29 @@ +From 241c72599d7da72ada7cc15da90b7309e74241ea Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 24 Sep 2022 21:48:12 +0800 +Subject: [PATCH 11/21] fix sendmsg data write wrong + +--- + src/lstack/core/lstack_lwip.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 5174e4c..d1e09ce 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -526,7 +526,12 @@ ssize_t sendmsg_to_stack(int32_t s, const struct msghdr *message, int32_t flags) + if (ret < 0) { + return buflen == 0 ? ret : buflen; + } ++ + buflen += ret; ++ ++ if (ret < message->msg_iov[i].iov_len) { ++ return buflen; ++ } + } + + return buflen; +-- +2.23.0 + diff --git a/0096-lstack-restore-pci-bus-after-init.patch b/0096-lstack-restore-pci-bus-after-init.patch new file mode 100644 index 0000000..cb3365b --- /dev/null +++ b/0096-lstack-restore-pci-bus-after-init.patch @@ -0,0 +1,69 @@ +From 4d12b47a714217615a04f2a084b9c4857167e258 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 6 Oct 2022 15:14:21 +0800 +Subject: [PATCH 12/21] lstack restore pci bus after init + +--- + src/lstack/core/lstack_dpdk.c | 14 +++++++++++--- + src/lstack/core/lstack_init.c | 1 + + src/lstack/include/lstack_dpdk.h | 1 + + 3 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 340ae94..10207d1 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -53,6 +53,7 @@ struct eth_params { + struct rte_eth_txconf tx_conf; + }; + struct rte_kni; ++static rte_bus *g_pci_bus = NULL; + + int32_t thread_affinity_default(void) + { +@@ -536,9 +537,16 @@ int32_t dpdk_init_lstack_kni(void) + void dpdk_skip_nic_init(void) + { + /* when lstack init nic again, ltran can't read pkts from nic. unregister pci_bus to avoid init nic in lstack */ +- struct rte_bus *pci_bus = rte_bus_find_by_name("pci"); +- if (pci_bus != NULL) { +- rte_bus_unregister(pci_bus); ++ g_pci_bus = rte_bus_find_by_name("pci"); ++ if (g_pci_bus != NULL) { ++ rte_bus_unregister(g_pci_bus); ++ } ++} ++ ++void dpdk_restore_pci(void) ++{ ++ if (g_pci_bus != NULL) { ++ rte_bus_register(g_pci_bus); + } + } + +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index 9fafda9..b1c69e6 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -251,6 +251,7 @@ __attribute__((constructor)) void gazelle_network_init(void) + /* + * Init control plane and dpdk init */ + create_control_thread(); ++ dpdk_restore_pci(); + + /* + * cancel the core binding from DPDK initialization */ +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index f0bf4a1..e224f23 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -50,5 +50,6 @@ int dpdk_ethdev_init(void); + int dpdk_ethdev_start(void); + void dpdk_skip_nic_init(void); + int32_t dpdk_init_lstack_kni(void); ++void dpdk_restore_pci(void); + + #endif /* GAZELLE_DPDK_H */ +-- +2.23.0 + diff --git a/0097-fix-malloc-rpc-msg-fail.patch b/0097-fix-malloc-rpc-msg-fail.patch new file mode 100644 index 0000000..5c4493e --- /dev/null +++ b/0097-fix-malloc-rpc-msg-fail.patch @@ -0,0 +1,133 @@ +From d168ee1528444bbdf3e1fd8f6a566295531177a8 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 6 Oct 2022 15:35:02 +0800 +Subject: [PATCH 13/21] fix malloc rpc msg fail + +--- + src/lstack/core/lstack_dpdk.c | 2 +- + src/lstack/core/lstack_lwip.c | 17 ++++++++++++++++- + src/lstack/core/lstack_thread_rpc.c | 6 ++++-- + src/lstack/include/lstack_thread_rpc.h | 2 +- + 4 files changed, 22 insertions(+), 5 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 10207d1..8d45838 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -53,7 +53,7 @@ struct eth_params { + struct rte_eth_txconf tx_conf; + }; + struct rte_kni; +-static rte_bus *g_pci_bus = NULL; ++static struct rte_bus *g_pci_bus = NULL; + + int32_t thread_affinity_default(void) + { +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index d1e09ce..6f08a1c 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -77,6 +77,7 @@ static void reset_sock_data(struct lwip_sock *sock) + sock->listen_next = NULL; + sock->epoll_events = 0; + sock->events = 0; ++ sock->in_send = 0; + + if (sock->recv_lastdata) { + pbuf_free(sock->recv_lastdata); +@@ -328,6 +329,9 @@ void stack_send(struct rpc_msg *msg) + return; + } + ++ __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); ++ rte_mb(); ++ + if (!NETCONN_IS_DATAOUT(sock)) { + return; + } +@@ -338,6 +342,7 @@ void stack_send(struct rpc_msg *msg) + if (NETCONN_IS_DATAOUT(sock)) { + if (list_is_null(&sock->send_list)) { + list_add_node(&stack->send_list, &sock->send_list); ++ __atomic_store_n(&sock->in_send, 1, __ATOMIC_RELEASE); + } + stack->stats.send_self_rpc++; + } +@@ -352,6 +357,9 @@ void send_stack_list(struct protocol_stack *stack, uint32_t send_max) + list_for_each_safe(node, temp, &stack->send_list) { + sock = container_of(node, struct lwip_sock, send_list); + ++ __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); ++ rte_mb(); ++ + if (sock->conn == NULL || !NETCONN_IS_DATAOUT(sock)) { + list_del_node_null(&sock->send_list); + continue; +@@ -361,6 +369,8 @@ void send_stack_list(struct protocol_stack *stack, uint32_t send_max) + + if (!NETCONN_IS_DATAOUT(sock)) { + list_del_node_null(&sock->send_list); ++ } else { ++ __atomic_store_n(&sock->in_send, 1, __ATOMIC_RELEASE); + } + + if (++read_num >= send_max) { +@@ -507,7 +517,12 @@ ssize_t gazelle_send(int32_t fd, const void *buf, size_t len, int32_t flags) + return 0; + } + +- rpc_call_send(fd, NULL, send, flags); ++ if (__atomic_load_n(&sock->in_send, __ATOMIC_ACQUIRE) == 0) { ++ __atomic_store_n(&sock->in_send, 1, __ATOMIC_RELEASE); ++ if (rpc_call_send(fd, NULL, send, flags) != 0) { ++ __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); ++ } ++ } + return send; + } + +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index c9fc4e9..d1f7580 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -427,13 +427,13 @@ int32_t rpc_call_ioctl(int fd, long cmd, void *argp) + return rpc_sync_call(&stack->rpc_queue, msg); + } + +-void rpc_call_send(int fd, const void *buf, size_t len, int flags) ++int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags) + { + struct protocol_stack *stack = get_protocol_stack_by_fd(fd); + + struct rpc_msg *msg = rpc_msg_alloc(stack, stack_send); + if (msg == NULL) { +- return; ++ return -1; + } + + msg->args[MSG_ARG_0].i = fd; +@@ -442,6 +442,8 @@ void rpc_call_send(int fd, const void *buf, size_t len, int flags) + msg->self_release = 0; + + rpc_call(&stack->rpc_queue, msg); ++ ++ return 0; + } + + int32_t rpc_call_sendmsg(int fd, const struct msghdr *msghdr, int flags) +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 3732167..e1223de 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -66,7 +66,7 @@ int32_t rpc_call_bind(int32_t fd, const struct sockaddr *addr, socklen_t addrlen + int32_t rpc_call_listen(int s, int backlog); + int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen); + int32_t rpc_call_connect(int fd, const struct sockaddr *addr, socklen_t addrlen); +-void rpc_call_send(int fd, const void *buf, size_t len, int flags); ++int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags); + int32_t rpc_call_sendmsg(int fd, const struct msghdr *msg, int flags); + int32_t rpc_call_recvmsg(int fd, struct msghdr *msg, int flags); + int32_t rpc_call_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen); +-- +2.23.0 + diff --git a/0098-support-dpdk-dynamic-memory.patch b/0098-support-dpdk-dynamic-memory.patch new file mode 100644 index 0000000..3d5c8ff --- /dev/null +++ b/0098-support-dpdk-dynamic-memory.patch @@ -0,0 +1,64 @@ +From cc9db5e298dab1c1bac927464538ba51da07d9c6 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 10:55:05 +0800 +Subject: [PATCH 14/21] support dpdk dynamic memory + +--- + src/lstack/core/lstack_cfg.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index b6a517b..19a5b30 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -192,7 +192,6 @@ static int32_t parse_devices(void) + + static int32_t get_param_idx(int32_t argc, char **argv, const char *param) + { +- int32_t ret; + int32_t idx; + + if ((argc <= 0) || (argv == NULL) || (param == NULL)) { +@@ -200,8 +199,7 @@ static int32_t get_param_idx(int32_t argc, char **argv, const char *param) + } + + for (idx = 0; idx < argc; ++idx) { +- ret = strncmp(argv[idx], param, strlen(param)); +- if (ret == 0) { ++ if (strncmp(argv[idx], param, strlen(param)) == 0) { + return idx; + } + } +@@ -249,6 +247,7 @@ static int32_t parse_stack_cpu_number(void) + if (args == NULL) { + return -EINVAL; + } ++ + if (!have_corelist_arg(g_config_params.dpdk_argc, g_config_params.dpdk_argv)) { + int32_t idx = get_param_idx(g_config_params.dpdk_argc, g_config_params.dpdk_argv, OPT_BIND_CORELIST); + if (idx < 0) { +@@ -465,12 +464,15 @@ static int32_t turn_args_to_config(int32_t argc, char **argv) + // OPT_SOCKET_MEM + idx = get_param_idx(argc, argv, OPT_SOCKET_MEM); + if ((idx < 0) || (idx + 1 >= argc)) { +- LSTACK_LOG(ERR, LSTACK, "Cannot find param %s\n", OPT_SOCKET_MEM); +- return idx; +- } +- ret = gazelle_parse_socket_mem(argv[idx + 1], &g_config_params.sec_attach_arg); +- if (ret < 0) { +- return ret; ++ if (use_ltran()) { ++ LSTACK_LOG(ERR, LSTACK, "Cannot find param %s\n", OPT_SOCKET_MEM); ++ return idx; ++ } ++ } else { ++ ret = gazelle_parse_socket_mem(argv[idx + 1], &g_config_params.sec_attach_arg); ++ if (ret < 0) { ++ return ret; ++ } + } + + // OPT_BASE_VIRTADDR +-- +2.23.0 + diff --git a/0099-fix-lwip_send-fail-free-pbuf-miss-data.patch b/0099-fix-lwip_send-fail-free-pbuf-miss-data.patch new file mode 100644 index 0000000..0bfccd2 --- /dev/null +++ b/0099-fix-lwip_send-fail-free-pbuf-miss-data.patch @@ -0,0 +1,87 @@ +From b3ccf93dec7581adfcfaf99c6ea8b8478da263b1 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 11:10:24 +0800 +Subject: [PATCH 15/21] fix lwip_send fail free pbuf miss data + +--- + src/common/dpdk_common.h | 25 +++++++++++++++++++++++++ + src/lstack/core/lstack_lwip.c | 7 ++++++- + src/lstack/include/lstack_lwip.h | 1 + + 3 files changed, 32 insertions(+), 1 deletion(-) + +diff --git a/src/common/dpdk_common.h b/src/common/dpdk_common.h +index 1c3e7e8..c2cbda7 100644 +--- a/src/common/dpdk_common.h ++++ b/src/common/dpdk_common.h +@@ -177,6 +177,31 @@ static __rte_always_inline uint32_t gazelle_ring_sc_dequeue(struct rte_ring *r, + return n; + } + ++/* get ring obj dont dequeue */ ++static __rte_always_inline uint32_t gazelle_ring_sc_peek(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t prod = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); ++ uint32_t cons = r->cons.tail; ++ ++ uint32_t entries = prod - cons; ++ if (n > entries) { ++ n = entries; ++ } ++ if (unlikely(n == 0)) { ++ return 0; ++ } ++ ++ ++ __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); ++ ++ return n; ++} ++ ++static __rte_always_inline void gazelle_ring_dequeue_over(struct rte_ring *r, uint32_t n) ++{ ++ r->cons.tail += n; ++} ++ + static __rte_always_inline uint32_t gazelle_ring_read(struct rte_ring *r, void **obj_table, uint32_t n) + { + uint32_t cons = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE); +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 6f08a1c..3f21a3a 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -229,7 +229,7 @@ struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8 + { + struct pbuf *pbuf = NULL; + +- if (gazelle_ring_sc_dequeue(sock->send_ring, (void **)&pbuf, 1) != 1) { ++ if (gazelle_ring_sc_peek(sock->send_ring, (void **)&pbuf, 1) != 1) { + *apiflags &= ~TCP_WRITE_FLAG_MORE; + return NULL; + } +@@ -238,6 +238,11 @@ struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8 + return pbuf; + } + ++void write_lwip_over(struct lwip_sock *sock, uint32_t n) ++{ ++ gazelle_ring_dequeue_over(sock->send_ring, n); ++} ++ + static inline void del_data_out_event(struct lwip_sock *sock) + { + pthread_spin_lock(&sock->wakeup->event_list_lock); +diff --git a/src/lstack/include/lstack_lwip.h b/src/lstack/include/lstack_lwip.h +index c62e99d..968eff2 100644 +--- a/src/lstack/include/lstack_lwip.h ++++ b/src/lstack/include/lstack_lwip.h +@@ -27,6 +27,7 @@ void gazelle_init_sock(int32_t fd); + int32_t gazelle_socket(int domain, int type, int protocol); + void gazelle_clean_sock(int32_t fd); + struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8_t *apiflags); ++void write_lwip_over(struct lwip_sock *sock, uint32_t n); + ssize_t write_stack_data(struct lwip_sock *sock, const void *buf, size_t len); + ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags); + ssize_t read_lwip_data(struct lwip_sock *sock, int32_t flags, uint8_t apiflags); +-- +2.23.0 + diff --git a/0100-merger-wakeup.patch b/0100-merger-wakeup.patch new file mode 100644 index 0000000..6aa2848 --- /dev/null +++ b/0100-merger-wakeup.patch @@ -0,0 +1,984 @@ +From 045c0ea6fa5a2251a4a205bc9a732e694ddbb5a7 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 16:51:11 +0800 +Subject: [PATCH 16/21] merger wakeup + +--- + src/lstack/api/lstack_epoll.c | 366 +++++++++------------ + src/lstack/core/lstack_lwip.c | 29 +- + src/lstack/core/lstack_protocol_stack.c | 92 ++---- + src/lstack/core/lstack_stack_stat.c | 34 +- + src/lstack/include/lstack_protocol_stack.h | 4 +- + src/lstack/include/lstack_stack_stat.h | 5 +- + src/lstack/include/posix/lstack_epoll.h | 19 +- + 7 files changed, 232 insertions(+), 317 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 9c44f87..cc1cbf0 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -40,33 +40,35 @@ + #define SEC_TO_NSEC 1000000000 + #define SEC_TO_MSEC 1000 + #define MSEC_TO_NSEC 1000000 +-#define POLL_KERNEL_EVENTS 32 ++#define POLL_KERNEL_EVENTS 128 + +-void add_epoll_event(struct netconn *conn, uint32_t event) ++void add_sock_event(struct lwip_sock *sock, uint32_t event) + { +- /* conn sock nerver null, because lwip call this func */ +- struct lwip_sock *sock = get_socket_by_fd(conn->socket); +- if (sock->wakeup == NULL || (event & sock->epoll_events) == 0) { ++ struct wakeup_poll *wakeup = sock->wakeup; ++ if (wakeup == NULL || (event & sock->epoll_events) == 0) { + return; + } +- struct wakeup_poll *wakeup = sock->wakeup; +- struct protocol_stack *stack = sock->stack; + +- if (wakeup->type == WAKEUP_EPOLL) { +- pthread_spin_lock(&wakeup->event_list_lock); +- sock->events |= (event == EPOLLERR) ? (EPOLLIN | EPOLLERR) : (event & sock->epoll_events); +- if (list_is_null(&sock->event_list)) { +- list_add_node(&wakeup->event_list, &sock->event_list); +- } +- pthread_spin_unlock(&wakeup->event_list_lock); ++ wakeup->have_event = true; ++ ++ if (wakeup->type == WAKEUP_POLL) { ++ return; + } + +- stack->stats.wakeup_events++; +- sem_t *sem = &wakeup->event_sem; +- if (get_protocol_stack_group()->wakeup_enable) { +- gazelle_light_ring_enqueue_busrt(stack->wakeup_ring, (void **)&sem, 1); +- } else { +- sem_post(sem); ++ pthread_spin_lock(&wakeup->event_list_lock); ++ sock->events |= (event == EPOLLERR) ? (EPOLLIN | EPOLLERR) : (event & sock->epoll_events); ++ if (list_is_null(&sock->event_list)) { ++ list_add_node(&wakeup->event_list, &sock->event_list); ++ } ++ pthread_spin_unlock(&wakeup->event_list_lock); ++} ++ ++void wakeup_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup) ++{ ++ if (__atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { ++ uint64_t tmp = 1; ++ posix_api->write_fn(wakeup->eventfd, &tmp, sizeof(tmp)); ++ stack->stats.wakeup_events++; + } + } + +@@ -98,11 +100,7 @@ static void raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *s + { + sock->events = update_events(sock); + if (sock->events) { +- pthread_spin_lock(&wakeup->event_list_lock); +- if (list_is_null(&sock->event_list)) { +- list_add_node(&wakeup->event_list, &sock->event_list); +- } +- pthread_spin_unlock(&wakeup->event_list_lock); ++ add_sock_event(sock, sock->events); + } + } + +@@ -120,28 +118,38 @@ int32_t lstack_epoll_create(int32_t size) + GAZELLE_RETURN(EINVAL); + } + +- struct wakeup_poll *wakeup = malloc(sizeof(struct wakeup_poll)); ++ struct wakeup_poll *wakeup = calloc(1, sizeof(struct wakeup_poll)); + if (wakeup == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc null\n"); + posix_api->close_fn(fd); + GAZELLE_RETURN(EINVAL); + } +- if (memset_s(wakeup, sizeof(struct wakeup_poll), 0, sizeof(struct wakeup_poll)) != 0) { +- LSTACK_LOG(ERR, LSTACK, "memset_s failed\n"); ++ ++ wakeup->eventfd = eventfd(0, EFD_NONBLOCK); ++ if (wakeup->eventfd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "eventfd fail=%d errno=%d\n", wakeup->eventfd, errno); ++ posix_api->close_fn(fd); + free(wakeup); ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ struct epoll_event event; ++ event.data.fd = wakeup->eventfd; ++ event.events = EPOLLIN | EPOLLET; ++ if (posix_api->epoll_ctl_fn(fd, EPOLL_CTL_ADD, wakeup->eventfd, &event) < 0) { ++ LSTACK_LOG(ERR, LSTACK, "eventfd errno=%d\n", errno); + posix_api->close_fn(fd); ++ free(wakeup); + GAZELLE_RETURN(EINVAL); + } + + init_list_node(&wakeup->event_list); +- sem_init(&wakeup->event_sem, 0, 0); + pthread_spin_init(&wakeup->event_list_lock, PTHREAD_PROCESS_PRIVATE); + + wakeup->type = WAKEUP_EPOLL; + wakeup->epollfd = fd; + sock->wakeup = wakeup; + +- register_wakeup(wakeup); +- + return fd; + } + +@@ -156,8 +164,10 @@ int32_t lstack_epoll_close(int32_t fd) + } + + if (sock->wakeup) { +- unregister_wakeup(sock->wakeup); +- sem_destroy(&sock->wakeup->event_sem); ++ if (sock->stack) { ++ unregister_wakeup(sock->stack, sock->wakeup); ++ } ++ posix_api->close_fn(sock->wakeup->eventfd); + pthread_spin_destroy(&sock->wakeup->event_list_lock); + free(sock->wakeup); + } +@@ -217,19 +227,22 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + ++ ++ struct wakeup_poll *wakeup = epoll_sock->wakeup; + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { ++ wakeup->have_kernel_fd = true; + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + + if (CONN_TYPE_HAS_HOST(sock->conn)) { ++ wakeup->have_kernel_fd = true; + int32_t ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); + if (ret < 0) { + LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op); + } + } + +- struct wakeup_poll *wakeup = epoll_sock->wakeup; + do { + switch (op) { + case EPOLL_CTL_ADD: +@@ -258,34 +271,10 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + return 0; + } + +-static void del_node_array(struct epoll_event *events, int32_t event_num, int32_t del_index) +-{ +- for (int32_t i = del_index; i + 1 < event_num; i++) { +- events[i] = events[i + 1]; +- } +-} +- +-static int32_t del_duplicate_event(struct epoll_event *events, int32_t event_num) +-{ +- int32_t num = event_num; +- +- for (int32_t i = 0; i < num; i++) { +- for (int32_t j = i + 1; j < num; j++) { +- if (events[i].data.u64 == events[j].data.u64) { +- del_node_array(events, num, j); +- num--; +- } +- } +- } +- +- return num; +-} +- + static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event *events, uint32_t maxevents) + { + int32_t event_num = 0; + struct list_node *node, *temp; +- int32_t accept_num = 0; + + pthread_spin_lock(&wakeup->event_list_lock); + +@@ -297,10 +286,6 @@ static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event * + continue; + } + +- if (sock->conn && sock->conn->acceptmbox) { +- accept_num++; +- } +- + if (sock->epoll_events & EPOLLET) { + list_del_node_null(&sock->event_list); + } +@@ -323,10 +308,6 @@ static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event * + + pthread_spin_unlock(&wakeup->event_list_lock); + +- if (accept_num > 1) { +- event_num = del_duplicate_event(events, event_num); +- } +- + wakeup->stat.app_events += event_num; + return event_num; + } +@@ -354,33 +335,6 @@ static int32_t poll_lwip_event(struct pollfd *fds, nfds_t nfds) + return event_num; + } + +-static void ms_to_timespec(struct timespec *timespec, int32_t timeout) +-{ +- clock_gettime(CLOCK_REALTIME, timespec); +- timespec->tv_sec += timeout / SEC_TO_MSEC; +- timespec->tv_nsec += (timeout % SEC_TO_MSEC) * MSEC_TO_NSEC; +- timespec->tv_sec += timespec->tv_nsec / SEC_TO_NSEC; +- timespec->tv_nsec = timespec->tv_nsec % SEC_TO_NSEC; +-} +- +-static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, +- struct protocol_stack *new_stack) +-{ +- if (old_stack) { +- if (posix_api->epoll_ctl_fn(old_stack->epollfd, EPOLL_CTL_DEL, wakeup->epollfd, NULL) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } +- } +- +- /* avoid kernel thread post too much, use EPOLLET */ +- struct epoll_event event; +- event.data.ptr = wakeup; +- event.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET; +- if (posix_api->epoll_ctl_fn(new_stack->epollfd, EPOLL_CTL_ADD, wakeup->epollfd, &event) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } +-} +- + static void epoll_bind_statck(struct wakeup_poll *wakeup) + { + /* all fd is kernel, set rand stack */ +@@ -390,11 +344,27 @@ static void epoll_bind_statck(struct wakeup_poll *wakeup) + + if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { + bind_to_stack_numa(wakeup->max_stack); +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); ++ if (wakeup->bind_stack) { ++ unregister_wakeup(wakeup->bind_stack, wakeup); ++ } + wakeup->bind_stack = wakeup->max_stack; ++ register_wakeup(wakeup->bind_stack, wakeup); + } + } + ++static bool del_event_fd(struct epoll_event* events, int32_t eventnum, int32_t eventfd) ++{ ++ for (int32_t i = 0; i < eventnum; i++) { ++ if (events[i].data.fd == eventfd) { ++ events[i].data.u64 = events[eventnum - 1].data.u64; ++ events[i].events = events[eventnum - 1].events; ++ return true; ++ } ++ } ++ ++ return false; ++} ++ + int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) + { + struct lwip_sock *sock = get_socket_by_fd(epfd); +@@ -402,61 +372,66 @@ int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxe + return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +- int32_t event_num = 0; +- int32_t ret; +- +- struct timespec epoll_time; +- if (timeout >= 0) { +- ms_to_timespec(&epoll_time, timeout); +- } ++ struct wakeup_poll *wakeup = sock->wakeup; ++ int32_t kernel_num = 0; + + epoll_bind_statck(sock->wakeup); + +- do { +- event_num += epoll_lwip_event(sock->wakeup, &events[event_num], maxevents - event_num); +- sock->wakeup->stat.app_events += event_num; ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); ++ rte_mb(); + +- if (__atomic_load_n(&sock->wakeup->have_kernel_event, __ATOMIC_RELAXED)) { +- event_num += posix_api->epoll_wait_fn(epfd, &events[event_num], maxevents - event_num, 0); +- } ++ int32_t lwip_num = epoll_lwip_event(wakeup, events, maxevents); ++ wakeup->stat.app_events += lwip_num; ++ if (!wakeup->have_kernel_fd && lwip_num > 0) { ++ return lwip_num; ++ } + +- if (event_num > 0) { +- while (sem_trywait(&sock->wakeup->event_sem) == 0); +- break; +- } ++ if (lwip_num > 0) { ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ rte_mb(); ++ kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, 0); ++ } else { ++ kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, timeout); ++ rte_mb(); ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ } ++ ++ if (kernel_num <= 0) { ++ return (lwip_num > 0) ? lwip_num : kernel_num; ++ } + +- sock->wakeup->have_kernel_event = false; +- if (timeout < 0) { +- ret = sem_wait(&sock->wakeup->event_sem); +- } else { +- ret = sem_timedwait(&sock->wakeup->event_sem, &epoll_time); ++ if (del_event_fd(&events[lwip_num], kernel_num, wakeup->eventfd)) { ++ if (lwip_num == 0) { ++ lwip_num = epoll_lwip_event(wakeup, events, maxevents); + } +- } while (ret == 0); ++ kernel_num--; ++ } + +- return event_num; ++ return lwip_num + kernel_num; + } + +-static void init_poll_wakeup_data(struct wakeup_poll *wakeup) ++static int32_t init_poll_wakeup_data(struct wakeup_poll *wakeup) + { +- sem_init(&wakeup->event_sem, 0, 0); + wakeup->type = WAKEUP_POLL; + +- wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); +- if (wakeup->last_fds == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ wakeup->eventfd = eventfd(0, EFD_NONBLOCK); ++ if (wakeup->eventfd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "eventfd failed errno=%d\n", errno); ++ GAZELLE_RETURN(EINVAL); + } + +- wakeup->events = calloc(POLL_KERNEL_EVENTS, sizeof(struct epoll_event)); +- if (wakeup->events == NULL) { ++ wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); ++ if (wakeup->last_fds == NULL) { + LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ posix_api->close_fn(wakeup->eventfd); ++ GAZELLE_RETURN(EINVAL); + } + ++ wakeup->last_fds[0].fd = wakeup->eventfd; ++ wakeup->last_fds[0].events = POLLIN; + wakeup->last_max_nfds = POLL_KERNEL_EVENTS; + +- wakeup->epollfd = posix_api->epoll_create_fn(POLL_KERNEL_EVENTS); +- if (wakeup->epollfd < 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_create_fn errno=%d\n", errno); +- } ++ return 0; + } + + static void resize_kernel_poll(struct wakeup_poll *wakeup, nfds_t nfds) +@@ -464,23 +439,14 @@ static void resize_kernel_poll(struct wakeup_poll *wakeup, nfds_t nfds) + if (wakeup->last_fds) { + free(wakeup->last_fds); + } +- wakeup->last_fds = calloc(nfds, sizeof(struct pollfd)); ++ wakeup->last_fds = calloc(nfds + 1, sizeof(struct pollfd)); + if (wakeup->last_fds == NULL) { + LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); + } + +- if (wakeup->events) { +- free(wakeup->events); +- } +- wakeup->events = calloc(nfds, sizeof(struct epoll_event)); +- if (wakeup->events == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); +- } +- ++ wakeup->last_fds[0].fd = wakeup->eventfd; ++ wakeup->last_fds[0].events = POLLIN; + wakeup->last_max_nfds = nfds; +- if (memset_s(wakeup->last_fds, nfds * sizeof(struct pollfd), 0, nfds * sizeof(struct pollfd)) != 0) { +- LSTACK_LOG(ERR, LSTACK, "memset_s faile\n"); +- } + } + + static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) +@@ -492,35 +458,16 @@ static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) + return; + } + +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); ++ if (wakeup->bind_stack) { ++ unregister_wakeup(wakeup->bind_stack, wakeup); ++ } + bind_to_stack_numa(stack_group->stacks[bind_id]); + wakeup->bind_stack = stack_group->stacks[bind_id]; +-} +- +-static void update_kernel_poll(struct wakeup_poll *wakeup, uint32_t index, struct pollfd *new_fd) +-{ +- posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_DEL, wakeup->last_fds[index].fd, NULL); +- +- if (new_fd == NULL) { +- return; +- } +- +- struct epoll_event event; +- event.data.u32 = index; +- event.events = new_fd->events; +- if (posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_ADD, new_fd->fd, &event) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } ++ register_wakeup(wakeup->bind_stack, wakeup); + } + + static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfds) + { +- if (!wakeup->init) { +- wakeup->init = true; +- init_poll_wakeup_data(wakeup); +- register_wakeup(wakeup); +- } +- + int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; + int32_t poll_change = 0; + +@@ -529,31 +476,22 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + resize_kernel_poll(wakeup, nfds); + poll_change = 1; + } +- /* poll fds num less, del old fd */ +- for (uint32_t i = nfds; i < wakeup->last_nfds; i++) { +- update_kernel_poll(wakeup, i, NULL); +- poll_change = 1; +- } + + for (uint32_t i = 0; i < nfds; i++) { + int32_t fd = fds[i].fd; + fds[i].revents = 0; + struct lwip_sock *sock = get_socket_by_fd(fd); + +- if (fd == wakeup->last_fds[i].fd && fds[i].events == wakeup->last_fds[i].events) { ++ if (fd == wakeup->last_fds[i + 1].fd && fds[i].events == wakeup->last_fds[i + 1].events) { + /* fd close then socket may get same fd. */ + if (sock == NULL || sock->wakeup != NULL) { + continue; + } + } +- wakeup->last_fds[i].fd = fd; +- wakeup->last_fds[i].events = fds[i].events; ++ wakeup->last_fds[i + 1].fd = fd; ++ wakeup->last_fds[i + 1].events = fds[i].events; + poll_change = 1; + +- if (sock == NULL || sock->conn == NULL || CONN_TYPE_HAS_HOST(sock->conn)) { +- update_kernel_poll(wakeup, i, fds + i); +- } +- + while (sock && sock->conn) { + if (sock->epoll_events != (fds[i].events | POLLERR)) { + sock->epoll_events = fds[i].events | POLLERR; +@@ -571,51 +509,65 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + if (poll_change == 0) { + return; + } +- wakeup->last_nfds = nfds; ++ wakeup->last_nfds = nfds + 1; + + poll_bind_statck(wakeup, stack_count); + } + + int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + { +- static PER_THREAD struct wakeup_poll wakeup_poll = {0}; ++ static PER_THREAD struct wakeup_poll *wakeup = NULL; ++ if (wakeup == NULL) { ++ wakeup = calloc(1, sizeof(struct wakeup_poll)); ++ if (wakeup == NULL) { ++ GAZELLE_RETURN(EINVAL); ++ } + +- poll_init(&wakeup_poll, fds, nfds); ++ if (init_poll_wakeup_data(wakeup) < 0) { ++ free(wakeup); ++ GAZELLE_RETURN(EINVAL); ++ } ++ } + +- int32_t event_num = 0; +- int32_t ret; ++ poll_init(wakeup, fds, nfds); ++ ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); ++ rte_mb(); + +- struct timespec poll_time; +- if (timeout >= 0) { +- ms_to_timespec(&poll_time, timeout); ++ int32_t lwip_num = poll_lwip_event(fds, nfds); ++ wakeup->stat.app_events += lwip_num; ++ if (lwip_num >= nfds) { ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ return lwip_num; + } + +- /* when epfd > 0 is epoll type */ +- do { +- event_num += poll_lwip_event(fds, nfds); +- +- /* reduce syscall epoll_wait */ +- if (__atomic_load_n(&wakeup_poll.have_kernel_event, __ATOMIC_RELAXED)) { +- int32_t kernel_num = posix_api->epoll_wait_fn(wakeup_poll.epollfd, wakeup_poll.events, nfds, 0); +- for (int32_t i = 0; i < kernel_num; i++) { +- uint32_t index = wakeup_poll.events[i].data.u32; +- fds[index].revents = wakeup_poll.events[i].events; +- } +- event_num += kernel_num >= 0 ? kernel_num : 0; +- } ++ int32_t kernel_num = 0; ++ if (lwip_num > 0) { ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ rte_mb(); ++ kernel_num = posix_api->poll_fn(wakeup->last_fds, wakeup->last_nfds, 0); ++ } else { ++ kernel_num = posix_api->poll_fn(wakeup->last_fds, wakeup->last_nfds, timeout); ++ rte_mb(); ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ } + +- if (event_num > 0) { +- while (sem_trywait(&wakeup_poll.event_sem) == 0); +- break; ++ if (kernel_num <= 0) { ++ return (lwip_num > 0) ? lwip_num : kernel_num; ++ } ++ ++ for (nfds_t i = 0; i < nfds; i++) { ++ if (fds[i].revents == 0 && wakeup->last_fds[i + 1].revents != 0) { ++ fds[i].revents = wakeup->last_fds[i + 1].revents; + } ++ } + +- wakeup_poll.have_kernel_event = false; +- if (timeout < 0) { +- ret = sem_wait(&wakeup_poll.event_sem); +- } else { +- ret = sem_timedwait(&wakeup_poll.event_sem, &poll_time); ++ if (wakeup->last_fds[0].revents) { ++ if (lwip_num == 0) { ++ lwip_num = poll_lwip_event(fds, nfds); + } +- } while (ret == 0); ++ kernel_num--; ++ } + +- return event_num; ++ return kernel_num + lwip_num; + } +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 3f21a3a..bb5a7e5 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -309,7 +309,7 @@ static void do_lwip_send(int32_t fd, struct lwip_sock *sock, int32_t flags) + if (len == 0) { + /* FIXME: should use POLLRDHUP, when connection be closed. lwip event-callback no POLLRDHUP */ + sock->errevent = 1; +- add_epoll_event(sock->conn, EPOLLERR); ++ add_sock_event(sock, EPOLLERR); + } + + if (gazelle_ring_readable_count(sock->send_ring) < SOCK_SEND_REPLENISH_THRES) { +@@ -317,7 +317,7 @@ static void do_lwip_send(int32_t fd, struct lwip_sock *sock, int32_t flags) + } + + if ((sock->epoll_events & EPOLLOUT) && NETCONN_IS_OUTIDLE(sock)) { +- add_epoll_event(sock->conn, EPOLLOUT); ++ add_sock_event(sock, EPOLLOUT); + } + } + +@@ -678,9 +678,9 @@ void read_recv_list(struct protocol_stack *stack, uint32_t max_num) + if (len == 0) { + /* FIXME: should use POLLRDHUP, when connection be closed. lwip event-callback no POLLRDHUP */ + sock->errevent = 1; +- add_epoll_event(sock->conn, EPOLLERR); ++ add_sock_event(sock, EPOLLERR); + } else if (len > 0) { +- add_epoll_event(sock->conn, EPOLLIN); ++ add_sock_event(sock, EPOLLIN); + } + + /* last_node:recv only once per sock. max_num avoid cost too much time this loop */ +@@ -690,6 +690,23 @@ void read_recv_list(struct protocol_stack *stack, uint32_t max_num) + } + } + ++void gazelle_connected_callback(struct netconn *conn) ++{ ++ if (conn == NULL) { ++ return; ++ } ++ ++ int32_t fd = conn->socket; ++ struct lwip_sock *sock = get_socket_by_fd(fd); ++ if (sock == NULL || sock->conn == NULL) { ++ return; ++ } ++ ++ SET_CONN_TYPE_LIBOS(conn); ++ ++ add_sock_event(sock, EPOLLOUT); ++} ++ + static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const struct tcp_pcb *pcb) + { + struct netconn *netconn = (struct netconn *)pcb->callback_arg; +@@ -711,10 +728,6 @@ static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const s + conn->recv_ring_cnt += (sock->recv_lastdata) ? 1 : 0; + + conn->send_ring_cnt = gazelle_ring_readover_count(sock->send_ring); +- +- if (sock->wakeup) { +- sem_getvalue(&sock->wakeup->event_sem, &conn->sem_cnt); +- } + } + } + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 1dc6c3f..6119975 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -262,6 +262,8 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + stack->cpu_id = get_global_cfg_params()->cpus[queue_id]; + stack->lwip_stats = &lwip_stats; + ++ pthread_spin_init(&stack->wakeup_list_lock, PTHREAD_PROCESS_PRIVATE); ++ + init_list_node(&stack->recv_list); + init_list_node(&stack->send_list); + +@@ -292,70 +294,6 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + return 0; + } + +-static void* gazelle_kernel_event(void *arg) +-{ +- uint16_t queue_id = *(uint16_t *)arg; +- struct protocol_stack *stack = get_protocol_stack_group()->stacks[queue_id]; +- +- bind_to_stack_numa(stack); +- +- int32_t epoll_fd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); +- if (epoll_fd < 0) { +- LSTACK_LOG(ERR, LSTACK, "queue_id=%hu epoll_fd=%d errno=%d\n", queue_id, epoll_fd, errno); +- /* exit in main thread, avoid create mempool and exit at the same time */ +- set_init_fail(); +- stack->epollfd = -1; +- return NULL; +- } +- +- stack->epollfd = epoll_fd; +- +- LSTACK_LOG(INFO, LSTACK, "kernel_event_%02hu start\n", queue_id); +- +- struct epoll_event events[KERNEL_EPOLL_MAX]; +- for (;;) { +- int32_t event_num = posix_api->epoll_wait_fn(epoll_fd, events, KERNEL_EPOLL_MAX, -1); +- if (event_num <= 0) { +- continue; +- } +- +- for (int32_t i = 0; i < event_num; i++) { +- struct wakeup_poll *wakeup = events[i].data.ptr; +- if (wakeup) { +- __atomic_store_n(&wakeup->have_kernel_event, true, __ATOMIC_RELEASE); +- sem_post(&wakeup->event_sem); +- } +- } +- } +- +- return NULL; +-} +- +-static int32_t create_companion_thread(struct protocol_stack_group *stack_group, struct protocol_stack *stack) +-{ +- int32_t ret; +- +- ret = create_thread(stack->queue_id, "gazellekernel", gazelle_kernel_event); +- if (ret != 0) { +- LSTACK_LOG(ERR, LSTACK, "gazellekernelEvent ret=%d errno=%d\n", ret, errno); +- return ret; +- } +- +- /* wait gazelle_kernel_event finish use stack.avoid use stack after free when create gazelle_weakup_thread fail */ +- while (stack->epollfd == 0) { +- usleep(1); +- } +- +- if (stack_group->wakeup_enable) { +- ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_wakeup_thread); +- if (ret != 0) { +- LSTACK_LOG(ERR, LSTACK, "gazelleweakup ret=%d errno=%d\n", ret, errno); +- } +- } +- +- return ret; +-} +- + void wait_sem_value(sem_t *sem, int32_t wait_value) + { + int32_t sem_val; +@@ -404,14 +342,29 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + return NULL; + } + +- if (create_companion_thread(stack_group, stack) != 0) { +- free(stack); +- return NULL; ++ if (stack_group->wakeup_enable) { ++ if (create_thread(stack->queue_id, "gazelleweakup", gazelle_wakeup_thread) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "gazelleweakup errno=%d\n", errno); ++ free(stack); ++ return NULL; ++ } + } + + return stack; + } + ++static void wakeup_stack_wait(struct protocol_stack *stack) ++{ ++ struct wakeup_poll *node = stack->wakeup_list; ++ while (node) { ++ if (node->have_event) { ++ wakeup_epoll(stack, node); ++ node->have_event = false; ++ } ++ node = node->next; ++ } ++} ++ + static void* gazelle_stack_thread(void *arg) + { + uint16_t queue_id = *(uint16_t *)arg; +@@ -437,6 +390,8 @@ static void* gazelle_stack_thread(void *arg) + + send_stack_list(stack, SEND_LIST_MAX); + ++ wakeup_stack_wait(stack); ++ + sys_timer_run(); + + if (get_global_cfg_params()->low_power_mod != 0) { +@@ -452,8 +407,6 @@ static int32_t init_protocol_sem(void) + int32_t ret; + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + +- pthread_spin_init(&stack_group->wakeup_list_lock, PTHREAD_PROCESS_PRIVATE); +- + if (!use_ltran()) { + ret = sem_init(&stack_group->ethdev_init, 0, 0); + if (ret < 0) { +@@ -484,7 +437,6 @@ int32_t init_protocol_stack(void) + + stack_group->stack_num = get_global_cfg_params()->num_cpu; + stack_group->wakeup_enable = (get_global_cfg_params()->num_wakeup > 0) ? true : false; +- stack_group->wakeup_list = NULL; + + if (init_protocol_sem() != 0) { + return -1; +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index e8c5bc3..245bcd7 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -92,25 +92,21 @@ static void set_latency_start_flag(bool start) + } + } + +-void register_wakeup(struct wakeup_poll *wakeup) ++void register_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup) + { +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- +- pthread_spin_lock(&stack_group->wakeup_list_lock); ++ pthread_spin_lock(&stack->wakeup_list_lock); + +- wakeup->next = stack_group->wakeup_list; +- stack_group->wakeup_list = wakeup; ++ wakeup->next = stack->wakeup_list; ++ stack->wakeup_list = wakeup; + +- pthread_spin_unlock(&stack_group->wakeup_list_lock); ++ pthread_spin_unlock(&stack->wakeup_list_lock); + } + +-void unregister_wakeup(struct wakeup_poll *wakeup) ++void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup) + { +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ pthread_spin_lock(&stack->wakeup_list_lock); + +- pthread_spin_lock(&stack_group->wakeup_list_lock); +- +- struct wakeup_poll *node = stack_group->wakeup_list; ++ struct wakeup_poll *node = stack->wakeup_list; + struct wakeup_poll *pre = NULL; + + while (node && node != wakeup) { +@@ -119,26 +115,24 @@ void unregister_wakeup(struct wakeup_poll *wakeup) + } + + if (node == NULL) { +- pthread_spin_unlock(&stack_group->wakeup_list_lock); ++ pthread_spin_unlock(&stack->wakeup_list_lock); + return; + } + + if (pre) { + pre->next = node->next; + } else { +- stack_group->wakeup_list = node->next; ++ stack->wakeup_list = node->next; + } + +- pthread_spin_unlock(&stack_group->wakeup_list_lock); ++ pthread_spin_unlock(&stack->wakeup_list_lock); + } + + static void get_wakeup_stat(struct protocol_stack *stack, struct gazelle_wakeup_stat *stat) + { +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- +- pthread_spin_lock(&stack_group->wakeup_list_lock); ++ pthread_spin_lock(&stack->wakeup_list_lock); + +- struct wakeup_poll *node = stack_group->wakeup_list; ++ struct wakeup_poll *node = stack->wakeup_list; + while (node) { + if (node->bind_stack == stack) { + stat->app_events += node->stat.app_events; +@@ -151,7 +145,7 @@ static void get_wakeup_stat(struct protocol_stack *stack, struct gazelle_wakeup_ + node = node->next; + } + +- pthread_spin_unlock(&stack_group->wakeup_list_lock); ++ pthread_spin_unlock(&stack->wakeup_list_lock); + } + + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info) +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 36340ab..0a060b4 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -50,6 +50,8 @@ struct protocol_stack { + struct reg_ring_msg *reg_buf; + + volatile bool low_power; ++ struct wakeup_poll *wakeup_list; ++ pthread_spinlock_t wakeup_list_lock; + lockless_queue rpc_queue __rte_cache_aligned; + char pad __rte_cache_aligned; + +@@ -84,8 +86,6 @@ struct protocol_stack_group { + /* dfx stats */ + bool latency_start; + uint64_t call_alloc_fail; +- pthread_spinlock_t wakeup_list_lock; +- struct wakeup_poll *wakeup_list __rte_cache_aligned; + }; + + long get_stack_tid(void); +diff --git a/src/lstack/include/lstack_stack_stat.h b/src/lstack/include/lstack_stack_stat.h +index aacade1..98ffe8f 100644 +--- a/src/lstack/include/lstack_stack_stat.h ++++ b/src/lstack/include/lstack_stack_stat.h +@@ -17,6 +17,7 @@ struct gazelle_stack_latency; + struct pbuf; + struct gazelle_stat_low_power_info; + struct wakeup_poll; ++struct protocol_stack; + enum GAZELLE_LATENCY_TYPE; + enum GAZELLE_STAT_MODE; + +@@ -26,7 +27,7 @@ void stack_stat_init(void); + int32_t handle_stack_cmd(int fd, enum GAZELLE_STAT_MODE stat_mode); + uint64_t get_current_time(void); + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info); +-void register_wakeup(struct wakeup_poll *wakeup); +-void unregister_wakeup(struct wakeup_poll *wakeup); ++void register_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup); ++void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup); + + #endif /* GAZELLE_STACK_STAT_H */ +diff --git a/src/lstack/include/posix/lstack_epoll.h b/src/lstack/include/posix/lstack_epoll.h +index a94b49f..5799028 100644 +--- a/src/lstack/include/posix/lstack_epoll.h ++++ b/src/lstack/include/posix/lstack_epoll.h +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + + #include + +@@ -35,24 +36,24 @@ enum wakeup_type { + struct protocol_stack; + struct wakeup_poll { + /* stack thread read frequently */ +- sem_t event_sem __rte_cache_aligned; +- enum wakeup_type type __rte_cache_aligned; +- volatile bool have_kernel_event __rte_cache_aligned; +- struct gazelle_wakeup_stat stat __rte_cache_aligned; ++ int32_t eventfd; ++ enum wakeup_type type; ++ bool have_event; ++ volatile bool in_wait __rte_cache_aligned; + char pad __rte_cache_aligned; + +- bool init; ++ struct gazelle_wakeup_stat stat; + struct protocol_stack *bind_stack; +- int32_t epollfd; /* epoll kernel fd, ctl add into gazelle_kernel_event thread */ + struct wakeup_poll *next; + + /* poll */ + struct pollfd *last_fds; + nfds_t last_nfds; + nfds_t last_max_nfds; +- struct epoll_event *events; + + /* epoll */ ++ int32_t epollfd; /* epoll kernel fd */ ++ bool have_kernel_fd; + int32_t stack_fd_cnt[PROTOCOL_STACK_MAX]; + struct protocol_stack *max_stack; + struct list_node event_list; +@@ -60,7 +61,9 @@ struct wakeup_poll { + }; + + struct netconn; +-void add_epoll_event(struct netconn *conn, uint32_t event); ++struct lwip_sock; ++void add_sock_event(struct lwip_sock *sock, uint32_t event); ++void wakeup_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup); + int32_t lstack_epoll_create(int32_t size); + int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); + int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event *events, int32_t maxevents, int32_t timeout); +-- +2.23.0 + diff --git a/0101-conenct-support-multi-queues.patch b/0101-conenct-support-multi-queues.patch new file mode 100644 index 0000000..dd1469a --- /dev/null +++ b/0101-conenct-support-multi-queues.patch @@ -0,0 +1,113 @@ +From e55108b4f3ef13556075da292d1dc17a3335fb04 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 17:57:40 +0800 +Subject: [PATCH 17/21] conenct support multi queues + +--- + src/lstack/core/lstack_dpdk.c | 38 ++++++++++++++++++++-- + src/lstack/include/lstack_dpdk.h | 1 + + src/lstack/include/lstack_protocol_stack.h | 2 ++ + 3 files changed, 39 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 8d45838..39f8ecf 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -55,6 +56,15 @@ struct eth_params { + struct rte_kni; + static struct rte_bus *g_pci_bus = NULL; + ++#define RSS_HASH_KEY_LEN 40 ++static uint8_t g_default_rss_key[] = { ++ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, ++ 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, ++ 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, ++ 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, ++ 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, ++}; ++ + int32_t thread_affinity_default(void) + { + static cpu_set_t cpuset; +@@ -327,8 +337,8 @@ static int eth_params_rss(struct rte_eth_conf *conf, struct rte_eth_dev_info *de + int rss_enable = 0; + uint64_t def_rss_hf = ETH_RSS_TCP | ETH_RSS_IP; + struct rte_eth_rss_conf rss_conf = { +- NULL, +- 40, ++ g_default_rss_key, ++ RSS_HASH_KEY_LEN, + def_rss_hf, + }; + +@@ -445,7 +455,9 @@ int32_t dpdk_ethdev_init(void) + + if (rss_enable) { + rss_setup(port_id, nb_queues); ++ stack_group->reta_mask = dev_info.reta_size - 1; + } ++ stack_group->nb_queues = nb_queues; + + return 0; + } +@@ -571,3 +583,25 @@ int32_t init_dpdk_ethdev(void) + sem_post(&stack_group->ethdev_init); + return 0; + } ++ ++bool port_in_stack_queue(uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ if (stack_group->reta_mask == 0 || stack_group->nb_queues <= 1) { ++ return true; ++ } ++ ++ struct rte_ipv4_tuple tuple = {0}; ++ tuple.src_addr = rte_be_to_cpu_32(src_ip); ++ tuple.dst_addr = rte_be_to_cpu_32(dst_ip); ++ tuple.sport = src_port; ++ tuple.dport = dst_port; ++ ++ uint32_t hash = rte_softrss((uint32_t *)&tuple, RTE_THASH_V4_L4_LEN, g_default_rss_key); ++ ++ uint32_t reta_index = hash & stack_group->reta_mask; ++ ++ struct protocol_stack *stack = get_protocol_stack(); ++ return (reta_index % stack_group->nb_queues) == stack->queue_id; ++} ++ +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index e224f23..684d025 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -51,5 +51,6 @@ int dpdk_ethdev_start(void); + void dpdk_skip_nic_init(void); + int32_t dpdk_init_lstack_kni(void); + void dpdk_restore_pci(void); ++bool port_in_stack_queue(uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port); + + #endif /* GAZELLE_DPDK_H */ +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 0a060b4..a791357 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -78,6 +78,8 @@ struct protocol_stack_group { + sem_t all_init; + uint64_t rx_offload; + uint64_t tx_offload; ++ uint32_t reta_mask; ++ uint16_t nb_queues; + struct rte_mempool *kni_pktmbuf_pool; + struct eth_params *eth_params; + struct protocol_stack *stacks[PROTOCOL_STACK_MAX]; +-- +2.23.0 + diff --git a/0102-merge-sendmsg-write.patch b/0102-merge-sendmsg-write.patch new file mode 100644 index 0000000..1abe335 --- /dev/null +++ b/0102-merge-sendmsg-write.patch @@ -0,0 +1,275 @@ +From b87de3f10de1839e8dccc64ba01b3d4b7bd114c3 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 19:50:42 +0800 +Subject: [PATCH 18/21] merge sendmsg write + +--- + src/lstack/core/lstack_lwip.c | 92 ++++++++++++++++------ + src/lstack/core/lstack_protocol_stack.c | 16 ---- + src/lstack/core/lstack_thread_rpc.c | 32 +------- + src/lstack/include/lstack_protocol_stack.h | 2 - + src/lstack/include/lstack_thread_rpc.h | 2 - + 5 files changed, 68 insertions(+), 76 deletions(-) + +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index bb5a7e5..f4128d7 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -259,6 +259,22 @@ static inline void del_data_out_event(struct lwip_sock *sock) + pthread_spin_unlock(&sock->wakeup->event_list_lock); + } + ++void write_stack_over(struct lwip_sock *sock) ++{ ++ if (sock->send_lastdata) { ++ sock->send_lastdata->tot_len = sock->send_lastdata->len = sock->send_datalen; ++ sock->send_lastdata = NULL; ++ } ++ ++ gazelle_ring_read_over(sock->send_ring); ++ ++ if (sock->wakeup) { ++ if (sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT)) { ++ del_data_out_event(sock); ++ } ++ } ++} ++ + ssize_t write_stack_data(struct lwip_sock *sock, const void *buf, size_t len) + { + if (sock->errevent > 0) { +@@ -272,31 +288,37 @@ ssize_t write_stack_data(struct lwip_sock *sock, const void *buf, size_t len) + + struct pbuf *pbuf = NULL; + ssize_t send_len = 0; +- size_t copy_len; + uint32_t send_pkt = 0; + + while (send_len < len && send_pkt < free_count) { +- if (gazelle_ring_read(sock->send_ring, (void **)&pbuf, 1) != 1) { +- if (sock->wakeup) { +- sock->wakeup->stat.app_write_idlefail++; ++ if (sock->send_lastdata) { ++ pbuf = sock->send_lastdata; ++ } else { ++ if (gazelle_ring_read(sock->send_ring, (void **)&pbuf, 1) != 1) { ++ if (sock->wakeup) { ++ sock->wakeup->stat.app_write_idlefail++; ++ } ++ break; + } +- break; ++ sock->send_lastdata = pbuf; ++ sock->send_datalen = 0; + } + +- copy_len = (len - send_len > pbuf->len) ? pbuf->len : (len - send_len); +- pbuf_take(pbuf, (char *)buf + send_len, copy_len); +- pbuf->tot_len = pbuf->len = copy_len; ++ uint16_t remian_len = pbuf->len - sock->send_datalen; ++ uint16_t copy_len = (len - send_len > remian_len) ? remian_len : (len - send_len); ++ pbuf_take_at(pbuf, (char *)buf + send_len, copy_len, sock->send_datalen); ++ sock->send_datalen += copy_len; ++ if (sock->send_datalen >= pbuf->len) { ++ sock->send_lastdata = NULL; ++ pbuf->tot_len = pbuf->len = sock->send_datalen; ++ send_pkt++; ++ } + + send_len += copy_len; +- send_pkt++; + } +- gazelle_ring_read_over(sock->send_ring); + + if (sock->wakeup) { + sock->wakeup->stat.app_write_cnt += send_pkt; +- if (sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT)) { +- del_data_out_event(sock); +- } + } + + return send_len; +@@ -500,6 +522,16 @@ ssize_t recvmsg_from_stack(int32_t s, struct msghdr *message, int32_t flags) + return buflen; + } + ++static inline void notice_stack_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags) ++{ ++ if (__atomic_load_n(&sock->in_send, __ATOMIC_ACQUIRE) == 0) { ++ __atomic_store_n(&sock->in_send, 1, __ATOMIC_RELEASE); ++ if (rpc_call_send(fd, NULL, len, flags) != 0) { ++ __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); ++ } ++ } ++} ++ + ssize_t gazelle_send(int32_t fd, const void *buf, size_t len, int32_t flags) + { + if (buf == NULL) { +@@ -516,18 +548,12 @@ ssize_t gazelle_send(int32_t fd, const void *buf, size_t len, int32_t flags) + } + + ssize_t send = write_stack_data(sock, buf, len); +- if (send < 0) { +- GAZELLE_RETURN(EAGAIN); +- } else if (send == 0) { +- return 0; ++ if (send <= 0) { ++ return send; + } ++ write_stack_over(sock); + +- if (__atomic_load_n(&sock->in_send, __ATOMIC_ACQUIRE) == 0) { +- __atomic_store_n(&sock->in_send, 1, __ATOMIC_RELEASE); +- if (rpc_call_send(fd, NULL, send, flags) != 0) { +- __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); +- } +- } ++ notice_stack_send(sock, fd, send, flags); + return send; + } + +@@ -537,23 +563,37 @@ ssize_t sendmsg_to_stack(int32_t s, const struct msghdr *message, int32_t flags) + int32_t i; + ssize_t buflen = 0; + ++ struct lwip_sock *sock = get_socket(s); ++ if (sock == NULL) { ++ GAZELLE_RETURN(EINVAL); ++ } ++ + if (check_msg_vaild(message)) { + GAZELLE_RETURN(EINVAL); + } + + for (i = 0; i < message->msg_iovlen; i++) { +- ret = gazelle_send(s, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len, flags); ++ if (message->msg_iov[i].iov_len == 0) { ++ continue; ++ } ++ ++ ret = write_stack_data(sock, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len); + if (ret < 0) { +- return buflen == 0 ? ret : buflen; ++ buflen = (buflen == 0) ? ret : buflen; ++ break; + } + + buflen += ret; + + if (ret < message->msg_iov[i].iov_len) { +- return buflen; ++ break; + } + } + ++ if (buflen > 0) { ++ write_stack_over(sock); ++ notice_stack_send(sock, s, buflen, flags); ++ } + return buflen; + } + +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 6119975..fbeca62 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -608,22 +608,6 @@ void stack_recv(struct rpc_msg *msg) + msg->args[MSG_ARG_3].i); + } + +-void stack_sendmsg(struct rpc_msg *msg) +-{ +- msg->result = lwip_sendmsg(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].cp, msg->args[MSG_ARG_2].i); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d fail %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- +-void stack_recvmsg(struct rpc_msg *msg) +-{ +- msg->result = lwip_recvmsg(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].i); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d fail %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- + /* any protocol stack thread receives arp packet and sync it to other threads so that it can have the arp table */ + void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cur_stack) + { +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index d1f7580..ad967e9 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -236,7 +236,8 @@ int32_t rpc_call_arp(struct protocol_stack *stack, struct rte_mbuf *mbuf) + + msg->self_release = 0; + msg->args[MSG_ARG_0].p = mbuf; +- lockless_queue_mpsc_push(&stack->rpc_queue, &msg->queue_node); ++ ++ rpc_call(&stack->rpc_queue, msg); + + return 0; + } +@@ -446,32 +447,3 @@ int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags) + return 0; + } + +-int32_t rpc_call_sendmsg(int fd, const struct msghdr *msghdr, int flags) +-{ +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); +- struct rpc_msg *msg = rpc_msg_alloc(stack, stack_sendmsg); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].cp = msghdr; +- msg->args[MSG_ARG_2].i = flags; +- +- return rpc_sync_call(&stack->rpc_queue, msg); +-} +- +-int32_t rpc_call_recvmsg(int fd, struct msghdr *msghdr, int flags) +-{ +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); +- struct rpc_msg *msg = rpc_msg_alloc(stack, stack_recvmsg); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].p = msghdr; +- msg->args[MSG_ARG_2].i = flags; +- +- return rpc_sync_call(&stack->rpc_queue, msg); +-} +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index a791357..f58ae56 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -124,8 +124,6 @@ void stack_listen(struct rpc_msg *msg); + void stack_accept(struct rpc_msg *msg); + void stack_connect(struct rpc_msg *msg); + void stack_recv(struct rpc_msg *msg); +-void stack_sendmsg(struct rpc_msg *msg); +-void stack_recvmsg(struct rpc_msg *msg); + void stack_getpeername(struct rpc_msg *msg); + void stack_getsockname(struct rpc_msg *msg); + void stack_getsockopt(struct rpc_msg *msg); +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index e1223de..175c8c9 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -67,8 +67,6 @@ int32_t rpc_call_listen(int s, int backlog); + int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen); + int32_t rpc_call_connect(int fd, const struct sockaddr *addr, socklen_t addrlen); + int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags); +-int32_t rpc_call_sendmsg(int fd, const struct msghdr *msg, int flags); +-int32_t rpc_call_recvmsg(int fd, struct msghdr *msg, int flags); + int32_t rpc_call_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen); + int32_t rpc_call_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen); + int32_t rpc_call_getsockopt(int fd, int level, int optname, void *optval, socklen_t *optlen); +-- +2.23.0 + diff --git a/0103-add-thread-select-path.patch b/0103-add-thread-select-path.patch new file mode 100644 index 0000000..5a2660b --- /dev/null +++ b/0103-add-thread-select-path.patch @@ -0,0 +1,160 @@ +From b61caf98b117313d1c2d59c1954d494901adcb73 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 20:39:48 +0800 +Subject: [PATCH 19/21] add thread select path + +--- + src/lstack/api/lstack_wrap.c | 14 ++++++--- + src/lstack/core/lstack_init.c | 54 +++++++++++++++++++++++++++++++++++ + 2 files changed, 64 insertions(+), 4 deletions(-) + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 296906e..d88513b 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -42,9 +42,11 @@ enum KERNEL_LWIP_PATH { + PATH_UNKNOW, + }; + ++bool select_thread_path(void); ++ + static enum KERNEL_LWIP_PATH select_path(int fd) + { +- if (posix_api == NULL) { ++ if (unlikely(posix_api == NULL)) { + /* posix api maybe call before gazelle init */ + if (posix_api_init() != 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); +@@ -52,6 +54,10 @@ static enum KERNEL_LWIP_PATH select_path(int fd) + return PATH_KERNEL; + } + ++ if (!select_thread_path()) { ++ return PATH_KERNEL; ++ } ++ + if (unlikely(posix_api->ues_posix)) { + return PATH_KERNEL; + } +@@ -93,7 +99,7 @@ static inline int32_t do_epoll_create(int32_t size) + + static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event* event) + { +- if (unlikely(posix_api->ues_posix)) { ++ if (unlikely(posix_api->ues_posix) || !select_thread_path()) { + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + +@@ -102,7 +108,7 @@ static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct + + static inline int32_t do_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) + { +- if (unlikely(posix_api->ues_posix)) { ++ if (unlikely(posix_api->ues_posix) || !select_thread_path()) { + return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +@@ -369,7 +375,7 @@ static int32_t do_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + GAZELLE_RETURN(EINVAL); + } + +- if (unlikely(posix_api->ues_posix) || nfds == 0) { ++ if (unlikely(posix_api->ues_posix) || nfds == 0 || !select_thread_path()) { + return posix_api->poll_fn(fds, nfds, timeout); + } + +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index b1c69e6..f647b8e 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -48,8 +48,10 @@ + #define LSTACK_SO_NAME "liblstack.so" + #define LSTACK_PRELOAD_NAME_LEN PATH_MAX + #define LSTACK_PRELOAD_ENV_PROC "GAZELLE_BIND_PROCNAME" ++#define LSTACK_ENV_THREAD "GAZELLE_THREAD_NAME" + + static volatile bool g_init_fail = false; ++static PER_THREAD int32_t g_thread_path = -1; + + void set_init_fail(void) + { +@@ -64,14 +66,34 @@ bool get_init_fail(void) + struct lstack_preload { + int32_t preload_switch; + char env_procname[LSTACK_PRELOAD_NAME_LEN]; ++ bool get_thread_name; ++ char env_threadname[LSTACK_PRELOAD_NAME_LEN]; + }; + static struct lstack_preload g_preload_info = {0}; + ++static void get_select_thread_name(void) ++{ ++ g_preload_info.get_thread_name = true; ++ ++ char *enval = NULL; ++ enval = getenv(LSTACK_ENV_THREAD); ++ if (enval == NULL) { ++ return; ++ } ++ if (strcpy_s(g_preload_info.env_threadname, LSTACK_PRELOAD_NAME_LEN, enval) != EOK) { ++ return; ++ } ++ ++ LSTACK_PRE_LOG(LSTACK_INFO, "thread name=%s ok\n", g_preload_info.env_threadname); ++} ++ + static int32_t preload_info_init(void) + { + char *enval = NULL; + + g_preload_info.preload_switch = 0; ++ ++ get_select_thread_name(); + + enval = getenv(LSTACK_PRELOAD_ENV_SYS); + if (enval == NULL) { +@@ -91,9 +113,41 @@ static int32_t preload_info_init(void) + } + + g_preload_info.preload_switch = 1; ++ LSTACK_PRE_LOG(LSTACK_INFO, "LD_PRELOAD ok\n"); + return 0; + } + ++bool select_thread_path(void) ++{ ++ if (g_thread_path >= 0) { ++ return g_thread_path; ++ } ++ ++ if (!g_preload_info.get_thread_name) { ++ get_select_thread_name(); ++ } ++ ++ /* not set GAZELLE_THREAD_NAME, select all thread */ ++ if (g_preload_info.env_threadname[0] == '\0') { ++ g_thread_path = 1; ++ return true; ++ } ++ ++ char thread_name[PATH_MAX] = {0}; ++ if (pthread_getname_np(pthread_self(), thread_name, PATH_MAX) != 0) { ++ g_thread_path = 0; ++ return false; ++ } ++ ++ if (strstr(thread_name, g_preload_info.env_threadname) == NULL) { ++ g_thread_path = 0; ++ return false; ++ } ++ ++ g_thread_path = 1; ++ return true; ++} ++ + static int32_t check_process_conflict(void) + { + int32_t ret; +-- +2.23.0 + diff --git a/0104-support-conf-control-app-bind-numa.patch b/0104-support-conf-control-app-bind-numa.patch new file mode 100644 index 0000000..8aab09b --- /dev/null +++ b/0104-support-conf-control-app-bind-numa.patch @@ -0,0 +1,99 @@ +From 8a148ade17e4c73cd768ad81b30096139898bbe7 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 21:03:58 +0800 +Subject: [PATCH 20/21] support conf control app bind numa + +--- + src/lstack/api/lstack_epoll.c | 9 +++++++-- + src/lstack/core/lstack_cfg.c | 20 +++++++++++++++++++- + src/lstack/include/lstack_cfg.h | 1 + + 3 files changed, 27 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index cc1cbf0..bd3396a 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -343,7 +343,9 @@ static void epoll_bind_statck(struct wakeup_poll *wakeup) + } + + if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { +- bind_to_stack_numa(wakeup->max_stack); ++ if (get_global_cfg_params()->app_bind_numa) { ++ bind_to_stack_numa(wakeup->max_stack); ++ } + if (wakeup->bind_stack) { + unregister_wakeup(wakeup->bind_stack, wakeup); + } +@@ -461,7 +463,10 @@ static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) + if (wakeup->bind_stack) { + unregister_wakeup(wakeup->bind_stack, wakeup); + } +- bind_to_stack_numa(stack_group->stacks[bind_id]); ++ ++ if (get_global_cfg_params()->app_bind_numa) { ++ bind_to_stack_numa(stack_group->stacks[bind_id]); ++ } + wakeup->bind_stack = stack_group->stacks[bind_id]; + register_wakeup(wakeup->bind_stack, wakeup); + } +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 19a5b30..2cd9989 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -55,6 +55,7 @@ static int32_t parse_dpdk_args(void); + static int32_t parse_gateway_addr(void); + static int32_t parse_kni_switch(void); + static int32_t parse_listen_shadow(void); ++static int32_t parse_app_bind_numa(void); + + struct config_vector_t { + const char *name; +@@ -71,8 +72,9 @@ static struct config_vector_t g_config_tbl[] = { + { "num_cpus", parse_stack_cpu_number }, + { "num_wakeup", parse_wakeup_cpu_number }, + { "low_power_mode", parse_low_power_mode }, +- { "kni_switch", parse_kni_switch }, ++ { "kni_switch", parse_kni_switch }, + { "listen_shadow", parse_listen_shadow }, ++ { "app_bind_numa", parse_app_bind_numa }, + { NULL, NULL } + }; + +@@ -710,6 +712,22 @@ static int32_t parse_listen_shadow(void) + return 0; + } + ++static int32_t parse_app_bind_numa(void) ++{ ++ const config_setting_t *arg = NULL; ++ ++ arg = config_lookup(&g_config, "app_bind_numa"); ++ if (arg == NULL) { ++ g_config_params.app_bind_numa = true; ++ return 0; ++ } ++ ++ int32_t val = config_setting_get_int(arg); ++ g_config_params.app_bind_numa = (val == 0) ? false : true; ++ ++ return 0; ++} ++ + static int32_t parse_kni_switch(void) + { + const config_setting_t *arg = NULL; +diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h +index 1a6ef39..80dfd24 100644 +--- a/src/lstack/include/lstack_cfg.h ++++ b/src/lstack/include/lstack_cfg.h +@@ -76,6 +76,7 @@ struct cfg_params { + bool use_ltran; // ture:lstack read from nic false:read form ltran + bool kni_switch; + bool listen_shadow; // true:listen in all stack thread. false:listen in one stack thread. ++ bool app_bind_numa; + int dpdk_argc; + char **dpdk_argv; + struct secondary_attach_arg sec_attach_arg; +-- +2.23.0 + diff --git a/0105-fix-epoll_wait-cover-kernel-event.patch b/0105-fix-epoll_wait-cover-kernel-event.patch new file mode 100644 index 0000000..9be5f4b --- /dev/null +++ b/0105-fix-epoll_wait-cover-kernel-event.patch @@ -0,0 +1,130 @@ +From 05fe4efd5ff233edcf39a3abc1c4ef048f8173b6 Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sat, 8 Oct 2022 21:50:19 +0800 +Subject: [PATCH 21/21] fix epoll_wait cover kernel event + +--- + src/lstack/api/lstack_epoll.c | 6 +++--- + src/lstack/core/lstack_lwip.c | 8 +++++--- + src/lstack/core/lstack_stack_stat.c | 12 +++++------- + src/ltran/ltran_forward.c | 13 ++++++------- + 4 files changed, 19 insertions(+), 20 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index bd3396a..6ac245c 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -137,7 +137,7 @@ int32_t lstack_epoll_create(int32_t size) + event.data.fd = wakeup->eventfd; + event.events = EPOLLIN | EPOLLET; + if (posix_api->epoll_ctl_fn(fd, EPOLL_CTL_ADD, wakeup->eventfd, &event) < 0) { +- LSTACK_LOG(ERR, LSTACK, "eventfd errno=%d\n", errno); ++ LSTACK_LOG(ERR, LSTACK, "ctl eventfd errno=%d\n", errno); + posix_api->close_fn(fd); + free(wakeup); + GAZELLE_RETURN(EINVAL); +@@ -403,10 +403,10 @@ int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxe + } + + if (del_event_fd(&events[lwip_num], kernel_num, wakeup->eventfd)) { ++ kernel_num--; + if (lwip_num == 0) { +- lwip_num = epoll_lwip_event(wakeup, events, maxevents); ++ lwip_num = epoll_lwip_event(wakeup, &events[kernel_num], maxevents - kernel_num); + } +- kernel_num--; + } + + return lwip_num + kernel_num; +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index f4128d7..60e30d9 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -359,7 +359,7 @@ void stack_send(struct rpc_msg *msg) + __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); + rte_mb(); + +- if (!NETCONN_IS_DATAOUT(sock)) { ++ if (!NETCONN_IS_DATAOUT(sock) || sock->errevent > 0) { + return; + } + +@@ -387,7 +387,7 @@ void send_stack_list(struct protocol_stack *stack, uint32_t send_max) + __atomic_store_n(&sock->in_send, 0, __ATOMIC_RELEASE); + rte_mb(); + +- if (sock->conn == NULL || !NETCONN_IS_DATAOUT(sock)) { ++ if (sock->conn == NULL || sock->errevent > 0 || !NETCONN_IS_DATAOUT(sock)) { + list_del_node_null(&sock->send_list); + continue; + } +@@ -578,7 +578,7 @@ ssize_t sendmsg_to_stack(int32_t s, const struct msghdr *message, int32_t flags) + } + + ret = write_stack_data(sock, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len); +- if (ret < 0) { ++ if (ret <= 0) { + buflen = (buflen == 0) ? ret : buflen; + break; + } +@@ -742,6 +742,8 @@ void gazelle_connected_callback(struct netconn *conn) + return; + } + ++ posix_api->shutdown_fn(fd, SHUT_RDWR); ++ + SET_CONN_TYPE_LIBOS(conn); + + add_sock_event(sock, EPOLLOUT); +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index 245bcd7..b32f082 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -134,13 +134,11 @@ static void get_wakeup_stat(struct protocol_stack *stack, struct gazelle_wakeup_ + + struct wakeup_poll *node = stack->wakeup_list; + while (node) { +- if (node->bind_stack == stack) { +- stat->app_events += node->stat.app_events; +- stat->read_null += node->stat.read_null; +- stat->app_write_cnt += node->stat.app_write_cnt; +- stat->app_write_idlefail += node->stat.app_write_idlefail; +- stat->app_read_cnt += node->stat.app_read_cnt; +- } ++ stat->app_events += node->stat.app_events; ++ stat->read_null += node->stat.read_null; ++ stat->app_write_cnt += node->stat.app_write_cnt; ++ stat->app_write_idlefail += node->stat.app_write_idlefail; ++ stat->app_read_cnt += node->stat.app_read_cnt; + + node = node->next; + } +diff --git a/src/ltran/ltran_forward.c b/src/ltran/ltran_forward.c +index 9ca04aa..b73c983 100644 +--- a/src/ltran/ltran_forward.c ++++ b/src/ltran/ltran_forward.c +@@ -437,15 +437,14 @@ static __rte_always_inline void tcp_hash_table_add_conn(struct gazelle_stack *st + tcp_conn = gazelle_conn_get_by_quintuple(conn_htable, transfer_qtuple); + if (tcp_conn) { + /* When lstack is the server, conn is created in tcp_handle func. lwip send the connect command after +- * receiving syn, and delete conn timeout. */ +- if (tcp_conn->conn_timeout >= 0) { ++ * receiving syn, and delete conn timeout. */ ++ if (tcp_conn->conn_timeout >= 0) { + tcp_conn->conn_timeout = -1; +- return; +- } else { +- /* del old invaild conn */ ++ return; ++ } else { ++ /* del old invaild conn */ + gazelle_conn_del_by_quintuple(conn_htable, transfer_qtuple); +- printf("del old conn port=%d\n", ntohs(transfer_qtuple->dst_port)); +- } ++ } + } + + /* When lstack is the client, lwip send the connect command while calling connect func. conn is created +-- +2.23.0 + diff --git a/0106-fix-read-stack-data-return-0-when-no-data.patch b/0106-fix-read-stack-data-return-0-when-no-data.patch new file mode 100644 index 0000000..7fe953e --- /dev/null +++ b/0106-fix-read-stack-data-return-0-when-no-data.patch @@ -0,0 +1,45 @@ +From d039a4e91eb805ea688467e8e4643cbbcf81f35e Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sun, 9 Oct 2022 10:31:16 +0800 +Subject: [PATCH] fix read stack data return 0 when no data + +--- + src/lstack/core/lstack_lwip.c | 6 +++++- + src/ltran/ltran_tcp_conn.h | 1 + + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 60e30d9..8c1df26 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -642,10 +642,14 @@ ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags) + GAZELLE_RETURN(EINVAL); + } + +- if (sock->errevent > 0 && !NETCONN_IS_DATAIN(sock)) { ++ if (sock->errevent > 0) { + return 0; + } + ++ if (!NETCONN_IS_DATAIN(sock)) { ++ GAZELLE_RETURN(EAGAIN); ++ } ++ + while (recv_left > 0) { + if (sock->recv_lastdata) { + pbuf = sock->recv_lastdata; +diff --git a/src/ltran/ltran_tcp_conn.h b/src/ltran/ltran_tcp_conn.h +index 2b6e6ea..fa508bc 100644 +--- a/src/ltran/ltran_tcp_conn.h ++++ b/src/ltran/ltran_tcp_conn.h +@@ -15,6 +15,7 @@ + + #include + #include ++#include + #include + + #include "gazelle_opt.h" +-- +2.23.0 + diff --git a/0107-fix-stack-wakeup-node-del.patch b/0107-fix-stack-wakeup-node-del.patch new file mode 100644 index 0000000..d80d2af --- /dev/null +++ b/0107-fix-stack-wakeup-node-del.patch @@ -0,0 +1,79 @@ +From ef98d32ed9dbde08412c8d4587bfeb7a6e75ec7c Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sun, 9 Oct 2022 20:21:40 +0800 +Subject: [PATCH] fix stack wakeup node del + +--- + src/lstack/api/lstack_epoll.c | 4 ++-- + src/lstack/core/lstack_lwip.c | 2 +- + src/lstack/core/lstack_protocol_stack.c | 6 ++++++ + src/lstack/core/lstack_stack_stat.c | 1 + + 4 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 6ac245c..dcd58e7 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -164,8 +164,8 @@ int32_t lstack_epoll_close(int32_t fd) + } + + if (sock->wakeup) { +- if (sock->stack) { +- unregister_wakeup(sock->stack, sock->wakeup); ++ if (sock->wakeup->bind_stack) { ++ unregister_wakeup(sock->wakeup->bind_stack, sock->wakeup); + } + posix_api->close_fn(sock->wakeup->eventfd); + pthread_spin_destroy(&sock->wakeup->event_list_lock); +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 8c1df26..4fbaed1 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -486,7 +486,7 @@ static int32_t check_msg_vaild(const struct msghdr *message) + } + + for (int32_t i = 0; i < message->msg_iovlen; i++) { +- if ((message->msg_iov[i].iov_base == NULL) || ((ssize_t)message->msg_iov[i].iov_len <= 0) || ++ if ((message->msg_iov[i].iov_base == NULL) || ((ssize_t)message->msg_iov[i].iov_len < 0) || + ((size_t)(ssize_t)message->msg_iov[i].iov_len != message->msg_iov[i].iov_len) || + ((ssize_t)(buflen + (ssize_t)message->msg_iov[i].iov_len) <= 0)) { + GAZELLE_RETURN(EINVAL); +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index fbeca62..8283a41 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -355,6 +355,10 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + + static void wakeup_stack_wait(struct protocol_stack *stack) + { ++ if (pthread_spin_trylock(&stack->wakeup_list_lock)) { ++ return; ++ } ++ + struct wakeup_poll *node = stack->wakeup_list; + while (node) { + if (node->have_event) { +@@ -363,6 +367,8 @@ static void wakeup_stack_wait(struct protocol_stack *stack) + } + node = node->next; + } ++ ++ pthread_spin_unlock(&stack->wakeup_list_lock); + } + + static void* gazelle_stack_thread(void *arg) +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index b32f082..c011aed 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -124,6 +124,7 @@ void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup) + } else { + stack->wakeup_list = node->next; + } ++ node->next = NULL; + + pthread_spin_unlock(&stack->wakeup_list_lock); + } +-- +2.23.0 + diff --git a/0108-avoid-useless-stack-check-wakeup-event.patch b/0108-avoid-useless-stack-check-wakeup-event.patch new file mode 100644 index 0000000..8623c12 --- /dev/null +++ b/0108-avoid-useless-stack-check-wakeup-event.patch @@ -0,0 +1,60 @@ +From 315f57f4bed8aeaa375f7f326ab8723f21082e9a Mon Sep 17 00:00:00 2001 +From: wu-changsheng +Date: Sun, 9 Oct 2022 22:15:19 +0800 +Subject: [PATCH] avoid useless stack check wakeup event + +--- + src/lstack/api/lstack_epoll.c | 1 + + src/lstack/core/lstack_protocol_stack.c | 4 +++- + src/lstack/include/lstack_protocol_stack.h | 1 + + 3 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index dcd58e7..4da88b0 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -50,6 +50,7 @@ void add_sock_event(struct lwip_sock *sock, uint32_t event) + } + + wakeup->have_event = true; ++ sock->stack->have_event = true; + + if (wakeup->type == WAKEUP_POLL) { + return; +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 8283a41..71412b4 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -355,7 +355,7 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + + static void wakeup_stack_wait(struct protocol_stack *stack) + { +- if (pthread_spin_trylock(&stack->wakeup_list_lock)) { ++ if (!stack->have_event || pthread_spin_trylock(&stack->wakeup_list_lock)) { + return; + } + +@@ -369,6 +369,8 @@ static void wakeup_stack_wait(struct protocol_stack *stack) + } + + pthread_spin_unlock(&stack->wakeup_list_lock); ++ ++ stack->have_event = false; + } + + static void* gazelle_stack_thread(void *arg) +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index f58ae56..4249217 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -62,6 +62,7 @@ struct protocol_stack { + + struct list_node recv_list; + struct list_node send_list; ++ bool have_event; + + volatile uint16_t conn_num; + struct stats_ *lwip_stats; +-- +2.23.0 + diff --git a/0109-fix-mesg-loss.patch b/0109-fix-mesg-loss.patch new file mode 100644 index 0000000..4ab1195 --- /dev/null +++ b/0109-fix-mesg-loss.patch @@ -0,0 +1,32 @@ +From f9183f5d025f8eac168c4f66d4b96992ff4dff28 Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Thu, 13 Oct 2022 08:54:40 +0000 +Subject: [PATCH 1/3] fix mesg loss + +--- + src/lstack/core/lstack_lwip.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 4fbaed1..5ca0cc3 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -642,14 +642,10 @@ ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags) + GAZELLE_RETURN(EINVAL); + } + +- if (sock->errevent > 0) { ++ if (sock->errevent > 0 && !NETCONN_IS_DATAIN(sock)) { + return 0; + } + +- if (!NETCONN_IS_DATAIN(sock)) { +- GAZELLE_RETURN(EAGAIN); +- } +- + while (recv_left > 0) { + if (sock->recv_lastdata) { + pbuf = sock->recv_lastdata; +-- +2.23.0 + diff --git a/0110-add-accept4-and-epoll_create1.patch b/0110-add-accept4-and-epoll_create1.patch new file mode 100644 index 0000000..7e2cefd --- /dev/null +++ b/0110-add-accept4-and-epoll_create1.patch @@ -0,0 +1,232 @@ +From 840d501625839cc9c837f65c98baa0def5af3616 Mon Sep 17 00:00:00 2001 +From: compile_success <980965867@qq.com> +Date: Sat, 15 Oct 2022 12:06:56 +0000 +Subject: [PATCH 1/2] add accept4 and epoll_create1 + +--- + src/lstack/api/lstack_epoll.c | 15 +++++++++-- + src/lstack/api/lstack_wrap.c | 31 ++++++++++++++++++---- + src/lstack/core/lstack_lwip.c | 7 ++++- + src/lstack/core/lstack_protocol_stack.c | 11 +++++--- + src/lstack/core/lstack_thread_rpc.c | 3 ++- + src/lstack/include/lstack_protocol_stack.h | 1 + + src/lstack/include/lstack_thread_rpc.h | 2 +- + src/lstack/include/posix/lstack_epoll.h | 1 + + 8 files changed, 58 insertions(+), 13 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 4da88b0..1206e75 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -105,9 +105,8 @@ static void raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *s + } + } + +-int32_t lstack_epoll_create(int32_t size) ++int32_t lstack_do_epoll_create(int32_t fd) + { +- int32_t fd = posix_api->epoll_create_fn(size); + if (fd < 0) { + return fd; + } +@@ -154,6 +153,18 @@ int32_t lstack_epoll_create(int32_t size) + return fd; + } + ++int32_t lstack_epoll_create1(int32_t flags) ++{ ++ int32_t fd = posix_api->epoll_create1_fn(flags); ++ return lstack_do_epoll_create(fd); ++} ++ ++int32_t lstack_epoll_create(int32_t flags) ++{ ++ int32_t fd = posix_api->epoll_create_fn(flags); ++ return lstack_do_epoll_create(fd); ++} ++ + int32_t lstack_epoll_close(int32_t fd) + { + posix_api->close_fn(fd); +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index d88513b..4669a30 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -80,6 +80,23 @@ static enum KERNEL_LWIP_PATH select_path(int fd) + return PATH_UNKNOW; + } + ++static inline int32_t do_epoll_create1(int32_t flags) ++{ ++ if (posix_api == NULL) { ++ /* posix api maybe call before gazelle init */ ++ if (posix_api_init() != 0) { ++ LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); ++ } ++ return posix_api->epoll_create1_fn(flags); ++ } ++ ++ if (unlikely(posix_api->ues_posix)) { ++ return posix_api->epoll_create1_fn(flags); ++ } ++ ++ return lstack_epoll_create1(flags); ++} ++ + static inline int32_t do_epoll_create(int32_t size) + { + if (posix_api == NULL) { +@@ -147,11 +164,7 @@ static int32_t do_accept4(int32_t s, struct sockaddr *addr, socklen_t *addrlen, + return posix_api->accept4_fn(s, addr, addrlen, flags); + } + +- if ((flags & SOCK_CLOEXEC) == 0) { +- return 0; +- } +- +- int32_t fd = stack_broadcast_accept(s, addr, addrlen); ++ int32_t fd = stack_broadcast_accept4(s, addr, addrlen, flags); + if (fd >= 0) { + return fd; + } +@@ -432,6 +445,10 @@ static int32_t do_sigaction(int32_t signum, const struct sigaction *act, struct + * ------- LD_PRELOAD mode replacement interface -------- + * -------------------------------------------------------- + */ ++int32_t epoll_create1(int32_t flags) ++{ ++ return do_epoll_create1(flags); ++} + int32_t epoll_create(int32_t size) + { + return do_epoll_create(size); +@@ -550,6 +567,10 @@ pid_t fork(void) + * -------------------------------------------------------- + */ + ++int32_t __wrap_epoll_create1(int32_t size) ++{ ++ return do_epoll_create1(size); ++} + int32_t __wrap_epoll_create(int32_t size) + { + return do_epoll_create(size); +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 5ca0cc3..94f95fa 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -790,7 +790,12 @@ static inline void clone_lwip_socket_opt(struct lwip_sock *dst_sock, struct lwip + + int32_t gazelle_socket(int domain, int type, int protocol) + { +- int32_t fd = lwip_socket(AF_INET, SOCK_STREAM, 0); ++ if (((type & SOCK_TYPE_MASK) & ~SOCK_STREAM) != 0){ ++ LSTACK_LOG(ERR, LSTACK, "sock type error:%d, only support SOCK_STREAM \n", type); ++ return -1; ++ } ++ ++ int32_t fd = lwip_socket(AF_INET, type, 0); + if (fd < 0) { + return fd; + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 71412b4..c381187 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -531,7 +531,7 @@ void stack_accept(struct rpc_msg *msg) + int32_t fd = msg->args[MSG_ARG_0].i; + msg->result = -1; + +- int32_t accept_fd = lwip_accept(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); ++ int32_t accept_fd = lwip_accept4(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p, msg->args[MSG_ARG_3].i); + if (accept_fd < 0) { + LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); + return; +@@ -749,7 +749,7 @@ static void inline del_accept_in_event(struct lwip_sock *sock) + } + + /* ergodic the protocol stack thread to find the connection, because all threads are listening */ +-int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen) ++int32_t stack_broadcast_accept4(int32_t fd, struct sockaddr *addr, socklen_t *addrlen, int flags) + { + int32_t ret = -1; + +@@ -761,7 +761,7 @@ int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *add + + struct lwip_sock *min_sock = get_min_accept_sock(fd); + if (min_sock && min_sock->conn) { +- ret = rpc_call_accept(min_sock->conn->socket, addr, addrlen); ++ ret = rpc_call_accept(min_sock->conn->socket, addr, addrlen, flags); + } + + if (min_sock && min_sock->wakeup && min_sock->wakeup->type == WAKEUP_EPOLL) { +@@ -773,3 +773,8 @@ int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *add + } + return ret; + } ++ ++int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ return stack_broadcast_accept4(fd, addr, addrlen, 0); ++} +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index ad967e9..bc77909 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -299,7 +299,7 @@ int32_t rpc_call_listen(int s, int backlog) + return rpc_sync_call(&stack->rpc_queue, msg); + } + +-int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) ++int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen, int flags) + { + struct protocol_stack *stack = get_protocol_stack_by_fd(fd); + struct rpc_msg *msg = rpc_msg_alloc(stack, stack_accept); +@@ -310,6 +310,7 @@ int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) + msg->args[MSG_ARG_0].i = fd; + msg->args[MSG_ARG_1].p = addr; + msg->args[MSG_ARG_2].p = addrlen; ++ msg->args[MSG_ARG_3].i = flags; + + return rpc_sync_call(&stack->rpc_queue, msg); + } +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 4249217..0eda45d 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -115,6 +115,7 @@ int32_t stack_single_listen(int32_t fd, int32_t backlog); + + /* ergodic the protocol stack thread to find the connection, because all threads are listening */ + int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen); ++int32_t stack_broadcast_accept4(int32_t fd, struct sockaddr *addr, socklen_t *addrlen, int32_t flags); + + struct rpc_msg; + void stack_arp(struct rpc_msg *msg); +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 175c8c9..f95bc72 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -64,7 +64,7 @@ int32_t rpc_call_socket(int32_t domain, int32_t type, int32_t protocol); + int32_t rpc_call_close(int32_t fd); + int32_t rpc_call_bind(int32_t fd, const struct sockaddr *addr, socklen_t addrlen); + int32_t rpc_call_listen(int s, int backlog); +-int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen); ++int32_t rpc_call_accept(int fd, struct sockaddr *addr, socklen_t *addrlen, int flags); + int32_t rpc_call_connect(int fd, const struct sockaddr *addr, socklen_t addrlen); + int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags); + int32_t rpc_call_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen); +diff --git a/src/lstack/include/posix/lstack_epoll.h b/src/lstack/include/posix/lstack_epoll.h +index 5799028..3c8fd1b 100644 +--- a/src/lstack/include/posix/lstack_epoll.h ++++ b/src/lstack/include/posix/lstack_epoll.h +@@ -65,6 +65,7 @@ struct lwip_sock; + void add_sock_event(struct lwip_sock *sock, uint32_t event); + void wakeup_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup); + int32_t lstack_epoll_create(int32_t size); ++int32_t lstack_epoll_create1(int32_t flags); + int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); + int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event *events, int32_t maxevents, int32_t timeout); + int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout); +-- +2.23.0 + diff --git a/0111-refactor-event-notice.patch b/0111-refactor-event-notice.patch new file mode 100644 index 0000000..4506acd --- /dev/null +++ b/0111-refactor-event-notice.patch @@ -0,0 +1,1243 @@ +From b41e22d7d8af5c990edc04130d885c9bd675461e Mon Sep 17 00:00:00 2001 +From: chengyechun +Date: Thu, 20 Oct 2022 11:29:16 +0800 +Subject: [PATCH 2/2] refactor event notice + +--- + src/common/gazelle_opt.h | 1 + + src/lstack/api/lstack_epoll.c | 382 +++++++++++++-------- + src/lstack/api/lstack_wrap.c | 9 + + src/lstack/core/lstack_lwip.c | 4 +- + src/lstack/core/lstack_protocol_stack.c | 148 +++++--- + src/lstack/core/lstack_stack_stat.c | 67 +--- + src/lstack/core/lstack_thread_rpc.c | 15 +- + src/lstack/include/lstack_protocol_stack.h | 16 +- + src/lstack/include/lstack_stack_stat.h | 1 - + src/lstack/include/lstack_thread_rpc.h | 2 + + src/lstack/include/posix/lstack_epoll.h | 15 +- + src/lstack/netif/lstack_ethdev.c | 2 +- + src/ltran/ltran_dfx.c | 4 +- + 13 files changed, 410 insertions(+), 256 deletions(-) + +diff --git a/src/common/gazelle_opt.h b/src/common/gazelle_opt.h +index f2ec163..011553c 100644 +--- a/src/common/gazelle_opt.h ++++ b/src/common/gazelle_opt.h +@@ -24,6 +24,7 @@ + #define GAZELLE_FALSE 0 + + #define PROTOCOL_STACK_MAX 32 ++#define KERNEL_EPOLL_MAX 512 + + #define ETHER_ADDR_LEN 6 + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 1206e75..417d69c 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -40,36 +40,54 @@ + #define SEC_TO_NSEC 1000000000 + #define SEC_TO_MSEC 1000 + #define MSEC_TO_NSEC 1000000 +-#define POLL_KERNEL_EVENTS 128 ++#define POLL_KERNEL_EVENTS 32 ++ ++static void update_epoll_max_stack(struct wakeup_poll *wakeup); ++static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, ++ struct protocol_stack *new_stack); + + void add_sock_event(struct lwip_sock *sock, uint32_t event) + { + struct wakeup_poll *wakeup = sock->wakeup; +- if (wakeup == NULL || (event & sock->epoll_events) == 0) { ++ if (wakeup == NULL || wakeup->type == WAKEUP_CLOSE || (event & sock->epoll_events) == 0) { + return; + } + +- wakeup->have_event = true; +- sock->stack->have_event = true; +- +- if (wakeup->type == WAKEUP_POLL) { +- return; ++ if (wakeup->type == WAKEUP_EPOLL) { ++ pthread_spin_lock(&wakeup->event_list_lock); ++ sock->events |= (event == EPOLLERR) ? (EPOLLIN | EPOLLERR) : (event & sock->epoll_events); ++ if (list_is_null(&sock->event_list)) { ++ list_add_node(&wakeup->event_list, &sock->event_list); ++ } ++ pthread_spin_unlock(&wakeup->event_list_lock); + } + +- pthread_spin_lock(&wakeup->event_list_lock); +- sock->events |= (event == EPOLLERR) ? (EPOLLIN | EPOLLERR) : (event & sock->epoll_events); +- if (list_is_null(&sock->event_list)) { +- list_add_node(&wakeup->event_list, &sock->event_list); ++ struct protocol_stack *stack = sock->stack; ++ if (list_is_null(&wakeup->wakeup_list[stack->queue_id])) { ++ list_add_node(&stack->wakeup_list, &wakeup->wakeup_list[stack->queue_id]); + } +- pthread_spin_unlock(&wakeup->event_list_lock); + } + +-void wakeup_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup) ++void wakeup_stack_epoll(struct protocol_stack *stack) + { +- if (__atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { +- uint64_t tmp = 1; +- posix_api->write_fn(wakeup->eventfd, &tmp, sizeof(tmp)); +- stack->stats.wakeup_events++; ++ struct list_node *node, *temp; ++ ++ list_for_each_safe(node, temp, &stack->wakeup_list) { ++ struct wakeup_poll *wakeup = container_of((node - stack->queue_id), struct wakeup_poll, wakeup_list); ++ ++ if (!get_protocol_stack_group()->wakeup_enable) { ++ if (__atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { ++ __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); ++ rte_mb(); ++ pthread_mutex_unlock(&wakeup->wait); ++ stack->stats.wakeup_events++; ++ } ++ } else { ++ gazelle_light_ring_enqueue_busrt(stack->wakeup_ring, (void **)&wakeup, 1); ++ stack->stats.wakeup_events++; ++ } ++ ++ list_del_node_null(&wakeup->wakeup_list[stack->queue_id]); + } + } + +@@ -101,7 +119,11 @@ static void raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *s + { + sock->events = update_events(sock); + if (sock->events) { +- add_sock_event(sock, sock->events); ++ pthread_spin_lock(&wakeup->event_list_lock); ++ if (wakeup->type == WAKEUP_EPOLL && list_is_null(&sock->event_list)) { ++ list_add_node(&wakeup->event_list, &sock->event_list); ++ } ++ pthread_spin_unlock(&wakeup->event_list_lock); + } + } + +@@ -125,23 +147,23 @@ int32_t lstack_do_epoll_create(int32_t fd) + GAZELLE_RETURN(EINVAL); + } + +- wakeup->eventfd = eventfd(0, EFD_NONBLOCK); +- if (wakeup->eventfd < 0) { +- LSTACK_LOG(ERR, LSTACK, "eventfd fail=%d errno=%d\n", wakeup->eventfd, errno); +- posix_api->close_fn(fd); +- free(wakeup); +- GAZELLE_RETURN(EINVAL); ++ for (uint32_t i = 0; i < PROTOCOL_STACK_MAX; i++) { ++ init_list_node_null(&wakeup->wakeup_list[i]); + } + +- struct epoll_event event; +- event.data.fd = wakeup->eventfd; +- event.events = EPOLLIN | EPOLLET; +- if (posix_api->epoll_ctl_fn(fd, EPOLL_CTL_ADD, wakeup->eventfd, &event) < 0) { +- LSTACK_LOG(ERR, LSTACK, "ctl eventfd errno=%d\n", errno); ++ if (pthread_mutex_init(&wakeup->wait, NULL) != 0) { + posix_api->close_fn(fd); + free(wakeup); + GAZELLE_RETURN(EINVAL); + } ++ pthread_mutex_trylock(&wakeup->wait); ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); ++ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ init_list_node_null(&wakeup->poll_list); ++ pthread_spin_lock(&stack_group->poll_list_lock); ++ list_add_node(&stack_group->poll_list, &wakeup->poll_list); ++ pthread_spin_unlock(&stack_group->poll_list_lock); + + init_list_node(&wakeup->event_list); + pthread_spin_init(&wakeup->event_list_lock, PTHREAD_PROCESS_PRIVATE); +@@ -150,6 +172,9 @@ int32_t lstack_do_epoll_create(int32_t fd) + wakeup->epollfd = fd; + sock->wakeup = wakeup; + ++ update_epoll_max_stack(wakeup); ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); ++ + return fd; + } + +@@ -167,24 +192,41 @@ int32_t lstack_epoll_create(int32_t flags) + + int32_t lstack_epoll_close(int32_t fd) + { +- posix_api->close_fn(fd); +- + struct lwip_sock *sock = get_socket_by_fd(fd); + if (sock == NULL) { + LSTACK_LOG(ERR, LSTACK, "fd=%d sock is NULL errno=%d\n", fd, errno); + GAZELLE_RETURN(EINVAL); + } + +- if (sock->wakeup) { +- if (sock->wakeup->bind_stack) { +- unregister_wakeup(sock->wakeup->bind_stack, sock->wakeup); +- } +- posix_api->close_fn(sock->wakeup->eventfd); +- pthread_spin_destroy(&sock->wakeup->event_list_lock); +- free(sock->wakeup); ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ struct wakeup_poll *wakeup = sock->wakeup; ++ if (wakeup == NULL) { ++ return 0; + } ++ ++ wakeup->type = WAKEUP_CLOSE; ++ ++ stack_broadcast_clean_epoll(wakeup); ++ ++ struct list_node *node, *temp; ++ pthread_spin_lock(&wakeup->event_list_lock); ++ list_for_each_safe(node, temp, &wakeup->event_list) { ++ struct lwip_sock *sock = container_of(node, struct lwip_sock, event_list); ++ list_del_node_null(&sock->event_list); ++ } ++ pthread_spin_unlock(&wakeup->event_list_lock); ++ pthread_spin_destroy(&wakeup->event_list_lock); ++ ++ pthread_spin_lock(&stack_group->poll_list_lock); ++ list_del_node_null(&wakeup->poll_list); ++ pthread_spin_unlock(&stack_group->poll_list_lock); ++ ++ pthread_mutex_destroy(&wakeup->wait); ++ ++ free(wakeup); + sock->wakeup = NULL; + ++ posix_api->close_fn(fd); + return 0; + } + +@@ -243,12 +285,10 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + struct wakeup_poll *wakeup = epoll_sock->wakeup; + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { +- wakeup->have_kernel_fd = true; + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + + if (CONN_TYPE_HAS_HOST(sock->conn)) { +- wakeup->have_kernel_fd = true; + int32_t ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); + if (ret < 0) { + LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op); +@@ -347,36 +387,40 @@ static int32_t poll_lwip_event(struct pollfd *fds, nfds_t nfds) + return event_num; + } + +-static void epoll_bind_statck(struct wakeup_poll *wakeup) ++static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, ++ struct protocol_stack *new_stack) + { +- /* all fd is kernel, set rand stack */ +- if (wakeup->bind_stack == NULL && wakeup->max_stack == NULL) { +- update_epoll_max_stack(wakeup); ++ if (old_stack) { ++ if (posix_api->epoll_ctl_fn(old_stack->epollfd, EPOLL_CTL_DEL, wakeup->epollfd, NULL) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); ++ } + } + +- if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { +- if (get_global_cfg_params()->app_bind_numa) { +- bind_to_stack_numa(wakeup->max_stack); +- } +- if (wakeup->bind_stack) { +- unregister_wakeup(wakeup->bind_stack, wakeup); +- } +- wakeup->bind_stack = wakeup->max_stack; +- register_wakeup(wakeup->bind_stack, wakeup); ++ /* avoid kernel thread post too much, use EPOLLET */ ++ struct epoll_event event; ++ event.data.ptr = wakeup; ++ event.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET; ++ if (posix_api->epoll_ctl_fn(new_stack->epollfd, EPOLL_CTL_ADD, wakeup->epollfd, &event) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); + } + } + +-static bool del_event_fd(struct epoll_event* events, int32_t eventnum, int32_t eventfd) ++static void epoll_bind_statck(struct wakeup_poll *wakeup) + { +- for (int32_t i = 0; i < eventnum; i++) { +- if (events[i].data.fd == eventfd) { +- events[i].data.u64 = events[eventnum - 1].data.u64; +- events[i].events = events[eventnum - 1].events; +- return true; +- } ++ if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { ++ bind_to_stack_numa(wakeup->max_stack); ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); ++ wakeup->bind_stack = wakeup->max_stack; + } ++} + +- return false; ++static void ms_to_timespec(struct timespec *timespec, int32_t timeout) ++{ ++ clock_gettime(CLOCK_REALTIME, timespec); ++ timespec->tv_sec += timeout / SEC_TO_MSEC; ++ timespec->tv_nsec += (timeout % SEC_TO_MSEC) * MSEC_TO_NSEC; ++ timespec->tv_sec += timespec->tv_nsec / SEC_TO_NSEC; ++ timespec->tv_nsec = timespec->tv_nsec % SEC_TO_NSEC; + } + + int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) +@@ -388,63 +432,84 @@ int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxe + + struct wakeup_poll *wakeup = sock->wakeup; + int32_t kernel_num = 0; ++ int32_t lwip_num = 0; ++ int32_t ret = 0; + +- epoll_bind_statck(sock->wakeup); ++ if (get_global_cfg_params()->app_bind_numa) { ++ epoll_bind_statck(sock->wakeup); ++ } + +- __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); +- rte_mb(); ++ do { ++ lwip_num = epoll_lwip_event(wakeup, events, maxevents); ++ wakeup->stat.app_events += lwip_num; + +- int32_t lwip_num = epoll_lwip_event(wakeup, events, maxevents); +- wakeup->stat.app_events += lwip_num; +- if (!wakeup->have_kernel_fd && lwip_num > 0) { +- return lwip_num; +- } ++ if (__atomic_load_n(&wakeup->have_kernel_event, __ATOMIC_ACQUIRE)) { ++ __atomic_store_n(&wakeup->have_kernel_event, false, __ATOMIC_RELEASE); ++ kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, 0); ++ } + +- if (lwip_num > 0) { +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- rte_mb(); +- kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, 0); +- } else { +- kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, timeout); +- rte_mb(); +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- } ++ if (lwip_num + kernel_num > 0) { ++ return lwip_num + kernel_num; ++ } + +- if (kernel_num <= 0) { +- return (lwip_num > 0) ? lwip_num : kernel_num; +- } ++ if (timeout < 0) { ++ ret = pthread_mutex_lock(&wakeup->wait); ++ } else { ++ struct timespec epoll_time; ++ ms_to_timespec(&epoll_time, timeout); ++ ret = pthread_mutex_timedlock(&wakeup->wait, &epoll_time); ++ } + +- if (del_event_fd(&events[lwip_num], kernel_num, wakeup->eventfd)) { +- kernel_num--; +- if (lwip_num == 0) { +- lwip_num = epoll_lwip_event(wakeup, &events[kernel_num], maxevents - kernel_num); ++ if (ret == 0) { ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); + } +- } ++ } while (ret == 0); + +- return lwip_num + kernel_num; ++ return 0; + } + + static int32_t init_poll_wakeup_data(struct wakeup_poll *wakeup) + { +- wakeup->type = WAKEUP_POLL; ++ if (pthread_mutex_init(&wakeup->wait, NULL) != 0) { ++ GAZELLE_RETURN(EINVAL); ++ } ++ pthread_mutex_trylock(&wakeup->wait); ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); ++ ++ for (uint32_t i = 0; i < PROTOCOL_STACK_MAX; i++) { ++ init_list_node_null(&wakeup->wakeup_list[i]); ++ } + +- wakeup->eventfd = eventfd(0, EFD_NONBLOCK); +- if (wakeup->eventfd < 0) { +- LSTACK_LOG(ERR, LSTACK, "eventfd failed errno=%d\n", errno); ++ wakeup->epollfd = posix_api->epoll_create_fn(POLL_KERNEL_EVENTS); ++ if (wakeup->epollfd < 0) { + GAZELLE_RETURN(EINVAL); + } + ++ wakeup->type = WAKEUP_POLL; ++ + wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); + if (wakeup->last_fds == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); +- posix_api->close_fn(wakeup->eventfd); + GAZELLE_RETURN(EINVAL); + } +- +- wakeup->last_fds[0].fd = wakeup->eventfd; +- wakeup->last_fds[0].events = POLLIN; + wakeup->last_max_nfds = POLL_KERNEL_EVENTS; + ++ wakeup->events = calloc(POLL_KERNEL_EVENTS, sizeof(struct epoll_event)); ++ if (wakeup->events == NULL) { ++ free(wakeup->last_fds); ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ init_list_node_null(&wakeup->poll_list); ++ pthread_spin_lock(&stack_group->poll_list_lock); ++ list_add_node(&stack_group->poll_list, &wakeup->poll_list); ++ pthread_spin_unlock(&stack_group->poll_list_lock); ++ ++ int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; ++ uint16_t bind_id = find_max_cnt_stack(stack_count, stack_group->stack_num, wakeup->bind_stack); ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); ++ wakeup->bind_stack = stack_group->stacks[bind_id]; ++ + return 0; + } + +@@ -453,13 +518,19 @@ static void resize_kernel_poll(struct wakeup_poll *wakeup, nfds_t nfds) + if (wakeup->last_fds) { + free(wakeup->last_fds); + } +- wakeup->last_fds = calloc(nfds + 1, sizeof(struct pollfd)); ++ wakeup->last_fds = calloc(nfds, sizeof(struct pollfd)); + if (wakeup->last_fds == NULL) { + LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); + } + +- wakeup->last_fds[0].fd = wakeup->eventfd; +- wakeup->last_fds[0].events = POLLIN; ++ if (wakeup->events) { ++ free(wakeup->events); ++ } ++ wakeup->events = calloc(nfds, sizeof(struct epoll_event)); ++ if (wakeup->events == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ } ++ + wakeup->last_max_nfds = nfds; + } + +@@ -472,15 +543,25 @@ static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) + return; + } + +- if (wakeup->bind_stack) { +- unregister_wakeup(wakeup->bind_stack, wakeup); ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); ++ bind_to_stack_numa(stack_group->stacks[bind_id]); ++ wakeup->bind_stack = stack_group->stacks[bind_id]; ++} ++ ++static void update_kernel_poll(struct wakeup_poll *wakeup, uint32_t index, struct pollfd *new_fd) ++{ ++ posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_DEL, wakeup->last_fds[index].fd, NULL); ++ ++ if (new_fd == NULL) { ++ return; + } +- +- if (get_global_cfg_params()->app_bind_numa) { +- bind_to_stack_numa(stack_group->stacks[bind_id]); ++ ++ struct epoll_event event; ++ event.data.u32 = index; ++ event.events = new_fd->events; ++ if (posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_ADD, new_fd->fd, &event) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); + } +- wakeup->bind_stack = stack_group->stacks[bind_id]; +- register_wakeup(wakeup->bind_stack, wakeup); + } + + static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfds) +@@ -494,31 +575,33 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + poll_change = 1; + } + ++ if (nfds < wakeup->last_nfds) { ++ poll_change = 1; ++ } ++ + for (uint32_t i = 0; i < nfds; i++) { + int32_t fd = fds[i].fd; + fds[i].revents = 0; + struct lwip_sock *sock = get_socket_by_fd(fd); + +- if (fd == wakeup->last_fds[i + 1].fd && fds[i].events == wakeup->last_fds[i + 1].events) { ++ if (fd == wakeup->last_fds[i].fd && fds[i].events == wakeup->last_fds[i].events) { + /* fd close then socket may get same fd. */ + if (sock == NULL || sock->wakeup != NULL) { + continue; + } + } +- wakeup->last_fds[i + 1].fd = fd; +- wakeup->last_fds[i + 1].events = fds[i].events; ++ wakeup->last_fds[i].fd = fd; ++ wakeup->last_fds[i].events = fds[i].events; + poll_change = 1; + +- while (sock && sock->conn) { +- if (sock->epoll_events != (fds[i].events | POLLERR)) { +- sock->epoll_events = fds[i].events | POLLERR; +- } +- if (sock->wakeup != wakeup) { +- sock->wakeup = wakeup; +- } ++ if (sock == NULL || sock->conn == NULL || CONN_TYPE_HAS_HOST(sock->conn)) { ++ update_kernel_poll(wakeup, i, fds + i); ++ } + ++ while (sock && sock->conn) { ++ sock->epoll_events = fds[i].events | POLLERR; ++ sock->wakeup = wakeup; + stack_count[sock->stack->queue_id]++; +- /* listenfd list */ + sock = sock->listen_next; + } + } +@@ -526,9 +609,11 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + if (poll_change == 0) { + return; + } +- wakeup->last_nfds = nfds + 1; ++ wakeup->last_nfds = nfds; + +- poll_bind_statck(wakeup, stack_count); ++ if (get_global_cfg_params()->app_bind_numa) { ++ poll_bind_statck(wakeup, stack_count); ++ } + } + + int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) +@@ -548,43 +633,38 @@ int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + + poll_init(wakeup, fds, nfds); + +- __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); +- rte_mb(); +- +- int32_t lwip_num = poll_lwip_event(fds, nfds); +- wakeup->stat.app_events += lwip_num; +- if (lwip_num >= nfds) { +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- return lwip_num; +- } +- + int32_t kernel_num = 0; +- if (lwip_num > 0) { +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- rte_mb(); +- kernel_num = posix_api->poll_fn(wakeup->last_fds, wakeup->last_nfds, 0); +- } else { +- kernel_num = posix_api->poll_fn(wakeup->last_fds, wakeup->last_nfds, timeout); +- rte_mb(); +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- } ++ int32_t lwip_num = 0; ++ int32_t ret; + +- if (kernel_num <= 0) { +- return (lwip_num > 0) ? lwip_num : kernel_num; +- } ++ do { ++ lwip_num = poll_lwip_event(fds, nfds); ++ ++ if (__atomic_load_n(&wakeup->have_kernel_event, __ATOMIC_ACQUIRE)) { ++ __atomic_store_n(&wakeup->have_kernel_event, false, __ATOMIC_RELEASE); ++ kernel_num = posix_api->epoll_wait_fn(wakeup->epollfd, wakeup->events, nfds, 0); ++ for (int32_t i = 0; i < kernel_num; i++) { ++ uint32_t index = wakeup->events[i].data.u32; ++ fds[index].revents = wakeup->events[i].events; ++ } ++ } + +- for (nfds_t i = 0; i < nfds; i++) { +- if (fds[i].revents == 0 && wakeup->last_fds[i + 1].revents != 0) { +- fds[i].revents = wakeup->last_fds[i + 1].revents; ++ if (lwip_num + kernel_num > 0) { ++ return lwip_num + kernel_num; + } +- } + +- if (wakeup->last_fds[0].revents) { +- if (lwip_num == 0) { +- lwip_num = poll_lwip_event(fds, nfds); ++ if (timeout < 0) { ++ ret = pthread_mutex_lock(&wakeup->wait); ++ } else { ++ struct timespec epoll_time; ++ ms_to_timespec(&epoll_time, timeout); ++ ret = pthread_mutex_timedlock(&wakeup->wait, &epoll_time); + } +- kernel_num--; +- } + +- return kernel_num + lwip_num; ++ if (ret == 0) { ++ __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); ++ } ++ } while (ret == 0); ++ ++ return 0; + } +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 4669a30..8dc4524 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -203,6 +203,15 @@ static int32_t do_connect(int32_t s, const struct sockaddr *name, socklen_t name + return posix_api->connect_fn(s, name, namelen); + } + ++ struct lwip_sock *sock = get_socket(s); ++ if (sock == NULL) { ++ return posix_api->connect_fn(s, name, namelen); ++ } ++ ++ if (!netconn_is_nonblocking(sock->conn)) { ++ GAZELLE_RETURN(EINVAL); ++ } ++ + int32_t ret = rpc_call_connect(s, name, namelen); + if (ret == 0 || errno == EISCONN) { + return ret; +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 94f95fa..00afc75 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -234,13 +234,13 @@ struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8 + return NULL; + } + +- sock->stack->stats.write_lwip_cnt++; + return pbuf; + } + + void write_lwip_over(struct lwip_sock *sock, uint32_t n) + { + gazelle_ring_dequeue_over(sock->send_ring, n); ++ sock->stack->stats.write_lwip_cnt += n; + } + + static inline void del_data_out_event(struct lwip_sock *sock) +@@ -269,6 +269,7 @@ void write_stack_over(struct lwip_sock *sock) + gazelle_ring_read_over(sock->send_ring); + + if (sock->wakeup) { ++ sock->wakeup->stat.app_write_cnt++; + if (sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT)) { + del_data_out_event(sock); + } +@@ -882,6 +883,7 @@ void get_lwip_conntable(struct rpc_msg *msg) + conn[conn_num].l_port = pcbl->local_port; + conn[conn_num].tcp_sub_state = pcbl->state; + struct netconn *netconn = (struct netconn *)pcbl->callback_arg; ++ conn[conn_num].fd = netconn->socket; + if (netconn != NULL && netconn->acceptmbox != NULL) { + conn[conn_num].recv_cnt = rte_ring_count(netconn->acceptmbox->ring); + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index c381187..79769f3 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -36,12 +36,12 @@ + #include "posix/lstack_epoll.h" + #include "lstack_stack_stat.h" + +-#define READ_LIST_MAX 32 +-#define SEND_LIST_MAX 32 +-#define HANDLE_RPC_MSG_MAX 32 +-#define KERNEL_EPOLL_MAX 256 ++#define READ_LIST_MAX 128 ++#define SEND_LIST_MAX 128 ++#define HANDLE_RPC_MSG_MAX 128 ++#define KERNEL_EVENT_100us 100 + +-static PER_THREAD uint16_t g_stack_idx = PROTOCOL_STACK_MAX; ++static PER_THREAD struct protocol_stack *g_stack_p = NULL; + static struct protocol_stack_group g_stack_group = {0}; + + void set_init_fail(void); +@@ -57,12 +57,13 @@ void bind_to_stack_numa(struct protocol_stack *stack) + ret = pthread_setaffinity_np(tid, sizeof(stack->idle_cpuset), &stack->idle_cpuset); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "thread %d setaffinity to stack %hu failed\n", rte_gettid(), stack->queue_id); ++ return; + } + } + + static inline void set_stack_idx(uint16_t idx) + { +- g_stack_idx = idx; ++ g_stack_p = g_stack_group.stacks[idx]; + } + + long get_stack_tid(void) +@@ -83,10 +84,7 @@ struct protocol_stack_group *get_protocol_stack_group(void) + + struct protocol_stack *get_protocol_stack(void) + { +- if (g_stack_idx >= PROTOCOL_STACK_MAX) { +- return NULL; +- } +- return g_stack_group.stacks[g_stack_idx]; ++ return g_stack_p; + } + + struct protocol_stack *get_protocol_stack_by_fd(int32_t fd) +@@ -241,10 +239,33 @@ static void* gazelle_wakeup_thread(void *arg) + nanosleep(&st, NULL); + } + +- sem_t *event_sem[WAKEUP_MAX_NUM]; +- uint32_t num = gazelle_light_ring_dequeue_burst(stack->wakeup_ring, (void **)event_sem, WAKEUP_MAX_NUM); ++ struct wakeup_poll *wakeup[WAKEUP_MAX_NUM]; ++ uint32_t num = gazelle_light_ring_dequeue_burst(stack->wakeup_ring, (void **)wakeup, WAKEUP_MAX_NUM); + for (uint32_t i = 0; i < num; i++) { +- sem_post(event_sem[i]); ++ if (__atomic_load_n(&wakeup[i]->in_wait, __ATOMIC_ACQUIRE)) { ++ __atomic_store_n(&wakeup[i]->in_wait, false, __ATOMIC_RELEASE); ++ rte_mb(); ++ pthread_mutex_unlock(&wakeup[i]->wait); ++ } ++ } ++ } ++ ++ return NULL; ++} ++ ++static void* gazelle_kernelevent_thread(void *arg) ++{ ++ uint16_t queue_id = *(uint16_t *)arg; ++ struct protocol_stack *stack = get_protocol_stack_group()->stacks[queue_id]; ++ ++ bind_to_stack_numa(stack); ++ ++ LSTACK_LOG(INFO, LSTACK, "kernelevent_%02hu start\n", queue_id); ++ ++ for (;;) { ++ stack->kernel_event_num = posix_api->epoll_wait_fn(stack->epollfd, stack->kernel_events, KERNEL_EPOLL_MAX, -1); ++ while (stack->kernel_event_num > 0) { ++ usleep(KERNEL_EVENT_100us); + } + } + +@@ -255,27 +276,26 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + { + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + +- set_stack_idx(queue_id); + stack->tid = rte_gettid(); + stack->queue_id = queue_id; + stack->port_id = stack_group->port_id; + stack->cpu_id = get_global_cfg_params()->cpus[queue_id]; + stack->lwip_stats = &lwip_stats; + +- pthread_spin_init(&stack->wakeup_list_lock, PTHREAD_PROCESS_PRIVATE); +- + init_list_node(&stack->recv_list); + init_list_node(&stack->send_list); ++ init_list_node(&stack->wakeup_list); + + sys_calibrate_tsc(); + stack_stat_init(); + + stack_group->stacks[queue_id] = stack; ++ set_stack_idx(queue_id); + +- if (thread_affinity_init(stack->cpu_id) != 0) { ++ stack->epollfd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); ++ if (stack->epollfd < 0) { + return -1; + } +- RTE_PER_LCORE(_lcore_id) = stack->cpu_id; + + stack->socket_id = numa_node_of_cpu(stack->cpu_id); + if (stack->socket_id < 0) { +@@ -302,6 +322,23 @@ void wait_sem_value(sem_t *sem, int32_t wait_value) + } while (sem_val < wait_value); + } + ++static int32_t create_affiliate_thread(uint16_t queue_id, bool wakeup_enable) ++{ ++ if (wakeup_enable) { ++ if (create_thread(queue_id, "gazelleweakup", gazelle_wakeup_thread) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "gazelleweakup errno=%d\n", errno); ++ return -1; ++ } ++ } ++ ++ if (create_thread(queue_id, "gazellekernel", gazelle_kernelevent_thread) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "gazellekernel errno=%d\n", errno); ++ return -1; ++ } ++ ++ return 0; ++} ++ + static struct protocol_stack *stack_thread_init(uint16_t queue_id) + { + struct protocol_stack_group *stack_group = get_protocol_stack_group(); +@@ -319,6 +356,19 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + return NULL; + } + ++ if (create_affiliate_thread(queue_id, stack_group->wakeup_enable) < 0) { ++ sem_post(&stack_group->thread_phase1); ++ free(stack); ++ return NULL; ++ } ++ ++ if (thread_affinity_init(stack->cpu_id) != 0) { ++ sem_post(&stack_group->thread_phase1); ++ free(stack); ++ return NULL; ++ } ++ RTE_PER_LCORE(_lcore_id) = stack->cpu_id; ++ + hugepage_init(); + + tcpip_init(NULL, NULL); +@@ -342,35 +392,28 @@ static struct protocol_stack *stack_thread_init(uint16_t queue_id) + return NULL; + } + +- if (stack_group->wakeup_enable) { +- if (create_thread(stack->queue_id, "gazelleweakup", gazelle_wakeup_thread) != 0) { +- LSTACK_LOG(ERR, LSTACK, "gazelleweakup errno=%d\n", errno); +- free(stack); +- return NULL; +- } +- } +- + return stack; + } + +-static void wakeup_stack_wait(struct protocol_stack *stack) ++static void wakeup_kernel_event(struct protocol_stack *stack) + { +- if (!stack->have_event || pthread_spin_trylock(&stack->wakeup_list_lock)) { ++ if (stack->kernel_event_num == 0) { + return; + } + +- struct wakeup_poll *node = stack->wakeup_list; +- while (node) { +- if (node->have_event) { +- wakeup_epoll(stack, node); +- node->have_event = false; ++ for (int32_t i = 0; i < stack->kernel_event_num; i++) { ++ struct wakeup_poll *wakeup = stack->kernel_events[i].data.ptr; ++ if (wakeup->type == WAKEUP_CLOSE) { ++ continue; + } +- node = node->next; +- } + +- pthread_spin_unlock(&stack->wakeup_list_lock); ++ __atomic_store_n(&wakeup->have_kernel_event, true, __ATOMIC_RELEASE); ++ if (list_is_null(&wakeup->wakeup_list[stack->queue_id])) { ++ list_add_node(&stack->wakeup_list, &wakeup->wakeup_list[stack->queue_id]); ++ } ++ } + +- stack->have_event = false; ++ stack->kernel_event_num = 0; + } + + static void* gazelle_stack_thread(void *arg) +@@ -398,7 +441,9 @@ static void* gazelle_stack_thread(void *arg) + + send_stack_list(stack, SEND_LIST_MAX); + +- wakeup_stack_wait(stack); ++ wakeup_kernel_event(stack); ++ ++ wakeup_stack_epoll(stack); + + sys_timer_run(); + +@@ -445,6 +490,9 @@ int32_t init_protocol_stack(void) + + stack_group->stack_num = get_global_cfg_params()->num_cpu; + stack_group->wakeup_enable = (get_global_cfg_params()->num_wakeup > 0) ? true : false; ++ init_list_node(&stack_group->poll_list); ++ pthread_spin_init(&stack_group->poll_list_lock, PTHREAD_PROCESS_PRIVATE); ++ + + if (init_protocol_sem() != 0) { + return -1; +@@ -482,7 +530,10 @@ void stack_socket(struct rpc_msg *msg) + { + msg->result = gazelle_socket(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i); + if (msg->result < 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, %ld socket failed\n", get_stack_tid(), msg->result); ++ msg->result = gazelle_socket(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i); ++ if (msg->result < 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %ld, %ld socket failed\n", get_stack_tid(), msg->result); ++ } + } + } + +@@ -644,6 +695,25 @@ void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cur_stack + } + } + ++void stack_broadcast_clean_epoll(struct wakeup_poll *wakeup) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ struct protocol_stack *stack = NULL; ++ ++ for (int32_t i = 0; i < stack_group->stack_num; i++) { ++ stack = stack_group->stacks[i]; ++ rpc_call_clean_epoll(stack, wakeup); ++ } ++} ++ ++void stack_clean_epoll(struct rpc_msg *msg) ++{ ++ struct protocol_stack *stack = get_protocol_stack(); ++ struct wakeup_poll *wakeup = (struct wakeup_poll *)msg->args[MSG_ARG_0].p; ++ ++ list_del_node_null(&wakeup->wakeup_list[stack->queue_id]); ++} ++ + /* when fd is listenfd, listenfd of all protocol stack thread will be closed */ + int32_t stack_broadcast_close(int32_t fd) + { +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index c011aed..6261fa9 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -92,59 +92,26 @@ static void set_latency_start_flag(bool start) + } + } + +-void register_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup) ++static void get_wakeup_stat(struct protocol_stack_group *stack_group, struct protocol_stack *stack, ++ struct gazelle_wakeup_stat *stat) + { +- pthread_spin_lock(&stack->wakeup_list_lock); ++ struct list_node *node, *temp; + +- wakeup->next = stack->wakeup_list; +- stack->wakeup_list = wakeup; ++ pthread_spin_lock(&stack_group->poll_list_lock); + +- pthread_spin_unlock(&stack->wakeup_list_lock); +-} +- +-void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup) +-{ +- pthread_spin_lock(&stack->wakeup_list_lock); +- +- struct wakeup_poll *node = stack->wakeup_list; +- struct wakeup_poll *pre = NULL; +- +- while (node && node != wakeup) { +- pre = node; +- node = node->next; +- } +- +- if (node == NULL) { +- pthread_spin_unlock(&stack->wakeup_list_lock); +- return; +- } ++ list_for_each_safe(node, temp, &stack_group->poll_list) { ++ struct wakeup_poll *wakeup = container_of(node, struct wakeup_poll, poll_list); + +- if (pre) { +- pre->next = node->next; +- } else { +- stack->wakeup_list = node->next; +- } +- node->next = NULL; +- +- pthread_spin_unlock(&stack->wakeup_list_lock); +-} +- +-static void get_wakeup_stat(struct protocol_stack *stack, struct gazelle_wakeup_stat *stat) +-{ +- pthread_spin_lock(&stack->wakeup_list_lock); +- +- struct wakeup_poll *node = stack->wakeup_list; +- while (node) { +- stat->app_events += node->stat.app_events; +- stat->read_null += node->stat.read_null; +- stat->app_write_cnt += node->stat.app_write_cnt; +- stat->app_write_idlefail += node->stat.app_write_idlefail; +- stat->app_read_cnt += node->stat.app_read_cnt; +- +- node = node->next; ++ if (wakeup->bind_stack == stack) { ++ stat->app_events += wakeup->stat.app_events; ++ stat->read_null += wakeup->stat.read_null; ++ stat->app_write_cnt += wakeup->stat.app_write_cnt; ++ stat->app_write_idlefail += wakeup->stat.app_write_idlefail; ++ stat->app_read_cnt += wakeup->stat.app_read_cnt; ++ } + } + +- pthread_spin_unlock(&stack->wakeup_list_lock); ++ pthread_spin_unlock(&stack_group->poll_list_lock); + } + + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info) +@@ -172,7 +139,7 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + return; + } + +- get_wakeup_stat(stack, &dfx->data.pkts.wakeup_stat); ++ get_wakeup_stat(stack_group, stack, &dfx->data.pkts.wakeup_stat); + + dfx->data.pkts.call_alloc_fail = stack_group->call_alloc_fail; + +@@ -251,11 +218,13 @@ static int32_t send_control_cmd_data(int32_t fd, struct gazelle_stack_dfx_data * + + int32_t handle_stack_cmd(int32_t fd, enum GAZELLE_STAT_MODE stat_mode) + { +- struct gazelle_stack_dfx_data dfx = {0}; ++ struct gazelle_stack_dfx_data dfx; + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + + for (uint32_t i = 0; i < stack_group->stack_num; i++) { + struct protocol_stack *stack = stack_group->stacks[i]; ++ ++ memset_s(&dfx, sizeof(dfx), 0, sizeof(dfx)); + get_stack_dfx_data(&dfx, stack, stat_mode); + + if (!use_ltran() && +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index bc77909..46cbbe7 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -88,7 +88,6 @@ static inline __attribute__((always_inline)) void rpc_msg_free(struct rpc_msg *m + + static inline __attribute__((always_inline)) void rpc_call(lockless_queue *queue, struct rpc_msg *msg) + { +- pthread_spin_trylock(&msg->lock); + lockless_queue_mpsc_push(queue, &msg->queue_node); + } + +@@ -96,6 +95,7 @@ static inline __attribute__((always_inline)) int32_t rpc_sync_call(lockless_queu + { + int32_t ret; + ++ pthread_spin_trylock(&msg->lock); + rpc_call(queue, msg); + + // waiting stack unlock +@@ -270,6 +270,18 @@ int32_t rpc_call_close(int fd) + return rpc_sync_call(&stack->rpc_queue, msg); + } + ++void rpc_call_clean_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup) ++{ ++ struct rpc_msg *msg = rpc_msg_alloc(stack, stack_clean_epoll); ++ if (msg == NULL) { ++ return; ++ } ++ ++ msg->args[MSG_ARG_0].p = wakeup; ++ ++ rpc_sync_call(&stack->rpc_queue, msg); ++} ++ + int32_t rpc_call_bind(int32_t fd, const struct sockaddr *addr, socklen_t addrlen) + { + struct protocol_stack *stack = get_protocol_stack_by_fd(fd); +@@ -447,4 +459,3 @@ int32_t rpc_call_send(int fd, const void *buf, size_t len, int flags) + + return 0; + } +- +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 0eda45d..cc2cfb9 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -14,6 +14,7 @@ + #define __GAZELLE_PROTOCOL_STACK_H__ + + #include ++#include + + #include + #include +@@ -50,11 +51,14 @@ struct protocol_stack { + struct reg_ring_msg *reg_buf; + + volatile bool low_power; +- struct wakeup_poll *wakeup_list; +- pthread_spinlock_t wakeup_list_lock; + lockless_queue rpc_queue __rte_cache_aligned; + char pad __rte_cache_aligned; + ++ /* kernel event thread read/write frequently */ ++ struct epoll_event kernel_events[KERNEL_EPOLL_MAX]; ++ int32_t kernel_event_num; ++ char pad1 __rte_cache_aligned; ++ + struct netif netif; + struct eth_dev_ops *dev_ops; + uint32_t rx_ring_used; +@@ -62,7 +66,7 @@ struct protocol_stack { + + struct list_node recv_list; + struct list_node send_list; +- bool have_event; ++ struct list_node wakeup_list; + + volatile uint16_t conn_num; + struct stats_ *lwip_stats; +@@ -85,6 +89,8 @@ struct protocol_stack_group { + struct eth_params *eth_params; + struct protocol_stack *stacks[PROTOCOL_STACK_MAX]; + bool wakeup_enable; ++ struct list_node poll_list; ++ pthread_spinlock_t poll_list_lock; + + /* dfx stats */ + bool latency_start; +@@ -117,7 +123,11 @@ int32_t stack_single_listen(int32_t fd, int32_t backlog); + int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen); + int32_t stack_broadcast_accept4(int32_t fd, struct sockaddr *addr, socklen_t *addrlen, int32_t flags); + ++struct wakeup_poll; ++void stack_broadcast_clean_epoll(struct wakeup_poll *wakeup); ++ + struct rpc_msg; ++void stack_clean_epoll(struct rpc_msg *msg); + void stack_arp(struct rpc_msg *msg); + void stack_socket(struct rpc_msg *msg); + void stack_close(struct rpc_msg *msg); +diff --git a/src/lstack/include/lstack_stack_stat.h b/src/lstack/include/lstack_stack_stat.h +index 98ffe8f..6057fe1 100644 +--- a/src/lstack/include/lstack_stack_stat.h ++++ b/src/lstack/include/lstack_stack_stat.h +@@ -27,7 +27,6 @@ void stack_stat_init(void); + int32_t handle_stack_cmd(int fd, enum GAZELLE_STAT_MODE stat_mode); + uint64_t get_current_time(void); + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info); +-void register_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup); + void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup); + + #endif /* GAZELLE_STACK_STAT_H */ +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index f95bc72..6928f98 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -50,7 +50,9 @@ struct rpc_msg { + + struct protocol_stack; + struct rte_mbuf; ++struct wakeup_poll; + void poll_rpc_msg(struct protocol_stack *stack, uint32_t max_num); ++void rpc_call_clean_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup); + int32_t rpc_call_msgcnt(struct protocol_stack *stack); + int32_t rpc_call_shadow_fd(struct protocol_stack *stack, int32_t fd, const struct sockaddr *addr, socklen_t addrlen); + int32_t rpc_call_recvlistcnt(struct protocol_stack *stack); +diff --git a/src/lstack/include/posix/lstack_epoll.h b/src/lstack/include/posix/lstack_epoll.h +index 3c8fd1b..d6c81a7 100644 +--- a/src/lstack/include/posix/lstack_epoll.h ++++ b/src/lstack/include/posix/lstack_epoll.h +@@ -17,7 +17,6 @@ + #include + #include + #include +-#include + + #include + +@@ -31,29 +30,31 @@ extern "C" { + enum wakeup_type { + WAKEUP_EPOLL = 0, + WAKEUP_POLL, ++ WAKEUP_CLOSE, + }; + + struct protocol_stack; + struct wakeup_poll { + /* stack thread read frequently */ +- int32_t eventfd; + enum wakeup_type type; +- bool have_event; +- volatile bool in_wait __rte_cache_aligned; ++ pthread_mutex_t wait __rte_cache_aligned; ++ bool in_wait; ++ struct list_node wakeup_list[PROTOCOL_STACK_MAX]; ++ bool have_kernel_event; + char pad __rte_cache_aligned; + + struct gazelle_wakeup_stat stat; + struct protocol_stack *bind_stack; +- struct wakeup_poll *next; ++ struct list_node poll_list; + + /* poll */ + struct pollfd *last_fds; + nfds_t last_nfds; + nfds_t last_max_nfds; ++ struct epoll_event *events; + + /* epoll */ + int32_t epollfd; /* epoll kernel fd */ +- bool have_kernel_fd; + int32_t stack_fd_cnt[PROTOCOL_STACK_MAX]; + struct protocol_stack *max_stack; + struct list_node event_list; +@@ -63,7 +64,7 @@ struct wakeup_poll { + struct netconn; + struct lwip_sock; + void add_sock_event(struct lwip_sock *sock, uint32_t event); +-void wakeup_epoll(struct protocol_stack *stack, struct wakeup_poll *wakeup); ++void wakeup_stack_epoll(struct protocol_stack *stack); + int32_t lstack_epoll_create(int32_t size); + int32_t lstack_epoll_create1(int32_t flags); + int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); +diff --git a/src/lstack/netif/lstack_ethdev.c b/src/lstack/netif/lstack_ethdev.c +index 28bf32d..4757d72 100644 +--- a/src/lstack/netif/lstack_ethdev.c ++++ b/src/lstack/netif/lstack_ethdev.c +@@ -79,7 +79,7 @@ void eth_dev_recv(struct rte_mbuf *mbuf) + } + } + +-#define READ_PKTS_MAX 32 ++#define READ_PKTS_MAX 128 + int32_t eth_dev_poll(void) + { + uint32_t nr_pkts; +diff --git a/src/ltran/ltran_dfx.c b/src/ltran/ltran_dfx.c +index 3d977b5..de97a48 100644 +--- a/src/ltran/ltran_dfx.c ++++ b/src/ltran/ltran_dfx.c +@@ -875,8 +875,8 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + inet_ntop(AF_INET, &rip, str_rip, sizeof(str_rip)), conn_info->r_port, + tcp_state_to_str(conn_info->tcp_sub_state)); + } else if (conn_info->state == GAZELLE_LISTEN_LIST) { +- printf("%-6utcp %-57u%s:%hu 0.0.0.0:* LISTEN\n", i, conn_info->recv_cnt, +- inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port); ++ printf("%-6utcp %-50u%-7d%s:%hu 0.0.0.0:* LISTEN\n", i, conn_info->recv_cnt, ++ conn_info->fd, inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port); + } else { + printf("Got unknow tcp conn::%s:%5hu, state:%u\n", + inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port, conn_info->state); +-- +2.23.0 + diff --git a/gazelle.spec b/gazelle.spec index 8f5ecbe..40ea108 100644 --- a/gazelle.spec +++ b/gazelle.spec @@ -2,7 +2,7 @@ Name: gazelle Version: 1.0.1 -Release: 15 +Release: 16 Summary: gazelle is a high performance user-mode stack License: MulanPSL-2.0 URL: https://gitee.com/openeuler/gazelle @@ -100,6 +100,32 @@ Patch9082: 0082-del-gazelle-ring-cons.tail-atomic-protect.patch Patch9083: 0083-fix-send-return-vale.patch Patch9084: 0084-add-examples.patch Patch9085: 0085-expand-thread-rpc-msg-pool-size.patch +Patch9086: 0086-fix-fd-leak.patch +Patch9087: 0087-fix-del-conn-use-after-free.patch +Patch9088: 0088-init-g_gazelle_errno-before-use.patch +Patch9089: 0089-code-format-specification.patch +Patch9090: 0090-fix-gazelle-kernel-event-thread-affinity-same-with-s.patch +Patch9091: 0091-have_corelist_arg.patch +Patch9092: 0092-ltran-update-list.patch +Patch9093: 0093-remove-get_reg_ring_free_count.patch +Patch9094: 0094-add-errorno-EISCONN.patch +Patch9095: 0095-fix-sendmsg-data-write-wrong.patch +Patch9096: 0096-lstack-restore-pci-bus-after-init.patch +Patch9097: 0097-fix-malloc-rpc-msg-fail.patch +Patch9098: 0098-support-dpdk-dynamic-memory.patch +Patch9099: 0099-fix-lwip_send-fail-free-pbuf-miss-data.patch +Patch9100: 0100-merger-wakeup.patch +Patch9101: 0101-conenct-support-multi-queues.patch +Patch9102: 0102-merge-sendmsg-write.patch +Patch9103: 0103-add-thread-select-path.patch +Patch9104: 0104-support-conf-control-app-bind-numa.patch +Patch9105: 0105-fix-epoll_wait-cover-kernel-event.patch +Patch9106: 0106-fix-read-stack-data-return-0-when-no-data.patch +Patch9107: 0107-fix-stack-wakeup-node-del.patch +Patch9108: 0108-avoid-useless-stack-check-wakeup-event.patch +Patch9109: 0109-fix-mesg-loss.patch +Patch9110: 0110-add-accept4-and-epoll_create1.patch +Patch9111: 0111-refactor-event-notice.patch %description %{name} is a high performance user-mode stack. @@ -140,6 +166,10 @@ install -Dpm 0640 %{_builddir}/%{name}-%{version}/src/ltran/ltran.conf %{b %config(noreplace) %{conf_path}/ltran.conf %changelog +* Sat Oct 08 2022 wuchangsheng - 1.0.1-16 +- refactor event + addapt for ceph client + * Mon Sep 05 2022 wuchangsheng - 1.0.1-15 - expand rpc msg pool size