From c33b3d26f695d342af3fa81ab404a366bb8ce873 Mon Sep 17 00:00:00 2001
From: Artem Boldariev <artem@boldariev.com>
Date: Wed, 3 Jul 2024 13:58:32 +0300
Subject: [PATCH] TCP/TLS DNS: unthrottle only when all input data processing

This commit ensures that we restart reading only when all DNS data in
the input buffer is processed, so that we will not get into the
situation where the buffer is overrun.

Conflict:NA
Reference:https://downloads.isc.org/isc/bind9/9.18.28/patches/0001-CVE-2024-0760.patch

---
 lib/isc/netmgr/netmgr-int.h |  27 +++++--
 lib/isc/netmgr/netmgr.c     |  79 ++++++++++++++----
 lib/isc/netmgr/tcp.c        |  71 +++++++++++++++-
 lib/isc/netmgr/tcpdns.c     |  59 +++++++++++++-
 lib/isc/netmgr/tlsdns.c     | 120 ++++++++++++++++++++-------
 lib/ns/client.c             | 156 +++++++++++++++++-------------------
 lib/ns/include/ns/client.h  |   6 +-
 7 files changed, 379 insertions(+), 139 deletions(-)

diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h
index 6aca9ab..bc1ba73 100644
--- a/lib/isc/netmgr/netmgr-int.h
+++ b/lib/isc/netmgr/netmgr-int.h
@@ -62,9 +62,10 @@
 #endif
 
 /*
- * The TCP receive buffer can fit one maximum sized DNS message plus its size,
- * the receive buffer here affects TCP, DoT and DoH.
+ * The TCP send and receive buffers can fit one maximum sized DNS message plus
+ * its size, the receive buffer here affects TCP, DoT and DoH.
  */
+#define ISC_NETMGR_TCP_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
 #define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
 
 /* Pick the larger buffer */
@@ -377,9 +378,10 @@ struct isc__nm_uvreq {
 	int magic;
 	isc_nmsocket_t *sock;
 	isc_nmhandle_t *handle;
-	char tcplen[2]; /* The TCP DNS message length */
-	uv_buf_t uvbuf; /* translated isc_region_t, to be
-			 * sent or received */
+	char tcplen[2];	      /* The TCP DNS message length */
+	uv_buf_t uvbuf;	      /* translated isc_region_t, to be
+			       * sent or received */
+	isc_region_t userbuf;
 	isc_sockaddr_t local; /* local address */
 	isc_sockaddr_t peer;  /* peer address */
 	isc__nm_cb_t cb;      /* callback */
@@ -998,7 +1000,6 @@ struct isc_nmsocket {
 		TLS_STATE_ERROR,
 		TLS_STATE_CLOSING
 	} state;
-	isc_region_t senddata;
 	ISC_LIST(isc__nm_uvreq_t) sendreqs;
 	bool cycle;
 	isc_result_t pending_error;
@@ -1063,6 +1064,12 @@ struct isc_nmsocket {
 	 */
 	uint64_t write_timeout;
 
+	/*
+	 * Reading was throttled over TCP as the peer does not read the
+	 * data we are sending back.
+	 */
+	bool reading_throttled;
+
 	/*% outer socket is for 'wrapped' sockets - e.g. tcpdns in tcp */
 	isc_nmsocket_t *outer;
 
@@ -2265,6 +2272,14 @@ isc__nmsocket_readtimeout_cb(uv_timer_t *timer);
 void
 isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult);
 
+/*%<
+ *
+ * Maximum number of simultaneous handles in flight supported for a single
+ * connected TCPDNS socket. This value was chosen arbitrarily, and may be
+ * changed in the future.
+ */
+#define STREAM_CLIENTS_PER_CONN 23
+
 #define UV_RUNTIME_CHECK(func, ret) \
 	if (ret != 0) { \
 		FATAL_ERROR("%s failed: %s\n", #func, uv_strerror(ret)); \
diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c
index 2310b4b..f9e3b70 100644
--- a/lib/isc/netmgr/netmgr.c
+++ b/lib/isc/netmgr/netmgr.c
@@ -49,8 +49,15 @@
  * How many isc_nmhandles and isc_nm_uvreqs will we be
  * caching for reuse in a socket.
  */
-#define ISC_NM_HANDLES_STACK_SIZE 600
-#define ISC_NM_REQS_STACK_SIZE    600
+#define ISC_NM_HANDLES_STACK_SIZE 16
+#define ISC_NM_REQS_STACK_SIZE    16
+
+/*%
+ * Same, but for UDP sockets which tend to need larger values as they
+ * process many requests per socket.
+ */
+#define ISC_NM_HANDLES_STACK_SIZE_UDP 64
+#define ISC_NM_REQS_STACK_SIZE_UDP    64
 
 /*%
  * Shortcut index arrays to get access to statistics counters.
@@ -1506,16 +1513,25 @@ void
 isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
 		    isc_sockaddr_t *iface FLARG) {
 	uint16_t family;
+	size_t inactive_handles_stack_size = ISC_NM_HANDLES_STACK_SIZE;
+	size_t inactive_reqs_stack_size = ISC_NM_REQS_STACK_SIZE;
 
 	REQUIRE(sock != NULL);
 	REQUIRE(mgr != NULL);
 
-	*sock = (isc_nmsocket_t){ .type = type,
-				  .fd = -1,
-				  .inactivehandles = isc_astack_new(
-					  mgr->mctx, ISC_NM_HANDLES_STACK_SIZE),
-				  .inactivereqs = isc_astack_new(
-					  mgr->mctx, ISC_NM_REQS_STACK_SIZE) };
+	if (type == isc_nm_udpsocket) {
+		inactive_handles_stack_size = ISC_NM_HANDLES_STACK_SIZE_UDP;
+		inactive_reqs_stack_size = ISC_NM_REQS_STACK_SIZE_UDP;
+	}
+
+	*sock = (isc_nmsocket_t){
+		.type = type,
+		.fd = -1,
+		.inactivehandles = isc_astack_new(mgr->mctx,
+						  inactive_handles_stack_size),
+		.inactivereqs = isc_astack_new(mgr->mctx,
+					       inactive_reqs_stack_size)
+	};
 
 	ISC_LIST_INIT(sock->tls.sendreqs);
 
@@ -2084,6 +2100,7 @@ isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) {
 
 	sock = req->sock;
 
+	isc__nm_start_reading(sock);
 	isc__nmsocket_reset(sock);
 }
 
@@ -2093,7 +2110,6 @@ isc__nmsocket_readtimeout_cb(uv_timer_t *timer) {
 
 	REQUIRE(VALID_NMSOCK(sock));
 	REQUIRE(sock->tid == isc_nm_tid());
-	REQUIRE(atomic_load(&sock->reading));
 
 	if (atomic_load(&sock->client)) {
 		uv_timer_stop(timer);
@@ -2340,8 +2356,10 @@ processbuffer(isc_nmsocket_t *sock) {
  * timers. If we do have a full message, reset the timer.
  *
  * Stop reading if this is a client socket, or if the server socket
- * has been set to sequential mode. In this case we'll be called again
- * later by isc__nm_resume_processing().
+ * has been set to sequential mode, or the number of queries we are
+ * processing simultaneously has reached the clients-per-connection
+ * limit. In this case we'll be called again later by
+ * isc__nm_resume_processing().
  */
 isc_result_t
 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
@@ -2349,14 +2367,41 @@ isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
 		int_fast32_t ah = atomic_load(&sock->ah);
 		isc_result_t result = processbuffer(sock);
 		switch (result) {
-		case ISC_R_NOMORE:
+		case ISC_R_NOMORE: {
 			/*
 			 * Don't reset the timer until we have a
 			 * full DNS message.
 			 */
-			result = isc__nm_start_reading(sock);
-			if (result != ISC_R_SUCCESS) {
-				return (result);
+
+			/*
+			 * Restart reading if we have less data in the send
+			 * queue than the send buffer size, this means that the
+			 * TCP client has started reading some data again.
+			 * Starting reading when we go under the limit instead
+			 * of waiting for all data has been flushed allows
+			 * faster recovery (in case there was a congestion and
+			 * now there isn't).
+			 */
+			size_t write_queue_size =
+				uv_stream_get_write_queue_size(
+					&sock->uv_handle.stream);
+			if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
+				if (sock->reading_throttled) {
+					isc_log_write(isc_lctx,
+						      ISC_LOGCATEGORY_GENERAL,
+						      ISC_LOGMODULE_NETMGR,
+						      ISC_LOG_DEBUG(3),
+						      "resuming TCP "
+						      "connection, the other "
+						      "side is reading the "
+						      "data again (%zu)",
+						      write_queue_size);
+					sock->reading_throttled = false;
+				}
+				result = isc__nm_start_reading(sock);
+				if (result != ISC_R_SUCCESS) {
+					return (result);
+				}
 			}
 			/*
 			 * Start the timer only if there are no externally used
@@ -2368,6 +2413,7 @@ isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
 				isc__nmsocket_timer_start(sock);
 			}
 			goto done;
+		}
 		case ISC_R_CANCELED:
 			isc__nmsocket_timer_stop(sock);
 			isc__nm_stop_reading(sock);
@@ -2381,7 +2427,8 @@ isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
 			isc__nmsocket_timer_stop(sock);
 
 			if (atomic_load(&sock->client) ||
-			    atomic_load(&sock->sequential))
+			    atomic_load(&sock->sequential) ||
+			    atomic_load(&sock->ah) >= STREAM_CLIENTS_PER_CONN)
 			{
 				isc__nm_stop_reading(sock);
 				goto done;
diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c
index 16b53cc..37d44bd 100644
--- a/lib/isc/netmgr/tcp.c
+++ b/lib/isc/netmgr/tcp.c
@@ -766,7 +766,7 @@ isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0) {
 	isc__netievent_tcpstartread_t *ievent =
 		(isc__netievent_tcpstartread_t *)ev0;
 	isc_nmsocket_t *sock = ievent->sock;
-	isc_result_t result;
+	isc_result_t result = ISC_R_SUCCESS;
 
 	REQUIRE(VALID_NMSOCK(sock));
 	REQUIRE(sock->tid == isc_nm_tid());
@@ -774,7 +774,7 @@ isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0) {
 
 	if (isc__nmsocket_closing(sock)) {
 		result = ISC_R_CANCELED;
-	} else {
+	} else if (!sock->reading_throttled) {
 		result = isc__nm_start_reading(sock);
 	}
 
@@ -905,6 +905,32 @@ isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
 
 	/* The readcb could have paused the reading */
 	if (atomic_load(&sock->reading)) {
+		if (!sock->client) {
+			/*
+			 * Stop reading if we have accumulated enough bytes in
+			 * the send queue; this means that the TCP client is not
+			 * reading back the data we sending to it, and there's
+			 * no reason to continue processing more incoming DNS
+			 * messages, if the client is not reading back the
+			 * responses.
+			 */
+			size_t write_queue_size =
+				uv_stream_get_write_queue_size(
+					&sock->uv_handle.stream);
+
+			if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
+				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+					      ISC_LOGMODULE_NETMGR,
+					      ISC_LOG_DEBUG(3),
+					      "throttling TCP connection, "
+					      "the other side is "
+					      "not reading the data (%zu)",
+					      write_queue_size);
+				sock->reading_throttled = true;
+				isc__nm_stop_reading(sock);
+			}
+		}
+
 		/* The timer will be updated */
 		isc__nmsocket_timer_restart(sock);
 	}
@@ -1095,6 +1121,34 @@ isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region,
 		return;
 	}
 
+static void
+tcp_maybe_restart_reading(isc_nmsocket_t *sock) {
+	if (!sock->client && sock->reading_throttled &&
+	    !uv_is_active(&sock->uv_handle.handle))
+	{
+		/*
+		 * Restart reading if we have less data in the send queue than
+		 * the send buffer size, this means that the TCP client has
+		 * started reading some data again. Starting reading when we go
+		 * under the limit instead of waiting for all data has been
+		 * flushed allows faster recovery (in case there was a
+		 * congestion and now there isn't).
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+		if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(
+				isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				"resuming TCP connection, the other side "
+				"is reading the data again (%zu)",
+				write_queue_size);
+			sock->reading_throttled = false;
+			isc__nm_start_reading(sock);
+		}
+	}
+}
+
 static void
 tcp_send_cb(uv_write_t *req, int status) {
 	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
@@ -1112,10 +1166,23 @@ tcp_send_cb(uv_write_t *req, int status) {
 		isc__nm_incstats(sock, STATID_SENDFAIL);
 		isc__nm_failed_send_cb(sock, uvreq,
 				       isc__nm_uverr2result(status));
+
+		if (!sock->client &&
+		    (atomic_load(&sock->reading) || sock->reading_throttled))
+		{
+			/*
+			 * As we are resuming reading, it is not throttled
+			 * anymore (technically).
+			 */
+			sock->reading_throttled = false;
+			isc__nm_start_reading(sock);
+			isc__nmsocket_reset(sock);
+		}
 		return;
 	}
 
 	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
+	tcp_maybe_restart_reading(sock);
 }
 
 /*
diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c
index 46958d0..6d417f7 100644
--- a/lib/isc/netmgr/tcpdns.c
+++ b/lib/isc/netmgr/tcpdns.c
@@ -733,7 +733,7 @@ isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0) {
 	isc__netievent_tcpdnsread_t *ievent =
 		(isc__netievent_tcpdnsread_t *)ev0;
 	isc_nmsocket_t *sock = ievent->sock;
-	isc_result_t result;
+	isc_result_t result = ISC_R_SUCCESS;
 
 	UNUSED(worker);
 
@@ -742,7 +742,7 @@ isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0) {
 
 	if (isc__nmsocket_closing(sock)) {
 		result = ISC_R_CANCELED;
-	} else {
+	} else if (!sock->reading_throttled) {
 		result = isc__nm_process_sock_buffer(sock);
 	}
 
@@ -905,6 +905,28 @@ isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread,
 	result = isc__nm_process_sock_buffer(sock);
 	if (result != ISC_R_SUCCESS) {
 		isc__nm_failed_read_cb(sock, result, true);
+	} else if (!sock->client) {
+		/*
+		 * Stop reading if we have accumulated enough bytes in
+		 * the send queue; this means that the TCP client is not
+		 * reading back the data we sending to it, and there's
+		 * no reason to continue processing more incoming DNS
+		 * messages, if the client is not reading back the
+		 * responses.
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+
+		if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				      ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				      "throttling TCP connection, "
+				      "the other side is "
+				      "not reading the data (%zu)",
+				      write_queue_size);
+			sock->reading_throttled = true;
+			isc__nm_stop_reading(sock);
+		}
 	}
 free:
 	if (nread < 0) {
@@ -1125,6 +1147,19 @@ isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region,
 	return;
 }
 
+static void
+tcpdns_maybe_restart_reading(isc_nmsocket_t *sock) {
+	if (!sock->client && sock->reading_throttled &&
+	    !uv_is_active(&sock->uv_handle.handle))
+	{
+		isc_result_t result = isc__nm_process_sock_buffer(sock);
+		if (result != ISC_R_SUCCESS) {
+			atomic_store(&sock->reading, true);
+			isc__nm_failed_read_cb(sock, result, false);
+		}
+	}
+}
+
 static void
 tcpdns_send_cb(uv_write_t *req, int status) {
 	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
@@ -1142,10 +1177,23 @@ tcpdns_send_cb(uv_write_t *req, int status) {
 		isc__nm_incstats(sock, STATID_SENDFAIL);
 		isc__nm_failed_send_cb(sock, uvreq,
 				       isc__nm_uverr2result(status));
+
+		if (!sock->client &&
+		    (atomic_load(&sock->reading) || sock->reading_throttled))
+		{
+			/*
+			 * As we are resuming reading, it is not throttled
+			 * anymore (technically).
+			 */
+			sock->reading_throttled = false;
+			isc__nm_start_reading(sock);
+			isc__nmsocket_reset(sock);
+		}
 		return;
 	}
 
 	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
+	tcpdns_maybe_restart_reading(sock);
 }
 
 /*
@@ -1211,6 +1259,13 @@ isc__nm_async_tcpdnssend(isc__networker_t *worker, isc__netievent_t *ev0) {
 		goto fail;
 	}
 
+	isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
+		      ISC_LOG_DEBUG(3),
+		      "throttling TCP connection, the other side is not "
+		      "reading the data, switching to uv_write()");
+	sock->reading_throttled = true;
+	isc__nm_stop_reading(sock);
+
 	r = uv_write(&uvreq->uv_req.write, &sock->uv_handle.stream, bufs, nbufs,
 		     tcpdns_send_cb);
 	if (r < 0) {
diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c
index 40e6fc8..f62dfd4 100644
--- a/lib/isc/netmgr/tlsdns.c
+++ b/lib/isc/netmgr/tlsdns.c
@@ -88,6 +88,9 @@ tlsdns_set_tls_shutdown(isc_tls_t *tls) {
 	(void)SSL_set_shutdown(tls, SSL_SENT_SHUTDOWN);
 }
 
+static void
+tlsdns_maybe_restart_reading(isc_nmsocket_t *sock);
+
 static bool
 peer_verification_has_failed(isc_nmsocket_t *sock) {
 	if (sock->tls.tls != NULL && sock->tls.state == TLS_STATE_HANDSHAKE &&
@@ -1076,6 +1079,19 @@ tls_cycle_input(isc_nmsocket_t *sock) {
 		size_t len;
 
 		for (;;) {
+			/*
+			 * There is a similar branch in
+			 * isc__nm_process_sock_buffer() which is sufficient to
+			 * stop excessive processing in TCP. However, as we wrap
+			 * this call in a loop, we need to have it here in order
+			 * to limit the number of loop iterations (and,
+			 * consequently, the number of messages processed).
+			 */
+			if (atomic_load(&sock->ah) >= STREAM_CLIENTS_PER_CONN) {
+				isc__nm_stop_reading(sock);
+				break;
+			}
+
 			(void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0);
 
 			int pending = SSL_pending(sock->tls.tls);
@@ -1253,17 +1269,17 @@ call_pending_send_callbacks(isc_nmsocket_t *sock, const isc_result_t result) {
 }
 
 static void
-free_senddata(isc_nmsocket_t *sock, const isc_result_t result) {
+free_senddata(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
+	      const isc_result_t result) {
 	REQUIRE(VALID_NMSOCK(sock));
-	REQUIRE(sock->tls.senddata.base != NULL);
-	REQUIRE(sock->tls.senddata.length > 0);
+	REQUIRE(req != NULL && req->userbuf.base != NULL &&
+		req->userbuf.length > 0);
 
-	isc_mem_put(sock->mgr->mctx, sock->tls.senddata.base,
-		    sock->tls.senddata.length);
-	sock->tls.senddata.base = NULL;
-	sock->tls.senddata.length = 0;
+	isc_mem_put(sock->mgr->mctx, req->userbuf.base, req->userbuf.length);
 
 	call_pending_send_callbacks(sock, result);
+
+	isc__nm_uvreq_put(&req, sock);
 }
 
 static void
@@ -1276,11 +1292,19 @@ tls_write_cb(uv_write_t *req, int status) {
 	isc_nm_timer_stop(uvreq->timer);
 	isc_nm_timer_detach(&uvreq->timer);
 
-	free_senddata(sock, result);
-
-	isc__nm_uvreq_put(&uvreq, sock);
+	free_senddata(sock, uvreq, result);
 
 	if (status != 0) {
+		if (!sock->client &&
+		    (atomic_load(&sock->reading) || sock->reading_throttled))
+		{
+			/*
+			 * As we are resuming reading, it is not throttled
+			 * anymore (technically).
+			 */
+			sock->reading_throttled = false;
+			isc__nm_start_reading(sock);
+		}
 		tls_error(sock, result);
 		return;
 	}
@@ -1290,6 +1314,8 @@ tls_write_cb(uv_write_t *req, int status) {
 		tls_error(sock, result);
 		return;
 	}
+
+	tlsdns_maybe_restart_reading(sock);
 }
 
 static isc_result_t
@@ -1303,23 +1329,18 @@ tls_cycle_output(isc_nmsocket_t *sock) {
 		int rv;
 		int r;
 
-		if (sock->tls.senddata.base != NULL ||
-		    sock->tls.senddata.length > 0)
-		{
-			break;
-		}
-
 		if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) {
 			pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE;
 		}
 
-		sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending);
-		sock->tls.senddata.length = pending;
-
 		/* It's a bit misnomer here, but it does the right thing */
 		req = isc__nm_get_read_req(sock, NULL);
-		req->uvbuf.base = (char *)sock->tls.senddata.base;
-		req->uvbuf.len = sock->tls.senddata.length;
+
+		req->userbuf.base = isc_mem_get(sock->mgr->mctx, pending);
+		req->userbuf.length = (size_t)pending;
+
+		req->uvbuf.base = (char *)req->userbuf.base;
+		req->uvbuf.len = (size_t)req->userbuf.length;
 
 		rv = BIO_read_ex(sock->tls.app_rbio, req->uvbuf.base,
 				 req->uvbuf.len, &bytes);
@@ -1331,32 +1352,36 @@ tls_cycle_output(isc_nmsocket_t *sock) {
 
 		if (r == pending) {
 			/* Wrote everything, restart */
-			isc__nm_uvreq_put(&req, sock);
-			free_senddata(sock, ISC_R_SUCCESS);
+			free_senddata(sock, req, ISC_R_SUCCESS);
 			continue;
 		}
 
 		if (r > 0) {
 			/* Partial write, send rest asynchronously */
-			memmove(req->uvbuf.base, req->uvbuf.base + r,
-				req->uvbuf.len - r);
-			req->uvbuf.len = req->uvbuf.len - r;
+			req->uvbuf.base += r;
+			req->uvbuf.len -= r;
 		} else if (r == UV_ENOSYS || r == UV_EAGAIN) {
 			/* uv_try_write is not supported, send
 			 * asynchronously */
 		} else {
 			result = isc__nm_uverr2result(r);
-			isc__nm_uvreq_put(&req, sock);
-			free_senddata(sock, result);
+			free_senddata(sock, req, result);
 			break;
 		}
 
+		isc_log_write(
+			isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
+			ISC_LOG_DEBUG(3),
+			"throttling TCP connection, the other side is not "
+			"reading the data, switching to uv_write()");
+		sock->reading_throttled = true;
+		isc__nm_stop_reading(sock);
+
 		r = uv_write(&req->uv_req.write, &sock->uv_handle.stream,
 			     &req->uvbuf, 1, tls_write_cb);
 		if (r < 0) {
 			result = isc__nm_uverr2result(r);
-			isc__nm_uvreq_put(&req, sock);
-			free_senddata(sock, result);
+			free_senddata(sock, req, result);
 			break;
 		}
 
@@ -1525,6 +1550,28 @@ isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread,
 	result = tls_cycle(sock);
 	if (result != ISC_R_SUCCESS) {
 		isc__nm_failed_read_cb(sock, result, true);
+	} else if (!sock->client) {
+		/*
+		 * Stop reading if we have accumulated enough bytes in
+		 * the send queue; this means that the TCP client is not
+		 * reading back the data we sending to it, and there's
+		 * no reason to continue processing more incoming DNS
+		 * messages, if the client is not reading back the
+		 * responses.
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+
+		if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				      ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				      "throttling TCP connection, "
+				      "the other side is "
+				      "not reading the data (%zu)",
+				      write_queue_size);
+			sock->reading_throttled = true;
+			isc__nm_stop_reading(sock);
+		}
 	}
 free:
 	async_tlsdns_cycle(sock);
@@ -1766,6 +1813,19 @@ isc__nm_tlsdns_send(isc_nmhandle_t *handle, isc_region_t *region,
 	return;
 }
 
+static void
+tlsdns_maybe_restart_reading(isc_nmsocket_t *sock) {
+	if (!sock->client && sock->reading_throttled &&
+	    !uv_is_active(&sock->uv_handle.handle))
+	{
+		isc_result_t result = isc__nm_process_sock_buffer(sock);
+		if (result != ISC_R_SUCCESS) {
+			atomic_store(&sock->reading, true);
+			isc__nm_failed_read_cb(sock, result, false);
+		}
+	}
+}
+
 /*
  * Handle 'tcpsend' async event - send a packet on the socket
  */
diff --git a/lib/ns/client.c b/lib/ns/client.c
index a62343b..8981222 100644
--- a/lib/ns/client.c
+++ b/lib/ns/client.c
@@ -101,6 +101,9 @@
 #define COOKIE_SIZE 24U /* 8 + 4 + 4 + 8 */
 #define ECS_SIZE 20U	/* 2 + 1 + 1 + [0..16] */
 
+#define TCPBUFFERS_FILLCOUNT 1U
+#define TCPBUFFERS_FREEMAX   8U
+
 #define WANTNSID(x) (((x)->attributes & NS_CLIENTATTR_WANTNSID) != 0)
 #define WANTEXPIRE(x) (((x)->attributes & NS_CLIENTATTR_WANTEXPIRE) != 0)
 #define WANTPAD(x) (((x)->attributes & NS_CLIENTATTR_WANTPAD) != 0)
@@ -330,12 +333,36 @@ client_senddone(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) {
 			      NS_LOGMODULE_CLIENT, ISC_LOG_DEBUG(3),
 			      "send failed: %s",
 			      isc_result_totext(result));
+		isc_nm_bad_request(handle);
 		}
 	}
 
 	isc_nmhandle_detach(&handle);
 }
 
+static void
+client_setup_tcp_buffer(ns_client_t *client) {
+	REQUIRE(client->tcpbuf == NULL);
+
+	client->tcpbuf = client->manager->tcp_buffer;
+	client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
+}
+
+static void
+client_put_tcp_buffer(ns_client_t *client) {
+	if (client->tcpbuf == NULL) {
+		return;
+	}
+
+	if (client->tcpbuf != client->manager->tcp_buffer) {
+		isc_mem_put(client->manager->mctx, client->tcpbuf,
+			    client->tcpbuf_size);
+	}
+
+	client->tcpbuf = NULL;
+	client->tcpbuf_size = 0;
+}
+
 static void
 client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
 		    unsigned char **datap) {
@@ -345,12 +372,9 @@ client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
 	REQUIRE(datap != NULL);
 
 	if (TCP_CLIENT(client)) {
-		INSIST(client->tcpbuf == NULL);
-		client->tcpbuf = isc_mem_get(client->manager->send_mctx,
-					     NS_CLIENT_TCP_BUFFER_SIZE);
-		client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
+		client_setup_tcp_buffer(client);
 		data = client->tcpbuf;
-		isc_buffer_init(buffer, data, NS_CLIENT_TCP_BUFFER_SIZE);
+		isc_buffer_init(buffer, data, client->tcpbuf_size);
 	} else {
 		data = client->sendbuf;
 		if ((client->attributes & NS_CLIENTATTR_HAVECOOKIE) == 0) {
@@ -383,11 +407,49 @@ client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) {
 
 	if (isc_buffer_base(buffer) == client->tcpbuf) {
 		size_t used = isc_buffer_usedlength(buffer);
-		client->tcpbuf = isc_mem_reget(client->manager->send_mctx,
-					       client->tcpbuf,
-					       client->tcpbuf_size, used);
-		client->tcpbuf_size = used;
-		r.base = client->tcpbuf;
+		INSIST(client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE);
+
+		/*
+		 * Copy the data into a smaller buffer before sending,
+		 * and keep the original big TCP send buffer for reuse
+		 * by other clients.
+		 */
+		if (used > NS_CLIENT_SEND_BUFFER_SIZE) {
+			/*
+			 * We can save space by allocating a new buffer with a
+			 * correct size and freeing the big buffer.
+			 */
+			unsigned char *new_tcpbuf =
+				isc_mem_get(client->manager->mctx, used);
+			memmove(new_tcpbuf, buffer->base, used);
+
+			/*
+			 * Put the big buffer so we can replace the pointer
+			 * and the size with the new ones.
+			 */
+			client_put_tcp_buffer(client);
+
+			/*
+			 * Keep the new buffer's information so it can be freed.
+			 */
+			client->tcpbuf = new_tcpbuf;
+			client->tcpbuf_size = used;
+
+			r.base = new_tcpbuf;
+		} else {
+			/*
+			 * The data fits in the available space in
+			 * 'sendbuf', there is no need for a new buffer.
+			 */
+			memmove(client->sendbuf, buffer->base, used);
+
+			/*
+			 * Put the big buffer, we don't need a dynamic buffer.
+			 */
+			client_put_tcp_buffer(client);
+
+			r.base = client->sendbuf;
+		}
 		r.length = used;
 	} else {
 		isc_buffer_usedregion(buffer, &r);
@@ -461,8 +523,7 @@ ns_client_sendraw(ns_client_t *client, dns_message_t *message) {
 	return;
 done:
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	ns_client_drop(client, result);
@@ -746,8 +807,7 @@ renderend:
 
 cleanup:
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	if (cleanup_cctx) {
@@ -1629,8 +1689,7 @@ ns__client_reset_cb(void *client0) {
 
 	ns_client_endrequest(client);
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	if (client->keytag != NULL) {
@@ -1661,8 +1720,6 @@ ns__client_put_cb(void *client0) {
 	client->magic = 0;
 	client->shuttingdown = true;
 
-	isc_mem_put(client->manager->send_mctx, client->sendbuf,
-		    NS_CLIENT_SEND_BUFFER_SIZE);
 	if (client->opt != NULL) {
 		INSIST(dns_rdataset_isassociated(client->opt));
 		dns_rdataset_disassociate(client->opt);
@@ -2339,8 +2396,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 	dns_message_create(client->mctx, DNS_MESSAGE_INTENTPARSE,
 			   &client->message);
 
-	client->sendbuf = isc_mem_get(client->manager->send_mctx,
-				      NS_CLIENT_SEND_BUFFER_SIZE);
 	/*
 	 * Set magic earlier than usual because ns_query_init()
 	 * and the functions it calls will require it.
@@ -2357,7 +2412,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 		ns_clientmgr_t *oldmgr = client->manager;
 		ns_server_t *sctx = client->sctx;
 		isc_task_t *task = client->task;
-		unsigned char *sendbuf = client->sendbuf;
 		dns_message_t *message = client->message;
 		isc_mem_t *oldmctx = client->mctx;
 		ns_query_t query = client->query;
@@ -2372,7 +2426,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 					  .manager = oldmgr,
 					  .sctx = sctx,
 					  .task = task,
-					  .sendbuf = sendbuf,
 					  .message = message,
 					  .query = query,
 					  .tid = tid };
@@ -2397,8 +2450,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 	return (ISC_R_SUCCESS);
 
 cleanup:
-	isc_mem_put(client->manager->send_mctx, client->sendbuf,
-		    NS_CLIENT_SEND_BUFFER_SIZE);
 	dns_message_detach(&client->message);
 	isc_task_detach(&client->task);
 	ns_clientmgr_detach(&client->manager);
@@ -2461,8 +2512,6 @@ clientmgr_destroy(ns_clientmgr_t *manager) {
 	isc_task_detach(&manager->task);
 	ns_server_detach(&manager->sctx);
 
-	isc_mem_detach(&manager->send_mctx);
-
 	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
 }
 
@@ -2499,61 +2548,6 @@ ns_clientmgr_create(ns_server_t *sctx, isc_taskmgr_t *taskmgr,
 
 	ISC_LIST_INIT(manager->recursing);
 
-	/*
-	 * We create specialised per-worker memory context specifically
-	 * dedicated and tuned for allocating send buffers as it is a very
-	 * common operation. Not doing so may result in excessive memory
-	 * use in certain workloads.
-	 *
-	 * Please see this thread for more details:
-	 *
-	 * https://github.com/jemalloc/jemalloc/issues/2483
-	 *
-	 * In particular, this information from the jemalloc developers is
-	 * of the most interest:
-	 *
-	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1639019699
-	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1698173849
-	 *
-	 * In essence, we use the following memory management strategy:
-	 *
-	 * 1. We use a per-worker memory arena for send buffers memory
-	 * allocation to reduce lock contention (In reality, we create a
-	 * per-client manager arena, but we have one client manager per
-	 * worker).
-	 *
-	 * 2. The automatically created arenas settings remain unchanged
-	 * and may be controlled by users (e.g. by setting the
-	 * "MALLOC_CONF" variable).
-	 *
-	 * 3. We attune the arenas to not use dirty pages cache as the
-	 * cache would have a poor reuse rate, and that is known to
-	 * significantly contribute to excessive memory use.
-	 *
-	 * 4. There is no strict need for the dirty cache, as there is a
-	 * per arena bin for each allocation size, so because we initially
-	 * allocate strictly 64K per send buffer (enough for a DNS
-	 * message), allocations would get directed to one bin (an "object
-	 * pool" or a "slab") maintained within an arena. That is, there
-	 * is an object pool already, specifically to optimise for the
-	 * case of frequent allocations of objects of the given size. The
-	 * object pool should suffice our needs, as we will end up
-	 * recycling the objects from there without the need to back it by
-	 * an additional layer of dirty pages cache. The dirty pages cache
-	 * would have worked better in the case when there are more
-	 * allocation bins involved due to a higher reuse rate (the case
-	 * of a more "generic" memory management).
-	 */
-	isc_mem_create_arena(&manager->send_mctx);
-	isc_mem_setname(manager->send_mctx, "sendbufs");
-	(void)isc_mem_arena_set_dirty_decay_ms(manager->send_mctx, 0);
-	/*
-	 * Disable muzzy pages cache too, as versions < 5.2.0 have it
-	 * enabled by default. The muzzy pages cache goes right below the
-	 * dirty pages cache and backs it.
-	 */
-	(void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0);
-
 	manager->magic = MANAGER_MAGIC;
 
 	MTRACE("create");
diff --git a/lib/ns/include/ns/client.h b/lib/ns/include/ns/client.h
index 7a7196f..ea2d83e 100644
--- a/lib/ns/include/ns/client.h
+++ b/lib/ns/include/ns/client.h
@@ -144,7 +144,6 @@ struct ns_clientmgr {
 	unsigned int magic;
 
 	isc_mem_t *mctx;
-	isc_mem_t *send_mctx;
 	ns_server_t *sctx;
 	isc_taskmgr_t *taskmgr;
 	isc_timermgr_t *timermgr;
@@ -159,6 +158,8 @@ struct ns_clientmgr {
 	/* Lock covers the recursing list */
 	isc_mutex_t reclock;
 	client_list_t recursing; /*%< Recursing clients */
+
+	uint8_t tcp_buffer[NS_CLIENT_TCP_BUFFER_SIZE];
 };
 
 /*% nameserver client structure */
@@ -187,7 +188,6 @@ struct ns_client {
 	unsigned char *tcpbuf;
 	size_t tcpbuf_size;
 	dns_message_t *message;
-	unsigned char *sendbuf;
 	dns_rdataset_t *opt;
 	dns_ednsopt_t *ede;
 	uint16_t udpsize;
@@ -240,6 +240,8 @@ struct ns_client {
 	 * bits will be used as the rcode in the response message.
 	 */
 	int32_t rcode_override;
+
+	uint8_t sendbuf[NS_CLIENT_SEND_BUFFER_SIZE];
 };
 
 #define NS_CLIENT_MAGIC ISC_MAGIC('N', 'S', 'C', 'c')
--
2.33.0
