async: Fix qemu main thread hang on weak ordered platfrom

aio-wait: delegate polling of main AioContext if BQL not held upstream_url: https://patchwork.kernel.org/patch/11482099/ async: use explicit memory barriers upstream_url: https://patchwork.kernel.org/patch/11482103/ Signed-off-by: Ying Fang <fanging1@huawei.com>
2020-04-11 00:28:01 +08:00 · 2020-04-11 00:28:01 +08:00 · e21eae0df2
commit e21eae0df2
parent c8a00f8f2a
3 changed files with 297 additions and 0 deletions
--- a/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
+++ b/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
@ -0,0 +1,120 @@
+From a2bae876b7f694b12073bac8ad6668e4d975ad88 Mon Sep 17 00:00:00 2001
+From: Ying Fang <fangying1@huawei.com>
+Date: Fri, 10 Apr 2020 16:08:19 +0000
+Subject: [PATCH 1/2] aio-wait: delegate polling of main AioContext if BQL not
+ held
+
+Any thread that is not a iothread returns NULL for qemu_get_current_aio_context().
+As a result, it would also return true for
+in_aio_context_home_thread(qemu_get_aio_context()), causing
+AIO_WAIT_WHILE to invoke aio_poll() directly.  This is incorrect
+if the BQL is not held, because aio_poll() does not expect to
+run concurrently from multiple threads, and it can actually
+happen when savevm writes to the vmstate file from the
+migration thread.
+
+Therefore, restrict in_aio_context_home_thread to return true
+for the main AioContext only if the BQL is held.
+
+The function is moved to aio-wait.h because it is mostly used
+there and to avoid a circular reference between main-loop.h
+and block/aio.h.
+
+upstream_url: https://patchwork.kernel.org/patch/11482099/
+
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Message-Id: <20200407140746.8041-5-pbonzini@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ include/block/aio-wait.h | 22 ++++++++++++++++++++++
+ include/block/aio.h      | 29 ++++++++++-------------------
+ 2 files changed, 32 insertions(+), 19 deletions(-)
+
+diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
+index afd0ff7e..d349e7e3 100644
+--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
+@@ -26,6 +26,7 @@
+ #define QEMU_AIO_WAIT_H
+ 
+ #include "block/aio.h"
+#include "qemu/main-loop.h"
+ 
+ /**
+  * AioWait:
+@@ -124,4 +125,25 @@ void aio_wait_kick(void);
+  */
+ void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
+ 
+/**
+ * in_aio_context_home_thread:
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the thread that normally runs @ctx.  Note
+ * that acquiring/releasing ctx does not affect the outcome, each AioContext
+ * still only has one home thread that is responsible for running it.
+ */
+static inline bool in_aio_context_home_thread(AioContext *ctx)
+{
+    if (ctx == qemu_get_current_aio_context()) {
+        return true;
+    }
+
+    if (ctx == qemu_get_aio_context()) {
+        return qemu_mutex_iothread_locked();
+    } else {
+        return false;
+    }
+}
+
+ #endif /* QEMU_AIO_WAIT */
+diff --git a/include/block/aio.h b/include/block/aio.h
+index 0ca25dfe..c527893b 100644
+--- a/include/block/aio.h
+++ b/include/block/aio.h
+@@ -61,12 +61,16 @@ struct AioContext {
+     QLIST_HEAD(, AioHandler) aio_handlers;
+ 
+     /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
+-     * accessed with atomic primitives.  If this field is 0, everything
+-     * (file descriptors, bottom halves, timers) will be re-evaluated
+-     * before the next blocking poll(), thus the event_notifier_set call
+-     * can be skipped.  If it is non-zero, you may need to wake up a
+-     * concurrent aio_poll or the glib main event loop, making
+-     * event_notifier_set necessary.
+     * only written from the AioContext home thread, or under the BQL in
+     * the case of the main AioContext.  However, it is read from any
+     * thread so it is still accessed with atomic primitives.
+     *
+     * If this field is 0, everything (file descriptors, bottom halves,
+     * timers) will be re-evaluated before the next blocking poll() or
+     * io_uring wait; therefore, the event_notifier_set call can be
+     * skipped.  If it is non-zero, you may need to wake up a concurrent
+     * aio_poll or the glib main event loop, making event_notifier_set
+     * necessary.
+      *
+      * Bit 0 is reserved for GSource usage of the AioContext, and is 1
+      * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
+@@ -581,19 +585,6 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
+  */
+ AioContext *qemu_get_current_aio_context(void);
+ 
+-/**
+- * in_aio_context_home_thread:
+- * @ctx: the aio context
+- *
+- * Return whether we are running in the thread that normally runs @ctx.  Note
+- * that acquiring/releasing ctx does not affect the outcome, each AioContext
+- * still only has one home thread that is responsible for running it.
+- */
+-static inline bool in_aio_context_home_thread(AioContext *ctx)
+-{
+-    return ctx == qemu_get_current_aio_context();
+-}
+-
+ /**
+  * aio_context_setup:
+  * @ctx: the aio context
+-- 
+2.25.2
+
--- a/async-use-explicit-memory-barriers.patch
+++ b/async-use-explicit-memory-barriers.patch
@ -0,0 +1,171 @@
+From 787af8ed2bc86dc8688727d62a251965d9c42e00 Mon Sep 17 00:00:00 2001
+From: Ying Fang <fangying1@huawei.com>
+Date: Fri, 10 Apr 2020 16:19:50 +0000
+Subject: [PATCH 2/2] async: use explicit memory barriers
+
+When using C11 atomics, non-seqcst reads and writes do not participate
+in the total order of seqcst operations.  In util/async.c and util/aio-posix.c,
+in particular, the pattern that we use
+
+        write ctx->notify_me                 write bh->scheduled
+        read bh->scheduled                   read ctx->notify_me
+        if !bh->scheduled, sleep             if ctx->notify_me, notify
+
+needs to use seqcst operations for both the write and the read.  In
+general this is something that we do not want, because there can be
+many sources that are polled in addition to bottom halves.  The
+alternative is to place a seqcst memory barrier between the write
+and the read.  This also comes with a disadvantage, in that the
+memory barrier is implicit on strongly-ordered architectures and
+it wastes a few dozen clock cycles.
+
+Fortunately, ctx->notify_me is never written concurrently by two
+threads, so we can assert that and relax the writes to ctx->notify_me.
+The resulting solution works and performs well on both aarch64 and x86.
+
+Note that the atomic_set/atomic_read combination is not an atomic
+read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED;
+on x86, ATOMIC_RELAXED compiles to a locked operation.
+
+upstream_url: https://patchwork.kernel.org/patch/11482103/
+
+Analyzed-by: Ying Fang <fangying1@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Tested-by: Ying Fang <fangying1@huawei.com>
+Message-Id: <20200407140746.8041-6-pbonzini@redhat.com>
+Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
+---
+ util/aio-posix.c | 16 ++++++++++++++--
+ util/aio-win32.c | 17 ++++++++++++++---
+ util/async.c     | 16 ++++++++++++----
+ 3 files changed, 40 insertions(+), 9 deletions(-)
+
+diff --git a/util/aio-posix.c b/util/aio-posix.c
+index 6fbfa792..ca58b9a4 100644
+--- a/util/aio-posix.c
+++ b/util/aio-posix.c
+@@ -613,6 +613,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
+     int64_t timeout;
+     int64_t start = 0;
+ 
+    /*
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+     */
+     assert(in_aio_context_home_thread(ctx));
+ 
+     /* aio_notify can avoid the expensive event_notifier_set if
+@@ -623,7 +628,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
+      * so disable the optimization now.
+      */
+     if (blocking) {
+-        atomic_add(&ctx->notify_me, 2);
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
+        /*
+         * Write ctx->notify_me before computing the timeout
+         * (reading bottom half flags, etc.).  Pairs with
+         * smp_mb in aio_notify().
+         */
+        smp_mb();
+     }
+ 
+     qemu_lockcnt_inc(&ctx->list_lock);
+@@ -668,7 +679,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
+     }
+ 
+     if (blocking) {
+-        atomic_sub(&ctx->notify_me, 2);
+        /* Finish the poll before clearing the flag.  */
+        atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
+         aio_notify_accept(ctx);
+     }
+ 
+diff --git a/util/aio-win32.c b/util/aio-win32.c
+index a23b9c36..729d533f 100644
+--- a/util/aio-win32.c
+++ b/util/aio-win32.c
+@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking)
+     int count;
+     int timeout;
+ 
+    /*
+     * There cannot be two concurrent aio_poll calls for the same AioContext (or
+     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+     */
+    assert(in_aio_context_home_thread(ctx));
+     progress = false;
+ 
+     /* aio_notify can avoid the expensive event_notifier_set if
+@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
+      * so disable the optimization now.
+      */
+     if (blocking) {
+-        atomic_add(&ctx->notify_me, 2);
+        atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
+        /*
+         * Write ctx->notify_me before computing the timeout
+         * (reading bottom half flags, etc.).  Pairs with
+         * smp_mb in aio_notify().
+         */
+        smp_mb();
+     }
+ 
+     qemu_lockcnt_inc(&ctx->list_lock);
+@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
+         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+         if (blocking) {
+             assert(first);
+-            assert(in_aio_context_home_thread(ctx));
+-            atomic_sub(&ctx->notify_me, 2);
+            atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
+             aio_notify_accept(ctx);
+         }
+ 
+diff --git a/util/async.c b/util/async.c
+index afc17fb3..12b33204 100644
+--- a/util/async.c
+++ b/util/async.c
+@@ -221,7 +221,14 @@ aio_ctx_prepare(GSource *source, gint    *timeout)
+ {
+     AioContext *ctx = (AioContext *) source;
+ 
+-    atomic_or(&ctx->notify_me, 1);
+    atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1);
+
+    /*
+     * Write ctx->notify_me before computing the timeout
+     * (reading bottom half flags, etc.).  Pairs with
+     * smp_mb in aio_notify().
+     */
+    smp_mb();
+ 
+     /* We assume there is no timeout already supplied */
+     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+@@ -239,7 +246,8 @@ aio_ctx_check(GSource *source)
+     AioContext *ctx = (AioContext *) source;
+     QEMUBH *bh;
+ 
+-    atomic_and(&ctx->notify_me, ~1);
+    /* Finish computing the timeout before clearing the flag.  */
+    atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1);
+     aio_notify_accept(ctx);
+ 
+     for (bh = ctx->first_bh; bh; bh = bh->next) {
+@@ -344,10 +352,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
+ void aio_notify(AioContext *ctx)
+ {
+     /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
+-     * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
+     * with smp_mb in aio_ctx_prepare or aio_poll.
+      */
+     smp_mb();
+-    if (ctx->notify_me) {
+    if (atomic_read(&ctx->notify_me)) {
+         event_notifier_set(&ctx->notifier);
+         atomic_mb_set(&ctx->notified, true);
+     }
+-- 
+2.25.2
+
--- a/qemu.spec
+++ b/qemu.spec
@ -61,6 +61,8 @@ Patch0048: COLO-compare-Fix-incorrect-if-logic.patch
 Patch0049: qcow2-bitmap-Fix-uint64_t-left-shift-overflow.patch
 Patch0050: pcie-Add-pcie-root-port-fast-plug-unplug-feature.patch
 Patch0051: pcie-Compat-with-devices-which-do-not-support-Link-W.patch
+Patch0052: aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
+Patch0053: async-use-explicit-memory-barriers.patch

 BuildRequires: flex
 BuildRequires: bison
@ -396,6 +398,10 @@ getent passwd qemu >/dev/null || \
 %endif

 %changelog
+* Sat Apr 11 4 2020 Huawei Technologies Co., Ltd. <fangying1@huawei.com>
+- aio-wait: delegate polling of main AioContext if BQL not held
+- async: use explicit memory barriers
+
 * Wed Mar 18 2020 Huawei Technologies Co., Ltd. <fangying1@huawei.com>
 - pcie: Add pcie-root-port fast plug/unplug feature
 - pcie: Compat with devices which do not support Link Width