288 lines
14 KiB
Diff
288 lines
14 KiB
Diff
From 5d354e525a56955ae7f68062e283dda85ab07794 Mon Sep 17 00:00:00 2001
|
|
From: Yu Watanabe <watanabe.yu+github@gmail.com>
|
|
Date: Tue, 15 Mar 2022 13:50:06 +0900
|
|
Subject: [PATCH] udev: requeue event when the corresponding block device is
|
|
locked by another process
|
|
|
|
Previously, if a block device is locked by another process, then the
|
|
corresponding worker skips processing the corresponding event, and does
|
|
not broadcast the uevent to libudev listeners. This causes several issues:
|
|
|
|
- During a period of a device being locked by a process, if a user triggers
|
|
an event with `udevadm trigger --settle`, then it never returns.
|
|
|
|
- When there is a delay between close and unlock in a process, then the
|
|
synthesized events triggered by inotify may not be processed. This can
|
|
happen easily by wrapping mkfs with flock. This causes severe issues,
|
|
e.g. new devlinks are not created, or old devlinks are not removed.
|
|
|
|
This commit requeues events with a tiny delay when the corresponding
|
|
block devices are locked by other processes. This way, the triggered
|
|
uevent may be delayed but is always processed by udevd. Hence, the above
|
|
issues can be solved. Also, it is not necessary to watch a block device
|
|
unconditionally when it is already locked. Hence, the logic is dropped.
|
|
|
|
Reference:https://github.com/systemd/systemd/commit/5d354e525a56955ae7f68062e283dda85ab07794
|
|
Conflict:adaption
|
|
|
|
---
|
|
src/udev/udevd.c | 154 +++++++++++++++++++++++++++++------------------
|
|
1 file changed, 97 insertions(+), 57 deletions(-)
|
|
|
|
diff --git a/src/udev/udevd.c b/src/udev/udevd.c
|
|
index d153b03a38e1..973727375b67 100644
|
|
--- a/src/udev/udevd.c
|
|
+++ b/src/udev/udevd.c
|
|
@@ -70,6 +70,8 @@
|
|
#include "version.h"
|
|
|
|
#define WORKER_NUM_MAX 2048U
|
|
+#define EVENT_RETRY_INTERVAL_USEC (200 * USEC_PER_MSEC)
|
|
+#define EVENT_RETRY_TIMEOUT_USEC (3 * USEC_PER_MINUTE)
|
|
|
|
static bool arg_debug = false;
|
|
static int arg_daemonize = false;
|
|
@@ -128,6 +130,8 @@ typedef struct Event {
|
|
sd_device_action_t action;
|
|
uint64_t seqnum;
|
|
uint64_t blocker_seqnum;
|
|
+ usec_t retry_again_next_usec;
|
|
+ usec_t retry_again_timeout_usec;
|
|
|
|
sd_event_source *timeout_warning_event;
|
|
sd_event_source *timeout_event;
|
|
@@ -152,8 +156,13 @@ typedef struct Worker {
|
|
} Worker;
|
|
|
|
/* passed from worker to main process */
|
|
-typedef struct WorkerMessage {
|
|
-} WorkerMessage;
|
|
+typedef enum EventResult {
|
|
+ EVENT_RESULT_SUCCESS,
|
|
+ EVENT_RESULT_FAILED,
|
|
+ EVENT_RESULT_TRY_AGAIN, /* when the block device is locked by another process. */
|
|
+ _EVENT_RESULT_MAX,
|
|
+ _EVENT_RESULT_INVALID = -EINVAL,
|
|
+} EventResult;
|
|
|
|
static Event *event_free(Event *event) {
|
|
if (!event)
|
|
@@ -360,10 +369,11 @@ static void device_broadcast(sd_device_monitor *monitor, sd_device *dev) {
|
|
"Failed to broadcast event to libudev listeners, ignoring: %m");
|
|
}
|
|
|
|
-static int worker_send_message(int fd) {
|
|
- WorkerMessage message = {};
|
|
+static int worker_send_result(Manager *manager, EventResult result) {
|
|
+ assert(manager);
|
|
+ assert(manager->worker_watch[WRITE_END] >= 0);
|
|
|
|
- return loop_write(fd, &message, sizeof(message), false);
|
|
+ return loop_write(manager->worker_watch[WRITE_END], &result, sizeof(result), false);
|
|
}
|
|
|
|
static int worker_lock_block_device(sd_device *dev, int *ret_fd) {
|
|
@@ -490,44 +500,12 @@ static int worker_process_device(Manager *manager, sd_device *dev) {
|
|
if (!udev_event)
|
|
return -ENOMEM;
|
|
|
|
+ /* If this is a block device and the device is locked currently via the BSD advisory locks,
|
|
+ * someone else is using it exclusively. We don't run our udev rules now to not interfere.
|
|
+ * Instead of processing the event, we requeue the event and will try again after a delay.
|
|
+ *
|
|
+ * The user-facing side of this: https://systemd.io/BLOCK_DEVICE_LOCKING */
|
|
r = worker_lock_block_device(dev, &fd_lock);
|
|
- if (r == -EAGAIN) {
|
|
- /* So this is a block device and the device is locked currently via the BSD advisory locks —
|
|
- * someone else is exclusively using it. This means we don't run our udev rules now, to not
|
|
- * interfere. However we want to know when the device is unlocked again, and retrigger the
|
|
- * device again then, so that the rules are run eventually. For that we use IN_CLOSE_WRITE
|
|
- * inotify watches (which isn't exactly the same as waiting for the BSD locks to release, but
|
|
- * not totally off, as long as unlock+close() is done together, as it usually is).
|
|
- *
|
|
- * (The user-facing side of this: https://systemd.io/BLOCK_DEVICE_LOCKING)
|
|
- *
|
|
- * There's a bit of a chicken and egg problem here for this however: inotify watching is
|
|
- * supposed to be enabled via an option set via udev rules (OPTIONS+="watch"). If we skip the
|
|
- * udev rules here however (as we just said we do), we would thus never see that specific
|
|
- * udev rule, and thus never turn on inotify watching. But in order to catch up eventually
|
|
- * and run them we we need the inotify watching: hence a classic chicken and egg problem.
|
|
- *
|
|
- * Our way out here: if we see the block device locked, unconditionally watch the device via
|
|
- * inotify, regardless of any explicit request via OPTIONS+="watch". Thus, a device that is
|
|
- * currently locked via the BSD file locks will be treated as if we ran a single udev rule
|
|
- * only for it: the one that turns on inotify watching for it. If we eventually see the
|
|
- * inotify IN_CLOSE_WRITE event, and then run the rules after all and we then realize that
|
|
- * this wasn't actually requested (i.e. no OPTIONS+="watch" set) we'll simply turn off the
|
|
- * watching again (see below). Effectively this means: inotify watching is now enabled either
|
|
- * a) when the udev rules say so, or b) while the device is locked.
|
|
- *
|
|
- * Worst case scenario hence: in the (unlikely) case someone locked the device and we clash
|
|
- * with that we might do inotify watching for a brief moment for a device where we actually
|
|
- * weren't supposed to. But that shouldn't be too bad, in particular as BSD locks being taken
|
|
- * on a block device is kinda an indication that the inotify logic is desired too, to some
|
|
- * degree — they go hand-in-hand after all. */
|
|
-
|
|
- log_device_debug(dev, "Block device is currently locked, installing watch to wait until the lock is released.");
|
|
- (void) udev_watch_begin(manager->inotify_fd, dev);
|
|
-
|
|
- /* Now the watch is installed, let's lock the device again, maybe in the meantime things changed */
|
|
- r = worker_lock_block_device(dev, &fd_lock);
|
|
- }
|
|
if (r < 0)
|
|
return r;
|
|
|
|
@@ -560,25 +538,29 @@ static int worker_process_device(Manager *manager, sd_device *dev) {
|
|
|
|
static int worker_device_monitor_handler(sd_device_monitor *monitor, sd_device *dev, void *userdata) {
|
|
Manager *manager = userdata;
|
|
+ EventResult result;
|
|
int r;
|
|
|
|
assert(dev);
|
|
assert(manager);
|
|
|
|
r = worker_process_device(manager, dev);
|
|
- if (r == -EAGAIN)
|
|
- /* if we couldn't acquire the flock(), then proceed quietly */
|
|
- log_device_debug_errno(dev, r, "Device currently locked, not processing.");
|
|
- else {
|
|
- if (r < 0)
|
|
- log_device_warning_errno(dev, r, "Failed to process device, ignoring: %m");
|
|
+ if (r == -EAGAIN) {
|
|
+ /* if we couldn't acquire the flock(), then requeue the event */
|
|
+ result = EVENT_RESULT_TRY_AGAIN;
|
|
+ log_device_debug_errno(dev, r, "Block device is currently locked, requeueing the event.");
|
|
+ } else if (r < 0) {
|
|
+ result = EVENT_RESULT_FAILED;
|
|
+ log_device_warning_errno(dev, r, "Failed to process device, ignoring: %m");
|
|
+ } else
|
|
+ result = EVENT_RESULT_SUCCESS;
|
|
|
|
+ if (result != EVENT_RESULT_TRY_AGAIN)
|
|
/* send processed event back to libudev listeners */
|
|
device_broadcast(monitor, dev);
|
|
- }
|
|
|
|
/* send udevd the result of the event execution */
|
|
- r = worker_send_message(manager->worker_watch[WRITE_END]);
|
|
+ r = worker_send_result(manager, result);
|
|
if (r < 0)
|
|
log_device_warning_errno(dev, r, "Failed to send signal to main daemon, ignoring: %m");
|
|
|
|
@@ -794,6 +776,17 @@ static int event_is_blocked(Event *event) {
|
|
assert(event->manager);
|
|
assert(event->blocker_seqnum <= event->seqnum);
|
|
|
|
+ if (event->retry_again_next_usec > 0) {
|
|
+ usec_t now_usec;
|
|
+
|
|
+ r = sd_event_now(event->manager->event, clock_boottime_or_monotonic(), &now_usec);
|
|
+ if (r < 0)
|
|
+ return r;
|
|
+
|
|
+ if (event->retry_again_next_usec > now_usec)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
if (event->blocker_seqnum == event->seqnum)
|
|
/* we have checked previously and no blocker found */
|
|
return false;
|
|
@@ -980,6 +973,44 @@ static int event_queue_start(Manager *manager) {
|
|
return 0;
|
|
}
|
|
|
|
+static int event_requeue(Event *event) {
|
|
+ usec_t now_usec;
|
|
+ int r;
|
|
+
|
|
+ assert(event);
|
|
+ assert(event->manager);
|
|
+ assert(event->manager->event);
|
|
+
|
|
+ event->timeout_warning_event = sd_event_source_disable_unref(event->timeout_warning_event);
|
|
+ event->timeout_event = sd_event_source_disable_unref(event->timeout_event);
|
|
+
|
|
+ /* add a short delay to suppress busy loop */
|
|
+ r = sd_event_now(event->manager->event, clock_boottime_or_monotonic(), &now_usec);
|
|
+ if (r < 0)
|
|
+ return log_device_warning_errno(event->dev, r,
|
|
+ "Failed to get current time, "
|
|
+ "skipping event (SEQNUM=%"PRIu64", ACTION=%s): %m",
|
|
+ event->seqnum, strna(device_action_to_string(event->action)));
|
|
+
|
|
+ if (event->retry_again_timeout_usec > 0 && event->retry_again_timeout_usec <= now_usec)
|
|
+ return log_device_warning_errno(event->dev, SYNTHETIC_ERRNO(ETIMEDOUT),
|
|
+ "The underlying block device is locked by a process more than %s, "
|
|
+ "skipping event (SEQNUM=%"PRIu64", ACTION=%s).",
|
|
+ format_timespan((char[FORMAT_TIMESPAN_MAX]){}, FORMAT_TIMESPAN_MAX, EVENT_RETRY_TIMEOUT_USEC, USEC_PER_MINUTE),
|
|
+ event->seqnum, strna(device_action_to_string(event->action)));
|
|
+
|
|
+ event->retry_again_next_usec = usec_add(now_usec, EVENT_RETRY_INTERVAL_USEC);
|
|
+ if (event->retry_again_timeout_usec == 0)
|
|
+ event->retry_again_timeout_usec = usec_add(now_usec, EVENT_RETRY_TIMEOUT_USEC);
|
|
+
|
|
+ if (event->worker && event->worker->event == event)
|
|
+ event->worker->event = NULL;
|
|
+ event->worker = NULL;
|
|
+
|
|
+ event->state = EVENT_QUEUED;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static int event_queue_insert(Manager *manager, sd_device *dev) {
|
|
sd_device_action_t action;
|
|
uint64_t seqnum;
|
|
@@ -1054,11 +1085,8 @@ static int on_worker(sd_event_source *s, int fd, uint32_t revents, void *userdat
|
|
assert(manager);
|
|
|
|
for (;;) {
|
|
- WorkerMessage msg;
|
|
- struct iovec iovec = {
|
|
- .iov_base = &msg,
|
|
- .iov_len = sizeof(msg),
|
|
- };
|
|
+ EventResult result;
|
|
+ struct iovec iovec = IOVEC_MAKE(&result, sizeof(result));
|
|
CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control;
|
|
struct msghdr msghdr = {
|
|
.msg_iov = &iovec,
|
|
@@ -1081,7 +1109,7 @@ static int on_worker(sd_event_source *s, int fd, uint32_t revents, void *userdat
|
|
|
|
cmsg_close_all(&msghdr);
|
|
|
|
- if (size != sizeof(WorkerMessage)) {
|
|
+ if (size != sizeof(EventResult)) {
|
|
log_warning("Ignoring worker message with invalid size %zi bytes", size);
|
|
continue;
|
|
}
|
|
@@ -1106,6 +1134,11 @@ static int on_worker(sd_event_source *s, int fd, uint32_t revents, void *userdat
|
|
worker->state = WORKER_IDLE;
|
|
|
|
/* worker returned */
|
|
+ if (result == EVENT_RESULT_TRY_AGAIN &&
|
|
+ event_requeue(worker->event) < 0)
|
|
+ device_broadcast(manager->monitor, worker->event->dev);
|
|
+
|
|
+ /* When event_requeue() succeeds, worker->event is NULL, and event_free() handles NULL gracefully. */
|
|
event_free(worker->event);
|
|
}
|
|
|
|
@@ -1467,8 +1500,15 @@ static int on_post(sd_event_source *s, void *userdata) {
|
|
|
|
assert(manager);
|
|
|
|
- if (!LIST_IS_EMPTY(manager->events))
|
|
+ if (!LIST_IS_EMPTY(manager->events)) {
|
|
+ /* Try to process pending events if idle workers exist. Why is this necessary?
|
|
+ * When a worker finished an event and became idle, even if there was a pending event,
|
|
+ * the corresponding device might have been locked and the processing of the event
|
|
+ * delayed for a while, preventing the worker from processing the event immediately.
|
|
+ * Now, the device may be unlocked. Let's try again! */
|
|
+ event_queue_start(manager);
|
|
return 1;
|
|
+ }
|
|
|
|
/* There are no queued events. Let's remove /run/udev/queue and clean up the idle processes. */
|
|
|
|
|