fix RAID0 becoming unusable after setting fault of member disks

This commit is contained in:
wguanghao 2023-11-07 10:22:10 +08:00
parent ae1ce114a7
commit 5d15a7ad83
4 changed files with 190 additions and 1 deletions

View File

@ -0,0 +1,91 @@
From fc6fd4063769f4194c3fb8f77b32b2819e140fb9 Mon Sep 17 00:00:00 2001
From: Mateusz Kusiak <mateusz.kusiak@intel.com>
Date: Thu, 18 Aug 2022 11:47:21 +0200
Subject: [PATCH] Manage: Block unsafe member failing
The kernel may or may not block mdadm from removing a member device when
the removal would put the array into a failed state. It depends on the
RAID personality implementation in the kernel.
Add verification on the requested removal path (the "mdadm --set-faulty"
command).
Signed-off-by: Mateusz Kusiak <mateusz.kusiak@intel.com>
Signed-off-by: Jes Sorensen <jsorensen@fb.com>
---
Manage.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 52 insertions(+), 1 deletion(-)
diff --git a/Manage.c b/Manage.c
index a142f8bd..b1d0e630 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1285,6 +1285,50 @@ int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
return -1;
}
+/**
+ * is_remove_safe() - Check if remove is safe.
+ * @array: Array info (level, layout and raid_disks are used).
+ * @fd: Array file descriptor, used to read member state from sysfs.
+ * @devname: Name of device to remove; its first 5 characters are skipped.
+ * @verbose: If set, report a failure to read sysfs attributes.
+ *
+ * Marks every in-sync member other than &devname as available and asks
+ * enough() whether the array is still operational. Also returns false
+ * when sysfs cannot be read. &mdi is kept intact so it can be freed.
+ * Return: True if array will be operational, false otherwise.
+ */
+bool is_remove_safe(mdu_array_info_t *array, const int fd, char *devname, const int verbose)
+{
+	dev_t devid = devnm2devid(devname + 5);
+	struct mdinfo *mdi = sysfs_read(fd, NULL, GET_DEVS | GET_DISKS | GET_STATE);
+
+	if (!mdi) {
+		if (verbose)
+			pr_err("Failed to read sysfs attributes for %s\n", devname);
+		return false;
+	}
+
+	char *avail = xcalloc(array->raid_disks, sizeof(char));
+
+	for (struct mdinfo *disk = mdi->devs; disk; disk = disk->next) {
+		if (disk->disk.raid_disk < 0)
+			continue;
+		if (!(disk->disk.state & (1 << MD_DISK_SYNC)))
+			continue;
+		if (makedev(disk->disk.major, disk->disk.minor) == devid)
+			continue;
+		avail[disk->disk.raid_disk] = 1;
+	}
+	sysfs_free(mdi);
+
+	bool is_enough = enough(array->level, array->raid_disks,
+				array->layout, (array->state & 1),
+				avail);
+
+	free(avail);
+	return is_enough;
+}
+
int Manage_subdevs(char *devname, int fd,
struct mddev_dev *devlist, int verbose, int test,
char *update, int force)
@@ -1598,7 +1642,14 @@ int Manage_subdevs(char *devname, int fd,
break;
case 'f': /* set faulty */
- /* FIXME check current member */
+ if (!is_remove_safe(&array, fd, dv->devname, verbose)) {
+ pr_err("Cannot remove %s from %s, array will be failed.\n",
+ dv->devname, devname);
+ if (sysfd >= 0)
+ close(sysfd);
+ goto abort;
+ }
+
if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
rdev))) {
--
2.33.0

View File

@ -0,0 +1,33 @@
From b3e7b7eb1dfedd7cbd9a3800e884941f67d94c96 Mon Sep 17 00:00:00 2001
From: Kinga Tanska <kinga.tanska@intel.com>
Date: Tue, 27 Dec 2022 06:50:42 +0100
Subject: [PATCH] Manage: do not check array state when drive is removed
The array state doesn't need to be checked when a drive is
removed, but until now a clean state was required. The result
of the is_remove_safe() function is now independent of the
array state.
Signed-off-by: Kinga Tanska <kinga.tanska@intel.com>
Signed-off-by: Jes Sorensen <jes@trained-monkey.org>
---
Manage.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/Manage.c b/Manage.c
index 594e3d2c..4d6e54b1 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1321,8 +1321,7 @@ bool is_remove_safe(mdu_array_info_t *array, const int fd, char *devname, const
sysfs_free(mdi);
bool is_enough = enough(array->level, array->raid_disks,
- array->layout, (array->state & 1),
- avail);
+ array->layout, 1, avail);
free(avail);
return is_enough;
--
2.33.0

View File

@ -0,0 +1,59 @@
From 461fae7e7809670d286cc19aac5bfa861c29f93a Mon Sep 17 00:00:00 2001
From: Kinga Tanska <kinga.tanska@intel.com>
Date: Tue, 27 Dec 2022 06:50:43 +0100
Subject: [PATCH] incremental, manage: do not verify if remove is safe
Function is_remove_safe() was introduced to verify that removing
a member device won't cause a failed state of the array. This
verification should be used only with the set-faulty command. Add
a special mode indicating that Incremental removal was executed.
If this mode is used, do not execute the is_remove_safe() routine.
Signed-off-by: Kinga Tanska <kinga.tanska@intel.com>
Signed-off-by: Jes Sorensen <jes@trained-monkey.org>
---
Incremental.c | 2 +-
Manage.c | 7 ++++---
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/Incremental.c b/Incremental.c
index ff3548c0..09b94b9f 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -1744,7 +1744,7 @@ int IncrementalRemove(char *devname, char *id_path, int verbose)
memset(&devlist, 0, sizeof(devlist));
devlist.devname = devname;
- devlist.disposition = 'f';
+ devlist.disposition = 'I';
/* for a container, we must fail each member array */
if (ent->metadata_version &&
strncmp(ent->metadata_version, "external:", 9) == 0) {
diff --git a/Manage.c b/Manage.c
index 4d6e54b1..6184d3f7 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1494,8 +1494,9 @@ int Manage_subdevs(char *devname, int fd,
/* Assume this is a kernel-internal name like 'sda1' */
int found = 0;
char dname[55];
- if (dv->disposition != 'r' && dv->disposition != 'f') {
- pr_err("%s only meaningful with -r or -f, not -%c\n",
+ if (dv->disposition != 'r' && dv->disposition != 'f' &&
+ dv->disposition != 'I') {
+ pr_err("%s only meaningful with -r, -f or -I, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
@@ -1647,7 +1648,7 @@ int Manage_subdevs(char *devname, int fd,
close(sysfd);
goto abort;
}
-
+ case 'I': /* incremental fail */
if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
rdev))) {
--
2.33.0

View File

@ -1,6 +1,6 @@
Name: mdadm
Version: 4.2
Release: 9
Release: 10
Summary: The software RAID arrays user manage tools
License: GPLv2+
URL: http://www.kernel.org/pub/linux/utils/raid/mdadm/
@ -20,6 +20,9 @@ Patch7: 0007-DDF-Fix-NULL-pointer-dereference-in-validate_geometr.patch
Patch8: 0008-fix-NULL-dereference-in-super_by_fd.patch
Patch9: 0009-fix-mdmonitor-oneshot.service-start-error.patch
Patch10: 0010-Fix-null-pointer-for-incremental-in-mdadm.patch
Patch11: 0011-Manage-Block-unsafe-member-failing.patch
Patch12: 0012-Manage-do-not-check-array-state-when-drive-is-remove.patch
Patch13: 0013-incremental-manage-do-not-verify-if-remove-is-safe.patch
BuildRequires: systemd gcc binutils libudev-devel
Requires(post): systemd coreutils
@ -85,6 +88,9 @@ install -d -m 710 %{buildroot}/var/run/mdadm/
%{_mandir}/man*/*
%changelog
* Tue Nov 7 2023 wuguanghao <wuguanghao3@huawei.com> - 4.2-10
- fix RAID0 becoming unusable after setting fault of member disks
* Wed Sep 13 2023 miaoguanqin <miaoguanqin@huawei.com> - 4.2-9
- Fix null pointer for incremental in mdadm