diff mbox series

[-e2fsprogs] mke2fs: don't set the raid stripe for non-rotational devices by default

Message ID 20250505172732.570955-1-tytso@mit.edu
State New
Headers show
Series [-e2fsprogs] mke2fs: don't set the raid stripe for non-rotational devices by default | expand

Commit Message

Theodore Ts'o May 5, 2025, 5:27 p.m. UTC
The ext4 block allocator is not at all efficient when it is asked to
enforce RAID alignment.  It is especially bad for flash-based devices,
or when the file system is highly fragmented.  For non-rotational
devices, it's fine to set the stride parameter (which controls
spreading the allocation bitmaps across the RAID component devices,
which always makessense); but for the stripe parameter (which asks the
ext4 block alocator to try _very_ hard to find RAID stripe aligned
devices) it's probably not a good idea.

Add new mke2fs.conf parameters with the defaults:

[defaults]
   set_raid_stride = always
   set_raid_stripe = disk

Even for RAID arrays based on HDD's, we can still have problems for
highly fragmented file systems.  This will need to solved in the
kernel, probably by having some kind of wall clock or CPU time
limitation for each block allocation or adding some kind of
optimization which is faster than using our current buddy bitmap
implementation, especially if the stripe size is not multiple of a
power of two.  But for SSD's, it's much less likely to make sense even
if we have an optimized block allocator, because if you've paid $$$
for a flash-based RAID array, the cost/benefit tradeoffs of doing less
optimized stripe RMW cycles versus the block allocator time and CPU
overhead is harder to justify without a lot of optimization effort.

If and when we can improve the ext4 kernel implementation (and it gets
rolled out to users using LTS kernels), we can change the defaults.
And of course, system administrators can always change
/etc/mke2fs.conf settings.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 misc/mke2fs.c         | 73 +++++++++++++++++++++++++++++++++++++++++--
 misc/mke2fs.conf.5.in | 18 +++++++++++
 2 files changed, 88 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index f24076bc1..dfb4405a7 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -46,6 +46,9 @@  extern int optind;
 #ifdef HAVE_SYS_IOCTL_H
 #include <sys/ioctl.h>
 #endif
+#ifdef HAVE_SYS_SYSMACROS_H
+#include <sys/sysmacros.h>
+#endif
 #include <libgen.h>
 #include <limits.h>
 #include <blkid/blkid.h>
@@ -1571,6 +1574,36 @@  struct device_param {
 };
 
 #ifdef HAVE_BLKID_PROBE_GET_TOPOLOGY
+static int is_rotational (const char *device_name EXT2FS_ATTR((unused)))
+{
+	int		rotational = -1;
+#ifdef __linux__
+	char		path[1024];
+	struct stat	st;
+	FILE		*f;
+
+	if ((stat(device_name, &st) < 0) || !S_ISBLK(st.st_mode))
+		return -1;
+
+	snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/queue/rotational",
+		major(st.st_rdev), minor(st.st_rdev));
+	f = fopen(path, "r");
+	if (!f) {
+		snprintf(path, sizeof(path),
+			"/sys/dev/block/%d:%d/../queue/rotational",
+			major(st.st_rdev), minor(st.st_rdev));
+		f = fopen(path, "r");
+	}
+	if (f) {
+		if (fscanf(f, "%d", &rotational) != 1)
+			rotational = -1;
+		fclose(f);
+	}
+#endif
+	return rotational;
+}
+
+
 /*
  * Sets the geometry of a device (stripe/stride), and returns the
  * device's alignment offset, if any, or a negative error.
@@ -2429,13 +2462,47 @@  profile_error:
 			_("warning: Unable to get device geometry for %s\n"),
 			device_name);
 	} else {
+		int set_stripe, set_stride, rotational;
+
 		/* setting stripe/stride to blocksize is pointless */
-		if (dev_param.min_io > (unsigned) blocksize)
+		set_stride = dev_param.min_io > (unsigned) blocksize;
+		set_stripe = dev_param.opt_io > (unsigned) blocksize;
+		rotational = is_rotational(device_name) != 0;
+
+		/*
+		 * allow mke2fs.conf settings to control whether the
+		 * raid stripe/stride is set.  The default is to
+		 * always set the stride regardless of whether the
+		 * storage device is using HDD's or SSD's.  But only
+		 * set the stripe size if the storage device is
+		 * HDD-based by default because the ext4's block
+		 * allocator is very inefficient and especially for
+		 * SSD-based RAID arrays, trying to do raid-aligned
+		 * allocations is not worth it.
+		 */
+		tmp = get_string_from_profile(fs_types, "set_raid_stride",
+					      "always");
+		if (tmp && *tmp) {
+			if ((strcmp(tmp, "never") == 0) ||
+			    ((strcmp(tmp, "always") != 0) && !rotational))
+				set_stride = 0;
+		}
+		free(tmp);
+
+		tmp = get_string_from_profile(fs_types, "set_raid_stripe",
+					      "disk");
+		if (tmp && *tmp) {
+			if ((strcmp(tmp, "never") == 0) ||
+			    ((strcmp(tmp, "always") != 0) && !rotational))
+				set_stripe = 0;
+		}
+		free(tmp);
+
+		if (set_stride)
 			fs_param.s_raid_stride = dev_param.min_io / blocksize;
-		if (dev_param.opt_io > (unsigned) blocksize) {
+		if (set_stripe)
 			fs_param.s_raid_stripe_width =
 						dev_param.opt_io / blocksize;
-		}
 
 		if (dev_param.alignment_offset) {
 			printf(_("%s alignment is offset by %lu bytes.\n"),
diff --git a/misc/mke2fs.conf.5.in b/misc/mke2fs.conf.5.in
index 96dbfcbf8..629d1e1d0 100644
--- a/misc/mke2fs.conf.5.in
+++ b/misc/mke2fs.conf.5.in
@@ -438,6 +438,24 @@  This boolean relation specifies whether the
 .BR mke2fs (8)
 should attempt to discard device prior to file system creation.
 .TP
+.I set_raid_stride
+This relation specifies whether the file sytem's RAID stride size is set
+from the block device if available.  Valid values are:
+.IR always ,
+.IR disk ,
+.IR never .
+The default value is
+.IR always .
+.TP
+.I set_raid_stripe
+This relation specifies whether the file sytem's RAID stripe size is set
+from the block device if available.  Valid values are:
+.IR always ,
+.IR disk ,
+.IR never .
+The default value is
+.IR disk .
+.TP
 .I cluster_size
 This relation specifies the default cluster size if the bigalloc file
 system feature is enabled.  It can be overridden via the