diff mbox series

[08/12] swap,iomap: simplify iomap_swapfile_iter

Message ID 20260512053625.2950900-9-hch@lst.de
State Not Applicable
Headers show
Series [01/12] swap: remove the maxpages variable in sys_swapon | expand

Commit Message

Christoph Hellwig May 12, 2026, 5:35 a.m. UTC
add_swap_extent already coalesces multiple extents, no need to duplicate
that in the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/swapfile.c | 104 +++++++++++++-------------------------------
 1 file changed, 31 insertions(+), 73 deletions(-)

Comments

Damien Le Moal May 12, 2026, 7:31 a.m. UTC | #1
On 5/12/26 14:35, Christoph Hellwig wrote:
> add_swap_extent already coalesces multiple extents, no need to duplicate
> that in the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Darrick J. Wong May 12, 2026, 5:02 p.m. UTC | #2
On Tue, May 12, 2026 at 07:35:24AM +0200, Christoph Hellwig wrote:
> add_swap_extent already coalesces multiple extents, no need to duplicate
> that in the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

/me wishes he'd either noticed that add_swap_extent already had the
coalescing code or had documented why he implemented his own.

OH.  Now I remember why -- it's to handle contiguous mixed mappings
better.

Let's say that you have a 1k fsblock filesystem and 4k base pages.  You
fallocate an 8G swap file and then mkswap it.  The first mapping is a 1k
written mapping at offset 0 for the swap header, followed by an 8388607k
unwritten mapping at offset 3k.

The PAGE_SIZE rounding code in iomap_swapfile_add_extent will round the
end of that first mapping down to zero and ignore it.  The second
mapping will be treated as if it were a 8388604k mapping starting at
offset 4096.  Now the page counts are wrong and the swapon fails.

A more generic solution to this would be to change add_swap_extent to
take sector_t addr and length values and use them to construct a bitmap
representing contiguous physical space on the bdev, accounting of course
for PAGE_SIZE alignment.  Except for the swap header page, every other
contiguously set page-aligned region in the bitmap gets added to the
swap extent map.

You could then maximize the number of pages participating in swap even
for files with layouts that are truly egregiously bad.  But I elected
not to go there because the common case is fallocate getting contiguous
space.

But still, I'm not sure we want to drop the iomap accumulator in
fs/iomap/swapfile.c.

--D

> ---
>  fs/iomap/swapfile.c | 104 +++++++++++++-------------------------------
>  1 file changed, 31 insertions(+), 73 deletions(-)
> 
> diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
> index cf354fdfb7c3..a4e0ca462cc4 100644
> --- a/fs/iomap/swapfile.c
> +++ b/fs/iomap/swapfile.c
> @@ -6,57 +6,32 @@
>  #include <linux/iomap.h>
>  #include <linux/swap.h>
>  
> -/* Swapfile activation */
> -
> -struct iomap_swapfile_info {
> -	struct iomap iomap;		/* accumulated iomap */
> -	struct swap_info_struct *sis;
> -	unsigned long nr_pages;		/* number of pages collected */
> -	struct file *file;
> -};
> -
> -/*
> - * Collect physical extents for this swap file.  Physical extents reported to
> - * the swap code must be trimmed to align to a page boundary.  The logical
> - * offset within the file is irrelevant since the swapfile code maps logical
> - * page numbers of the swap device to the physical page-aligned extents.
> - */
> -static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
> -{
> -	struct iomap *iomap = &isi->iomap;
> -	uint64_t first_ppage;
> -	uint64_t next_ppage;
> -
> -	/*
> -	 * Round the start up and the end down so that the physical
> -	 * extent aligns to a page boundary.
> -	 */
> -	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
> -	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
> -			PAGE_SHIFT;
> -	return add_swap_extent(isi->sis, next_ppage - first_ppage, first_ppage);
> -}
> -
> -static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
> +static int iomap_swapfile_fail(struct file *file, const char *str)
>  {
>  	char *buf, *p = ERR_PTR(-ENOMEM);
>  
>  	buf = kmalloc(PATH_MAX, GFP_KERNEL);
>  	if (buf)
> -		p = file_path(isi->file, buf, PATH_MAX);
> +		p = file_path(file, buf, PATH_MAX);
>  	pr_err("swapon: file %s %s\n", IS_ERR(p) ? "<unknown>" : p, str);
>  	kfree(buf);
>  	return -EINVAL;
>  }
>  
>  /*
> - * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
> - * swap only cares about contiguous page-aligned physical extents and makes no
> - * distinction between written and unwritten extents.
> + * Report physical extents for this swap file.  Physical extents reported to the
> + * swap code must be trimmed to align to a page boundary.  The logical offset
> + * within the file is irrelevant since the swapfile code maps logical page
> + * numbers of the swap device to the physical page-aligned extents.
>   */
> -static int iomap_swapfile_iter(struct iomap_iter *iter,
> -		struct iomap *iomap, struct iomap_swapfile_info *isi)
> +static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file,
> +		struct swap_info_struct *sis)
>  {
> +	struct iomap *iomap = &iter->iomap;
> +	uint64_t first_ppage;
> +	uint64_t next_ppage;
> +	int error;
> +
>  	switch (iomap->type) {
>  	case IOMAP_MAPPED:
>  	case IOMAP_UNWRITTEN:
> @@ -64,35 +39,31 @@ static int iomap_swapfile_iter(struct iomap_iter *iter,
>  		break;
>  	case IOMAP_INLINE:
>  		/* No inline data. */
> -		return iomap_swapfile_fail(isi, "is inline");
> +		return iomap_swapfile_fail(file, "is inline");
>  	default:
> -		return iomap_swapfile_fail(isi, "has unallocated extents");
> +		return iomap_swapfile_fail(file, "has unallocated extents");
>  	}
>  
>  	/* No uncommitted metadata or shared blocks. */
>  	if (iomap->flags & IOMAP_F_DIRTY)
> -		return iomap_swapfile_fail(isi, "is not committed");
> +		return iomap_swapfile_fail(file, "is not committed");
>  	if (iomap->flags & IOMAP_F_SHARED)
> -		return iomap_swapfile_fail(isi, "has shared extents");
> +		return iomap_swapfile_fail(file, "has shared extents");
>  
>  	/* Only one bdev per swap file. */
> -	if (iomap->bdev != isi->sis->bdev)
> -		return iomap_swapfile_fail(isi, "outside the main device");
> -
> -	if (isi->iomap.length == 0) {
> -		/* No accumulated extent, so just store it. */
> -		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
> -	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
> -		/* Append this to the accumulated extent. */
> -		isi->iomap.length += iomap->length;
> -	} else {
> -		/* Otherwise, add the retained iomap and store this one. */
> -		int error = iomap_swapfile_add_extent(isi);
> -		if (error)
> -			return error;
> -		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
> -	}
> +	if (iomap->bdev != sis->bdev)
> +		return iomap_swapfile_fail(file, "outside the main device");
>  
> +	/*
> +	 * Round the start up and the end down so that the physical extent
> +	 * aligns to a page boundary.
> +	 */
> +	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
> +	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
> +			PAGE_SHIFT;
> +	error = add_swap_extent(sis, next_ppage - first_ppage, first_ppage);
> +	if (error)
> +		return error;
>  	return iomap_iter_advance_full(iter);
>  }
>  
> @@ -110,10 +81,6 @@ int iomap_swap_activate(struct file *file, struct swap_info_struct *sis,
>  		.len	= ALIGN_DOWN(i_size_read(inode), PAGE_SIZE),
>  		.flags	= IOMAP_REPORT,
>  	};
> -	struct iomap_swapfile_info isi = {
> -		.sis = sis,
> -		.file = file,
> -	};
>  	int ret;
>  
>  	/*
> @@ -125,16 +92,7 @@ int iomap_swap_activate(struct file *file, struct swap_info_struct *sis,
>  		return ret;
>  
>  	while ((ret = iomap_iter(&iter, ops)) > 0)
> -		iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
> -	if (ret < 0)
> -		return ret;
> -
> -	if (isi.iomap.length) {
> -		ret = iomap_swapfile_add_extent(&isi);
> -		if (ret)
> -			return ret;
> -	}
> -
> -	return 0;
> +		iter.status = iomap_swapfile_iter(&iter, file, sis);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(iomap_swap_activate);
> -- 
> 2.53.0
> 
>
Christoph Hellwig May 13, 2026, 6:56 a.m. UTC | #3
On Tue, May 12, 2026 at 10:02:04AM -0700, Darrick J. Wong wrote:
> OH.  Now I remember why -- it's to handle contiguous mixed mappings
> better.
> 
> Let's say that you have a 1k fsblock filesystem and 4k base pages.  You
> fallocate an 8G swap file and then mkswap it.  The first mapping is a 1k
> written mapping at offset 0 for the swap header, followed by an 8388607k
> unwritten mapping at offset 3k.
> 
> The PAGE_SIZE rounding code in iomap_swapfile_add_extent will round the
> end of that first mapping down to zero and ignore it.  The second
> mapping will be treated as if it were a 8388604k mapping starting at
> offset 4096.  Now the page counts are wrong and the swapon fails.

Do we care about this use case?  I guess you did as you implemented
his, but still?

> 
> A more generic solution to this would be to change add_swap_extent to
> take sector_t addr and length values and use them to construct a bitmap
> representing contiguous physical space on the bdev, accounting of course
> for PAGE_SIZE alignment.  Except for the swap header page, every other
> contiguously set page-aligned region in the bitmap gets added to the
> swap extent map.

You don't even need a bitmap, just do basically the same checks as
the iomap code when moving to a new swap extent after moving to use
the sector_t.  And it really should anyway, as the current abuse of
sector_t to store a disk offset in PAGE_SIZE units is pretty gross.
Darrick J. Wong May 13, 2026, 2:59 p.m. UTC | #4
On Wed, May 13, 2026 at 08:56:08AM +0200, Christoph Hellwig wrote:
> On Tue, May 12, 2026 at 10:02:04AM -0700, Darrick J. Wong wrote:
> > OH.  Now I remember why -- it's to handle contiguous mixed mappings
> > better.
> > 
> > Let's say that you have a 1k fsblock filesystem and 4k base pages.  You
> > fallocate an 8G swap file and then mkswap it.  The first mapping is a 1k
> > written mapping at offset 0 for the swap header, followed by an 8388607k
> > unwritten mapping at offset 3k.
> > 
> > The PAGE_SIZE rounding code in iomap_swapfile_add_extent will round the
> > end of that first mapping down to zero and ignore it.  The second
> > mapping will be treated as if it were a 8388604k mapping starting at
> > offset 4096.  Now the page counts are wrong and the swapon fails.
> 
> Do we care about this use case?  I guess you did as you implemented
> his, but still?

We do, because mkswap -F uses fallocate nowadays:

$ mkswap -s 4194304 -F a
Setting up swapspace version 1, size = 4 MiB (4190208 bytes)
no label, UUID=bc9746bf-e200-4944-927c-80d83872f1cb
$ filefrag -v a
Filesystem type is: 58465342
File size of a is 4194304 (1024 blocks of 4096 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..       0:  411383552.. 411383552:      1:            
   1:        1..    1023:  411383553.. 411384575:   1023:             last,unwritten,eof
a: 1 extent found

> > A more generic solution to this would be to change add_swap_extent to
> > take sector_t addr and length values and use them to construct a bitmap
> > representing contiguous physical space on the bdev, accounting of course
> > for PAGE_SIZE alignment.  Except for the swap header page, every other
> > contiguously set page-aligned region in the bitmap gets added to the
> > swap extent map.
> 
> You don't even need a bitmap, just do basically the same checks as
> the iomap code when moving to a new swap extent after moving to use
> the sector_t.  And it really should anyway, as the current abuse of
> sector_t to store a disk offset in PAGE_SIZE units is pretty gross.

Oh, I meant this to handle the particularly gross case where the fsblock
size is smaller than a base page, but there are a very large number of
file mappings that point to a physically contiguous extent but are not
in logical order:

{.offset=0, .length=1k, .addr=7},
{.offset=1, .length=1k, .addr=6},
{.offset=2, .length=1k, .addr=5},
{.offset=3, .length=1k, .addr=4},
{.offset=4, .length=1k, .addr=3},
{.offset=5, .length=1k, .addr=2},
{.offset=6, .length=1k, .addr=1},
{.offset=7, .length=1k, .addr=0},

That's two pages of swapfile, but with the current layout accumulation
code we "cannot" find either.

--D
diff mbox series

Patch

diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index cf354fdfb7c3..a4e0ca462cc4 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -6,57 +6,32 @@ 
 #include <linux/iomap.h>
 #include <linux/swap.h>
 
-/* Swapfile activation */
-
-struct iomap_swapfile_info {
-	struct iomap iomap;		/* accumulated iomap */
-	struct swap_info_struct *sis;
-	unsigned long nr_pages;		/* number of pages collected */
-	struct file *file;
-};
-
-/*
- * Collect physical extents for this swap file.  Physical extents reported to
- * the swap code must be trimmed to align to a page boundary.  The logical
- * offset within the file is irrelevant since the swapfile code maps logical
- * page numbers of the swap device to the physical page-aligned extents.
- */
-static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
-{
-	struct iomap *iomap = &isi->iomap;
-	uint64_t first_ppage;
-	uint64_t next_ppage;
-
-	/*
-	 * Round the start up and the end down so that the physical
-	 * extent aligns to a page boundary.
-	 */
-	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
-	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
-			PAGE_SHIFT;
-	return add_swap_extent(isi->sis, next_ppage - first_ppage, first_ppage);
-}
-
-static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
+static int iomap_swapfile_fail(struct file *file, const char *str)
 {
 	char *buf, *p = ERR_PTR(-ENOMEM);
 
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (buf)
-		p = file_path(isi->file, buf, PATH_MAX);
+		p = file_path(file, buf, PATH_MAX);
 	pr_err("swapon: file %s %s\n", IS_ERR(p) ? "<unknown>" : p, str);
 	kfree(buf);
 	return -EINVAL;
 }
 
 /*
- * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
- * swap only cares about contiguous page-aligned physical extents and makes no
- * distinction between written and unwritten extents.
+ * Report physical extents for this swap file.  Physical extents reported to the
+ * swap code must be trimmed to align to a page boundary.  The logical offset
+ * within the file is irrelevant since the swapfile code maps logical page
+ * numbers of the swap device to the physical page-aligned extents.
  */
-static int iomap_swapfile_iter(struct iomap_iter *iter,
-		struct iomap *iomap, struct iomap_swapfile_info *isi)
+static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file,
+		struct swap_info_struct *sis)
 {
+	struct iomap *iomap = &iter->iomap;
+	uint64_t first_ppage;
+	uint64_t next_ppage;
+	int error;
+
 	switch (iomap->type) {
 	case IOMAP_MAPPED:
 	case IOMAP_UNWRITTEN:
@@ -64,35 +39,31 @@  static int iomap_swapfile_iter(struct iomap_iter *iter,
 		break;
 	case IOMAP_INLINE:
 		/* No inline data. */
-		return iomap_swapfile_fail(isi, "is inline");
+		return iomap_swapfile_fail(file, "is inline");
 	default:
-		return iomap_swapfile_fail(isi, "has unallocated extents");
+		return iomap_swapfile_fail(file, "has unallocated extents");
 	}
 
 	/* No uncommitted metadata or shared blocks. */
 	if (iomap->flags & IOMAP_F_DIRTY)
-		return iomap_swapfile_fail(isi, "is not committed");
+		return iomap_swapfile_fail(file, "is not committed");
 	if (iomap->flags & IOMAP_F_SHARED)
-		return iomap_swapfile_fail(isi, "has shared extents");
+		return iomap_swapfile_fail(file, "has shared extents");
 
 	/* Only one bdev per swap file. */
-	if (iomap->bdev != isi->sis->bdev)
-		return iomap_swapfile_fail(isi, "outside the main device");
-
-	if (isi->iomap.length == 0) {
-		/* No accumulated extent, so just store it. */
-		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
-	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
-		/* Append this to the accumulated extent. */
-		isi->iomap.length += iomap->length;
-	} else {
-		/* Otherwise, add the retained iomap and store this one. */
-		int error = iomap_swapfile_add_extent(isi);
-		if (error)
-			return error;
-		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
-	}
+	if (iomap->bdev != sis->bdev)
+		return iomap_swapfile_fail(file, "outside the main device");
 
+	/*
+	 * Round the start up and the end down so that the physical extent
+	 * aligns to a page boundary.
+	 */
+	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+			PAGE_SHIFT;
+	error = add_swap_extent(sis, next_ppage - first_ppage, first_ppage);
+	if (error)
+		return error;
 	return iomap_iter_advance_full(iter);
 }
 
@@ -110,10 +81,6 @@  int iomap_swap_activate(struct file *file, struct swap_info_struct *sis,
 		.len	= ALIGN_DOWN(i_size_read(inode), PAGE_SIZE),
 		.flags	= IOMAP_REPORT,
 	};
-	struct iomap_swapfile_info isi = {
-		.sis = sis,
-		.file = file,
-	};
 	int ret;
 
 	/*
@@ -125,16 +92,7 @@  int iomap_swap_activate(struct file *file, struct swap_info_struct *sis,
 		return ret;
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
-	if (ret < 0)
-		return ret;
-
-	if (isi.iomap.length) {
-		ret = iomap_swapfile_add_extent(&isi);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
+		iter.status = iomap_swapfile_iter(&iter, file, sis);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_swap_activate);