diff mbox series

[v2] ext4: Replace value of xattrs in place

Message ID 20230510001409.14522-1-sunjunchao2870@gmail.com
State New
Headers show
Series [v2] ext4: Replace value of xattrs in place | expand

Commit Message

Julian Sun May 10, 2023, 12:14 a.m. UTC
When replacing the value of an xattr found in an ea_inode, currently
ext4 will evict the ea_inode that stores the old value, recreate an
ea_inode, and then write the new value into the new ea_inode.
This can be optimized by writing the new value into the old
ea_inode directly.

The logic for replacing value of xattrs without this patch
is as follows:
ext4_xattr_set_entry()
    ->ext4_xattr_inode_iget(&old_ea_inode)
    ->ext4_xattr_inode_lookup_create(&new_ea_inode)
    ->ext4_xattr_inode_dec_ref(old_ea_inode)
    ->iput(old_ea_inode)
        ->ext4_destroy_inode()
        ->ext4_evict_inode()
        ->ext4_free_inode()
    ->iput(new_ea_inode)

The logic with this patch is:
ext4_xattr_set_entry()
    ->ext4_xattr_inode_iget(&old_ea_inode)
    ->ext4_xattr_inode_write(old_ea_inode, new_value)
    ->iput(old_ea_inode)

This patch reduces the time it takes to replace xattrs in the ext4.
Without this patch, replacing the value of an xattr two million times takes
about 45 seconds on Intel(R) Xeon(R) CPU E5-2620 v3 platform.
With this patch, the same operation takes only 6 seconds.

  [root@client01 sjc]# ./mount.sh
  /dev/sdb1 contains a ext4 file system
      last mounted on /mnt/ext4 on Mon May  8 17:05:38 2023
  [root@client01 sjc]# touch /mnt/ext4/file1
  [root@client01 sjc]# gcc test.c
  [root@client01 sjc]# time ./a.out

  real    0m45.248s
  user    0m0.513s
  sys 0m39.231s

  [root@client01 sjc]# ./mount.sh
  /dev/sdb1 contains a ext4 file system
      last mounted on /mnt/ext4 on Mon May  8 17:08:20 2023
  [root@client01 sjc]# touch /mnt/ext4/file1
  [root@client01 sjc]# time ./a.out

  real    0m5.977s
  user    0m0.316s
  sys 0m5.659s

The test.c and mount.sh are in [1].
This patch passed the tests with xfstests using 'check -g quick'.

[1] https://gist.github.com/sjc2870/c923d7fa627d10ab65d6c305afb02cdb

Signed-off-by: JunChao Sun <sunjunchao2870@gmail.com>
---

Changes in v2:
  - Fix a problem when ref of an ea_inode not equal to 1
  - Link to v1: https://lore.kernel.org/linux-ext4/20230509011042.11781-1-sunjunchao2870@gmail.com/

 fs/ext4/xattr.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

Comments

Julian Sun May 23, 2023, 12:06 p.m. UTC | #1
Friendly ping...
Could anyone review this patch?

JunChao Sun <sunjunchao2870@gmail.com> 于2023年5月10日周三 08:14写道:
>
> When replacing the value of an xattr found in an ea_inode, currently
> ext4 will evict the ea_inode that stores the old value, recreate an
> ea_inode, and then write the new value into the new ea_inode.
> This can be optimized by writing the new value into the old
> ea_inode directly.
>
> The logic for replacing value of xattrs without this patch
> is as follows:
> ext4_xattr_set_entry()
>     ->ext4_xattr_inode_iget(&old_ea_inode)
>     ->ext4_xattr_inode_lookup_create(&new_ea_inode)
>     ->ext4_xattr_inode_dec_ref(old_ea_inode)
>     ->iput(old_ea_inode)
>         ->ext4_destroy_inode()
>         ->ext4_evict_inode()
>         ->ext4_free_inode()
>     ->iput(new_ea_inode)
>
> The logic with this patch is:
> ext4_xattr_set_entry()
>     ->ext4_xattr_inode_iget(&old_ea_inode)
>     ->ext4_xattr_inode_write(old_ea_inode, new_value)
>     ->iput(old_ea_inode)
>
> This patch reduces the time it takes to replace xattrs in the ext4.
> Without this patch, replacing the value of an xattr two million times takes
> about 45 seconds on Intel(R) Xeon(R) CPU E5-2620 v3 platform.
> With this patch, the same operation takes only 6 seconds.
>
>   [root@client01 sjc]# ./mount.sh
>   /dev/sdb1 contains a ext4 file system
>       last mounted on /mnt/ext4 on Mon May  8 17:05:38 2023
>   [root@client01 sjc]# touch /mnt/ext4/file1
>   [root@client01 sjc]# gcc test.c
>   [root@client01 sjc]# time ./a.out
>
>   real    0m45.248s
>   user    0m0.513s
>   sys 0m39.231s
>
>   [root@client01 sjc]# ./mount.sh
>   /dev/sdb1 contains a ext4 file system
>       last mounted on /mnt/ext4 on Mon May  8 17:08:20 2023
>   [root@client01 sjc]# touch /mnt/ext4/file1
>   [root@client01 sjc]# time ./a.out
>
>   real    0m5.977s
>   user    0m0.316s
>   sys 0m5.659s
>
> The test.c and mount.sh are in [1].
> This patch passed the tests with xfstests using 'check -g quick'.
>
> [1] https://gist.github.com/sjc2870/c923d7fa627d10ab65d6c305afb02cdb
>
> Signed-off-by: JunChao Sun <sunjunchao2870@gmail.com>
> ---
>
> Changes in v2:
>   - Fix a problem when ref of an ea_inode not equal to 1
>   - Link to v1: https://lore.kernel.org/linux-ext4/20230509011042.11781-1-sunjunchao2870@gmail.com/
>
>  fs/ext4/xattr.c | 36 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 36 insertions(+)
>
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index d57408cbe903..8f03958bfcc6 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1713,6 +1713,42 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
>                 }
>         }
>
> +       if (!s->not_found && i->value && here->e_value_inum && i->in_inode) {
> +               /* Replace xattr value in ea_inode in place */
> +               int size_diff = i->value_len - le32_to_cpu(here->e_value_size);
> +
> +               ret = ext4_xattr_inode_iget(inode,
> +                                               le32_to_cpu(here->e_value_inum),
> +                                               le32_to_cpu(here->e_hash),
> +                                               &old_ea_inode);
> +               if (ret) {
> +                       old_ea_inode = NULL;
> +                       goto out;
> +               }
> +               if (ext4_xattr_inode_get_ref(old_ea_inode) == 1) {
> +                       if (size_diff > 0)
> +                               ret = ext4_xattr_inode_alloc_quota(inode, size_diff);
> +                       else if (size_diff < 0)
> +                               ext4_xattr_inode_free_quota(inode, NULL, -size_diff);
> +                       if (ret)
> +                               goto out;
> +
> +                       ret = ext4_xattr_inode_write(handle, old_ea_inode, i->value, i->value_len);
> +                       if (ret) {
> +                               if (size_diff > 0)
> +                                       ext4_xattr_inode_free_quota(inode, NULL, size_diff);
> +                               else if (size_diff < 0)
> +                                       ret = ext4_xattr_inode_alloc_quota(inode, -size_diff);
> +                               goto out;
> +                       }
> +                       here->e_value_size = cpu_to_le32(i->value_len);
> +                       new_ea_inode = old_ea_inode;
> +                       old_ea_inode = NULL;
> +                       goto update_hash;
> +               } else
> +                       iput(old_ea_inode);
> +       }
> +
>         /*
>          * Getting access to old and new ea inodes is subject to failures.
>          * Finish that work before doing any modifications to the xattr data.
> --
> 1.8.3.1
>
Andreas Dilger July 20, 2023, 10:51 p.m. UTC | #2
On May 9, 2023, at 6:14 PM, JunChao Sun <sunjunchao2870@gmail.com> wrote:
> 
> When replacing the value of an xattr found in an ea_inode, currently
> ext4 will evict the ea_inode that stores the old value, recreate an
> ea_inode, and then write the new value into the new ea_inode.
> This can be optimized by writing the new value into the old
> ea_inode directly.

Sorry for the long delay in reviewing this patch.

The performance gains are nice, and reducing overhead is always good.

One question about this patch is whether it is safe to overwrite the
xattr in place if the system crashes during the overwrite?  That was
one of the reasons why the xattr update was implemented by writing to
a new xattr inode, and then atomically swapping xattr inode numbers.

However, if the xattr overwrite is done via journaled data writes then
it would be safe to "overwrite" the xattr data "in place", because
data will first be committed to the journal and then checkpointed into
the inode itself, so it should never be inconsistent/corrupted.

Did you also test cases where the xattr size is growing or shrinking
during the overwrite in place?  That should allocate or free blocks
in the xattr inode so that they are not wasted.

Cheers, Andreas

> The logic for replacing value of xattrs without this patch
> is as follows:
> ext4_xattr_set_entry()
>    ->ext4_xattr_inode_iget(&old_ea_inode)
>    ->ext4_xattr_inode_lookup_create(&new_ea_inode)
>    ->ext4_xattr_inode_dec_ref(old_ea_inode)
>    ->iput(old_ea_inode)
>        ->ext4_destroy_inode()
>        ->ext4_evict_inode()
>        ->ext4_free_inode()
>    ->iput(new_ea_inode)
> 
> The logic with this patch is:
> ext4_xattr_set_entry()
>    ->ext4_xattr_inode_iget(&old_ea_inode)
>    ->ext4_xattr_inode_write(old_ea_inode, new_value)
>    ->iput(old_ea_inode)
> 
> This patch reduces the time it takes to replace xattrs in the ext4.
> Without this patch, replacing the value of an xattr two million times takes
> about 45 seconds on Intel(R) Xeon(R) CPU E5-2620 v3 platform.
> With this patch, the same operation takes only 6 seconds.
> 
>  [root@client01 sjc]# ./mount.sh
>  /dev/sdb1 contains a ext4 file system
>      last mounted on /mnt/ext4 on Mon May  8 17:05:38 2023
>  [root@client01 sjc]# touch /mnt/ext4/file1
>  [root@client01 sjc]# gcc test.c
>  [root@client01 sjc]# time ./a.out
> 
>  real    0m45.248s
>  user    0m0.513s
>  sys 0m39.231s
> 
>  [root@client01 sjc]# ./mount.sh
>  /dev/sdb1 contains a ext4 file system
>      last mounted on /mnt/ext4 on Mon May  8 17:08:20 2023
>  [root@client01 sjc]# touch /mnt/ext4/file1
>  [root@client01 sjc]# time ./a.out
> 
>  real    0m5.977s
>  user    0m0.316s
>  sys 0m5.659s
> 
> The test.c and mount.sh are in [1].
> This patch passed the tests with xfstests using 'check -g quick'.
> 
> [1] https://gist.github.com/sjc2870/c923d7fa627d10ab65d6c305afb02cdb
> 
> Signed-off-by: JunChao Sun <sunjunchao2870@gmail.com>
> ---
> 
> Changes in v2:
>  - Fix a problem when ref of an ea_inode not equal to 1
>  - Link to v1: https://lore.kernel.org/linux-ext4/20230509011042.11781-1-sunjunchao2870@gmail.com/
> 
> fs/ext4/xattr.c | 36 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 36 insertions(+)
> 
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index d57408cbe903..8f03958bfcc6 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1713,6 +1713,42 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> 		}
> 	}
> 
> +	if (!s->not_found && i->value && here->e_value_inum && i->in_inode) {
> +		/* Replace xattr value in ea_inode in place */
> +		int size_diff = i->value_len - le32_to_cpu(here->e_value_size);
> +
> +		ret = ext4_xattr_inode_iget(inode,
> +						le32_to_cpu(here->e_value_inum),
> +						le32_to_cpu(here->e_hash),
> +						&old_ea_inode);
> +		if (ret) {
> +			old_ea_inode = NULL;
> +			goto out;
> +		}
> +		if (ext4_xattr_inode_get_ref(old_ea_inode) == 1) {
> +			if (size_diff > 0)
> +				ret = ext4_xattr_inode_alloc_quota(inode, size_diff);
> +			else if (size_diff < 0)
> +				ext4_xattr_inode_free_quota(inode, NULL, -size_diff);
> +			if (ret)
> +				goto out;
> +
> +			ret = ext4_xattr_inode_write(handle, old_ea_inode, i->value, i->value_len);
> +			if (ret) {
> +				if (size_diff > 0)
> +					ext4_xattr_inode_free_quota(inode, NULL, size_diff);
> +				else if (size_diff < 0)
> +					ret = ext4_xattr_inode_alloc_quota(inode, -size_diff);
> +				goto out;
> +			}
> +			here->e_value_size = cpu_to_le32(i->value_len);
> +			new_ea_inode = old_ea_inode;
> +			old_ea_inode = NULL;
> +			goto update_hash;
> +		} else
> +			iput(old_ea_inode);
> +	}
> +
> 	/*
> 	 * Getting access to old and new ea inodes is subject to failures.
> 	 * Finish that work before doing any modifications to the xattr data.
> --
> 1.8.3.1
> 


Cheers, Andreas
Julian Sun Oct. 18, 2023, 8:44 a.m. UTC | #3
Andreas Dilger <adilger@dilger.ca> 于2023年7月21日周五 06:50写道:
>
> On May 9, 2023, at 6:14 PM, JunChao Sun <sunjunchao2870@gmail.com> wrote:
> >
> > When replacing the value of an xattr found in an ea_inode, currently
> > ext4 will evict the ea_inode that stores the old value, recreate an
> > ea_inode, and then write the new value into the new ea_inode.
> > This can be optimized by writing the new value into the old
> > ea_inode directly.
>
> Sorry for the long delay in reviewing this patch.
>
> The performance gains are nice, and reducing overhead is always good.
>
>
> > One question about this patch is whether it is safe to overwrite the
> > xattr in place if the system crashes during the overwrite?  That was
> > one of the reasons why the xattr update was implemented by writing to
> > a new xattr inode, and then atomically swapping xattr inode numbers.
> >
> > However, if the xattr overwrite is done via journaled data writes then
> > it would be safe to "overwrite" the xattr data "in place", because
> > data will first be committed to the journal and then checkpointed into
> > the inode itself, so it should never be inconsistent/corrupted.
Thanks for your review. I'm sorry for taking so long to reply..
I checked the relevant code, and ensured that xattr overwriting is
done via journal, so it should be safe.
>
>
> > Did you also test cases where the xattr size is growing or shrinking
> > during the overwrite in place?  That should allocate or free blocks
> > in the xattr inode so that they are not wasted.
> >
Here is a bug in this patch, when xattr size is shrinking, blocks that
were previously allocated but need to be released now will not be
released. Here may need a ext4_truncate to release redundant blocks. I
will check ext4_truncate function and test relative cases, and then
send patch v3.

> Cheers, Andreas
>
> > The logic for replacing value of xattrs without this patch
> > is as follows:
> > ext4_xattr_set_entry()
> >    ->ext4_xattr_inode_iget(&old_ea_inode)
> >    ->ext4_xattr_inode_lookup_create(&new_ea_inode)
> >    ->ext4_xattr_inode_dec_ref(old_ea_inode)
> >    ->iput(old_ea_inode)
> >        ->ext4_destroy_inode()
> >        ->ext4_evict_inode()
> >        ->ext4_free_inode()
> >    ->iput(new_ea_inode)
> >
> > The logic with this patch is:
> > ext4_xattr_set_entry()
> >    ->ext4_xattr_inode_iget(&old_ea_inode)
> >    ->ext4_xattr_inode_write(old_ea_inode, new_value)
> >    ->iput(old_ea_inode)
> >
> > This patch reduces the time it takes to replace xattrs in the ext4.
> > Without this patch, replacing the value of an xattr two million times takes
> > about 45 seconds on Intel(R) Xeon(R) CPU E5-2620 v3 platform.
> > With this patch, the same operation takes only 6 seconds.
> >
> >  [root@client01 sjc]# ./mount.sh
> >  /dev/sdb1 contains a ext4 file system
> >      last mounted on /mnt/ext4 on Mon May  8 17:05:38 2023
> >  [root@client01 sjc]# touch /mnt/ext4/file1
> >  [root@client01 sjc]# gcc test.c
> >  [root@client01 sjc]# time ./a.out
> >
> >  real    0m45.248s
> >  user    0m0.513s
> >  sys 0m39.231s
> >
> >  [root@client01 sjc]# ./mount.sh
> >  /dev/sdb1 contains a ext4 file system
> >      last mounted on /mnt/ext4 on Mon May  8 17:08:20 2023
> >  [root@client01 sjc]# touch /mnt/ext4/file1
> >  [root@client01 sjc]# time ./a.out
> >
> >  real    0m5.977s
> >  user    0m0.316s
> >  sys 0m5.659s
> >
> > The test.c and mount.sh are in [1].
> > This patch passed the tests with xfstests using 'check -g quick'.
> >
> > [1] https://gist.github.com/sjc2870/c923d7fa627d10ab65d6c305afb02cdb
> >
> > Signed-off-by: JunChao Sun <sunjunchao2870@gmail.com>
> > ---
> >
> > Changes in v2:
> >  - Fix a problem when ref of an ea_inode not equal to 1
> >  - Link to v1: https://lore.kernel.org/linux-ext4/20230509011042.11781-1-sunjunchao2870@gmail.com/
> >
> > fs/ext4/xattr.c | 36 ++++++++++++++++++++++++++++++++++++
> > 1 file changed, 36 insertions(+)
> >
> > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> > index d57408cbe903..8f03958bfcc6 100644
> > --- a/fs/ext4/xattr.c
> > +++ b/fs/ext4/xattr.c
> > @@ -1713,6 +1713,42 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
> >               }
> >       }
> >
> > +     if (!s->not_found && i->value && here->e_value_inum && i->in_inode) {
> > +             /* Replace xattr value in ea_inode in place */
> > +             int size_diff = i->value_len - le32_to_cpu(here->e_value_size);
> > +
> > +             ret = ext4_xattr_inode_iget(inode,
> > +                                             le32_to_cpu(here->e_value_inum),
> > +                                             le32_to_cpu(here->e_hash),
> > +                                             &old_ea_inode);
> > +             if (ret) {
> > +                     old_ea_inode = NULL;
> > +                     goto out;
> > +             }
> > +             if (ext4_xattr_inode_get_ref(old_ea_inode) == 1) {
> > +                     if (size_diff > 0)
> > +                             ret = ext4_xattr_inode_alloc_quota(inode, size_diff);
> > +                     else if (size_diff < 0)
> > +                             ext4_xattr_inode_free_quota(inode, NULL, -size_diff);
> > +                     if (ret)
> > +                             goto out;
> > +
> > +                     ret = ext4_xattr_inode_write(handle, old_ea_inode, i->value, i->value_len);
> > +                     if (ret) {
> > +                             if (size_diff > 0)
> > +                                     ext4_xattr_inode_free_quota(inode, NULL, size_diff);
> > +                             else if (size_diff < 0)
> > +                                     ret = ext4_xattr_inode_alloc_quota(inode, -size_diff);
> > +                             goto out;
> > +                     }
> > +                     here->e_value_size = cpu_to_le32(i->value_len);
> > +                     new_ea_inode = old_ea_inode;
> > +                     old_ea_inode = NULL;
> > +                     goto update_hash;
> > +             } else
> > +                     iput(old_ea_inode);
> > +     }
> > +
> >       /*
> >        * Getting access to old and new ea inodes is subject to failures.
> >        * Finish that work before doing any modifications to the xattr data.
> > --
> > 1.8.3.1
> >
>
>
> Cheers, Andreas
>
>
>
>
>
diff mbox series

Patch

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d57408cbe903..8f03958bfcc6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1713,6 +1713,42 @@  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 		}
 	}
 
+	if (!s->not_found && i->value && here->e_value_inum && i->in_inode) {
+		/* Replace xattr value in ea_inode in place */
+		int size_diff = i->value_len - le32_to_cpu(here->e_value_size);
+
+		ret = ext4_xattr_inode_iget(inode,
+						le32_to_cpu(here->e_value_inum),
+						le32_to_cpu(here->e_hash),
+						&old_ea_inode);
+		if (ret) {
+			old_ea_inode = NULL;
+			goto out;
+		}
+		if (ext4_xattr_inode_get_ref(old_ea_inode) == 1) {
+			if (size_diff > 0)
+				ret = ext4_xattr_inode_alloc_quota(inode, size_diff);
+			else if (size_diff < 0)
+				ext4_xattr_inode_free_quota(inode, NULL, -size_diff);
+			if (ret)
+				goto out;
+
+			ret = ext4_xattr_inode_write(handle, old_ea_inode, i->value, i->value_len);
+			if (ret) {
+				if (size_diff > 0)
+					ext4_xattr_inode_free_quota(inode, NULL, size_diff);
+				else if (size_diff < 0)
+					ret = ext4_xattr_inode_alloc_quota(inode, -size_diff);
+				goto out;
+			}
+			here->e_value_size = cpu_to_le32(i->value_len);
+			new_ea_inode = old_ea_inode;
+			old_ea_inode = NULL;
+			goto update_hash;
+		} else
+			iput(old_ea_inode);
+	}
+
 	/*
 	 * Getting access to old and new ea inodes is subject to failures.
 	 * Finish that work before doing any modifications to the xattr data.