diff mbox series

[02/10] mbcache: Add functions to delete entry if unused

Message ID 20220712105436.32204-2-jack@suse.cz
State Accepted
Headers show
Series ext4: Fix possible fs corruption due to xattr races | expand

Commit Message

Jan Kara July 12, 2022, 10:54 a.m. UTC
Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
it is unused and also add a function to wait for entry to become unused
- mb_cache_entry_wait_unused(). We do not share code between the two
deleting functions as one of them will go away soon.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++--
 include/linux/mbcache.h | 10 ++++++-
 2 files changed, 73 insertions(+), 3 deletions(-)

Comments

Ritesh Harjani (IBM) July 14, 2022, 12:15 p.m. UTC | #1
On 22/07/12 12:54PM, Jan Kara wrote:
> Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
> it is unused and also add a function to wait for entry to become unused
> - mb_cache_entry_wait_unused(). We do not share code between the two
> deleting function as one of them will go away soon.
>
> CC: stable@vger.kernel.org
> Fixes: 82939d7999df ("ext4: convert to mbcache2")
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++--
>  include/linux/mbcache.h | 10 ++++++-
>  2 files changed, 73 insertions(+), 3 deletions(-)
>
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index cfc28129fb6f..2010bc80a3f2 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -11,7 +11,7 @@
>  /*
>   * Mbcache is a simple key-value store. Keys need not be unique, however
>   * key-value pairs are expected to be unique (we use this fact in
> - * mb_cache_entry_delete()).
> + * mb_cache_entry_delete_or_get()).
>   *
>   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
>   * Ext4 also uses it for deduplication of xattr values stored in inodes.
> @@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
>  }
>  EXPORT_SYMBOL(__mb_cache_entry_free);
>
> +/*
> + * mb_cache_entry_wait_unused - wait to be the last user of the entry
> + *
> + * @entry - entry to work on
> + *
> + * Wait to be the last user of the entry.
> + */
> +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
> +{
> +	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);

It's not very intuitive why we check for refcnt <= 3.
A small note at the top of this function might be helpful.
IIUC, it is because by default when anyone creates an entry we start with
a refcnt of 2 (in mb_cache_entry_create()).
- Now when the user of the entry wants to delete this, it will try and call
  mb_cache_entry_delete_or_get(). If during this function call it sees that the
  refcnt is elevated more than 2, that means there is another user of this entry
  currently active and hence we should wait before we remove this entry from the
  cache. So it will take an extra refcnt and return.
- So then this caller will call mb_cache_entry_wait_unused() for the refcnt to
  be <= 3, so that the entry can be deleted.

Quick question -
So is the design now that ext4_evict_ea_inode() will wait indefinitely
until the other user of this mb_cache entry releases its reference?
And that will not happen until
- either the shrinker removes this entry from the cache, during which we
  check if the refcnt <= 3 and then issue a wakeup event,
- or the user removes/deletes the xattr entry.
Is the above understanding correct?

-ritesh


> +}
> +EXPORT_SYMBOL(mb_cache_entry_wait_unused);
> +
>  static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
>  					   struct mb_cache_entry *entry,
>  					   u32 key)
> @@ -217,7 +230,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  }
>  EXPORT_SYMBOL(mb_cache_entry_get);
>
> -/* mb_cache_entry_delete - remove a cache entry
> +/* mb_cache_entry_delete - try to remove a cache entry
>   * @cache - cache we work with
>   * @key - key
>   * @value - value
> @@ -254,6 +267,55 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
>  }
>  EXPORT_SYMBOL(mb_cache_entry_delete);
>
> +/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
> + * @cache - cache we work with
> + * @key - key
> + * @value - value
> + *
> + * Remove entry from cache @cache with key @key and value @value. The removal
> + * happens only if the entry is unused. The function returns NULL in case the
> + * entry was successfully removed or there's no entry in cache. Otherwise the
> + * function grabs reference of the entry that we failed to delete because it
> + * still has users and return it.
> + */
> +struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
> +						    u32 key, u64 value)
> +{
> +	struct hlist_bl_node *node;
> +	struct hlist_bl_head *head;
> +	struct mb_cache_entry *entry;
> +
> +	head = mb_cache_entry_head(cache, key);
> +	hlist_bl_lock(head);
> +	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> +		if (entry->e_key == key && entry->e_value == value) {
> +			if (atomic_read(&entry->e_refcnt) > 2) {
> +				atomic_inc(&entry->e_refcnt);
> +				hlist_bl_unlock(head);
> +				return entry;
> +			}
> +			/* We keep hash list reference to keep entry alive */
> +			hlist_bl_del_init(&entry->e_hash_list);
> +			hlist_bl_unlock(head);
> +			spin_lock(&cache->c_list_lock);
> +			if (!list_empty(&entry->e_list)) {
> +				list_del_init(&entry->e_list);
> +				if (!WARN_ONCE(cache->c_entry_count == 0,
> +		"mbcache: attempt to decrement c_entry_count past zero"))
> +					cache->c_entry_count--;
> +				atomic_dec(&entry->e_refcnt);
> +			}
> +			spin_unlock(&cache->c_list_lock);
> +			mb_cache_entry_put(cache, entry);
> +			return NULL;
> +		}
> +	}
> +	hlist_bl_unlock(head);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
> +
>  /* mb_cache_entry_touch - cache entry got used
>   * @cache - cache the entry belongs to
>   * @entry - entry that got used
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 20f1e3ff6013..8eca7f25c432 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
>  			  u64 value, bool reusable);
>  void __mb_cache_entry_free(struct mb_cache_entry *entry);
> +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
>  static inline int mb_cache_entry_put(struct mb_cache *cache,
>  				     struct mb_cache_entry *entry)
>  {
> -	if (!atomic_dec_and_test(&entry->e_refcnt))
> +	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
> +
> +	if (cnt > 0) {
> +		if (cnt <= 3)
> +			wake_up_var(&entry->e_refcnt);
>  		return 0;
> +	}
>  	__mb_cache_entry_free(entry);
>  	return 1;
>  }
>
> +struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
> +						    u32 key, u64 value);
>  void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  					  u64 value);
> --
> 2.35.3
>
Jan Kara July 14, 2022, 2:49 p.m. UTC | #2
On Thu 14-07-22 17:45:32, Ritesh Harjani wrote:
> On 22/07/12 12:54PM, Jan Kara wrote:
> > Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
> > it is unused and also add a function to wait for entry to become unused
> > - mb_cache_entry_wait_unused(). We do not share code between the two
> > deleting function as one of them will go away soon.
> >
> > CC: stable@vger.kernel.org
> > Fixes: 82939d7999df ("ext4: convert to mbcache2")
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++--
> >  include/linux/mbcache.h | 10 ++++++-
> >  2 files changed, 73 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/mbcache.c b/fs/mbcache.c
> > index cfc28129fb6f..2010bc80a3f2 100644
> > --- a/fs/mbcache.c
> > +++ b/fs/mbcache.c
> > @@ -11,7 +11,7 @@
> >  /*
> >   * Mbcache is a simple key-value store. Keys need not be unique, however
> >   * key-value pairs are expected to be unique (we use this fact in
> > - * mb_cache_entry_delete()).
> > + * mb_cache_entry_delete_or_get()).
> >   *
> >   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> >   * Ext4 also uses it for deduplication of xattr values stored in inodes.
> > @@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
> >  }
> >  EXPORT_SYMBOL(__mb_cache_entry_free);
> >
> > +/*
> > + * mb_cache_entry_wait_unused - wait to be the last user of the entry
> > + *
> > + * @entry - entry to work on
> > + *
> > + * Wait to be the last user of the entry.
> > + */
> > +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
> > +{
> > +	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
> 
> It's not very intuitive of why we check for refcnt <= 3.
> A small note at top of this function might be helpful.
> IIUC, it is because by default when anyone creates an entry we start with
> a refcnt of 2 (in mb_cache_entry_create.
> - Now when the user of the entry wants to delete this, it will try and call
>   mb_cache_entry_delete_or_get(). If during this function call it sees that the
>   refcnt is elevated more than 2, that means there is another user of this entry
>   currently active and hence we should wait before we remove this entry from the
>   cache. So it will take an extra refcnt and return.
> - So then this caller will call mb_cache_entry_wait_unused() for the refcnt to
>   be <= 3, so that the entry can be deleted.

Correct. I will add a comment as you suggest.

> Quick qn -
> So now is the design like, ext4_evict_ea_inode() will be waiting indefinitely
> until the other user of this mb_cache entry releases the reference right?

Correct. Similarly for ext4_xattr_release_block().

> And that will not happen until,
> - either the shrinker removes this entry from the cache during which we are
>   checking if the refcnt <= 3, then we call a wakeup event

No, shrinker will not touch these entries with active users anymore.

> - Or the user removes/deletes the xattr entry

No. We hold a reference to the mbcache entry only while we are trying to
reuse it. So functions ext4_xattr_block_cache_find() and
ext4_xattr_inode_cache_find() will look up a potential mbcache entry that may
have the same contents and get a reference to it. Then we do comparisons
verifying whether the contents really match; if yes, we increment the on-disk
inode/block refcount. Then we drop the mbcache entry reference, which unblocks
waiters in mb_cache_entry_wait_unused().

								Honza
Ritesh Harjani (IBM) July 14, 2022, 3 p.m. UTC | #3
On 22/07/14 04:49PM, Jan Kara wrote:
> On Thu 14-07-22 17:45:32, Ritesh Harjani wrote:
> > On 22/07/12 12:54PM, Jan Kara wrote:
> > > Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
> > > it is unused and also add a function to wait for entry to become unused
> > > - mb_cache_entry_wait_unused(). We do not share code between the two
> > > deleting function as one of them will go away soon.
> > >
> > > CC: stable@vger.kernel.org
> > > Fixes: 82939d7999df ("ext4: convert to mbcache2")
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++--
> > >  include/linux/mbcache.h | 10 ++++++-
> > >  2 files changed, 73 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/mbcache.c b/fs/mbcache.c
> > > index cfc28129fb6f..2010bc80a3f2 100644
> > > --- a/fs/mbcache.c
> > > +++ b/fs/mbcache.c
> > > @@ -11,7 +11,7 @@
> > >  /*
> > >   * Mbcache is a simple key-value store. Keys need not be unique, however
> > >   * key-value pairs are expected to be unique (we use this fact in
> > > - * mb_cache_entry_delete()).
> > > + * mb_cache_entry_delete_or_get()).
> > >   *
> > >   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
> > >   * Ext4 also uses it for deduplication of xattr values stored in inodes.
> > > @@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
> > >  }
> > >  EXPORT_SYMBOL(__mb_cache_entry_free);
> > >
> > > +/*
> > > + * mb_cache_entry_wait_unused - wait to be the last user of the entry
> > > + *
> > > + * @entry - entry to work on
> > > + *
> > > + * Wait to be the last user of the entry.
> > > + */
> > > +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
> > > +{
> > > +	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
> >
> > It's not very intuitive of why we check for refcnt <= 3.
> > A small note at top of this function might be helpful.
> > IIUC, it is because by default when anyone creates an entry we start with
> > a refcnt of 2 (in mb_cache_entry_create.
> > - Now when the user of the entry wants to delete this, it will try and call
> >   mb_cache_entry_delete_or_get(). If during this function call it sees that the
> >   refcnt is elevated more than 2, that means there is another user of this entry
> >   currently active and hence we should wait before we remove this entry from the
> >   cache. So it will take an extra refcnt and return.
> > - So then this caller will call mb_cache_entry_wait_unused() for the refcnt to
> >   be <= 3, so that the entry can be deleted.
>
> Correct. I will add a comment as you suggest.
>
> > Quick qn -
> > So now is the design like, ext4_evict_ea_inode() will be waiting indefinitely
> > until the other user of this mb_cache entry releases the reference right?
>
> Correct. Similarly for ext4_xattr_release_block().
>
> > And that will not happen until,
> > - either the shrinker removes this entry from the cache during which we are
> >   checking if the refcnt <= 3, then we call a wakeup event
>
> No, shrinker will not touch these entries with active users anymore.
>
> > - Or the user removes/deletes the xattr entry
>
> No. We hold reference to mbcache entry only while we are trying to reuse
> it. So functions ext4_xattr_block_cache_find() and
> ext4_xattr_inode_cache_find() will lookup potential mbcache entry that may
> have the same contents and get reference to it. Then we do comparisons
> verifying whether the contents really matches, if yes, we increment on-disk
> inode/block refcount. Then we drop mbcache entry reference which unblocks
> waiters in mb_cache_entry_wait_unused().
>

Ok, yes. This is where I was a bit confused.
Thanks for explaining it. This makes more sense now. I did go through the
mbcache implementation, but I was missing the info on how the callers use it.

-ritesh

> 								Honza
>
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
diff mbox series

Patch

diff --git a/fs/mbcache.c b/fs/mbcache.c
index cfc28129fb6f..2010bc80a3f2 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@ 
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -125,6 +125,19 @@  void __mb_cache_entry_free(struct mb_cache_entry *entry)
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
 
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
 static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 					   struct mb_cache_entry *entry,
 					   u32 key)
@@ -217,7 +230,7 @@  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete - try to remove a cache entry
  * @cache - cache we work with
  * @key - key
  * @value - value
@@ -254,6 +267,55 @@  void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
 }
 EXPORT_SYMBOL(mb_cache_entry_delete);
 
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
+ * @cache - cache we work with
+ * @key - key
+ * @value - value
+ *
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
+ */
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb_cache_entry *entry;
+
+	head = mb_cache_entry_head(cache, key);
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_value == value) {
+			if (atomic_read(&entry->e_refcnt) > 2) {
+				atomic_inc(&entry->e_refcnt);
+				hlist_bl_unlock(head);
+				return entry;
+			}
+			/* We keep hash list reference to keep entry alive */
+			hlist_bl_del_init(&entry->e_hash_list);
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_list_lock);
+			if (!list_empty(&entry->e_list)) {
+				list_del_init(&entry->e_list);
+				if (!WARN_ONCE(cache->c_entry_count == 0,
+		"mbcache: attempt to decrement c_entry_count past zero"))
+					cache->c_entry_count--;
+				atomic_dec(&entry->e_refcnt);
+			}
+			spin_unlock(&cache->c_list_lock);
+			mb_cache_entry_put(cache, entry);
+			return NULL;
+		}
+	}
+	hlist_bl_unlock(head);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
+
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
  * @entry - entry that got used
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 20f1e3ff6013..8eca7f25c432 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -30,15 +30,23 @@  void mb_cache_destroy(struct mb_cache *cache);
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
 {
-	if (!atomic_dec_and_test(&entry->e_refcnt))
+	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
+
+	if (cnt > 0) {
+		if (cnt <= 3)
+			wake_up_var(&entry->e_refcnt);
 		return 0;
+	}
 	__mb_cache_entry_free(entry);
 	return 1;
 }
 
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value);
 void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);