diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT2_FS) += ext2.o
-ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
+ext2-y := balloc.o cache.o dir.o file.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o trace.o
# For tracepoints to include our trace.h from tracepoint infrastructure
diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c
new file mode 100644
--- /dev/null
+++ b/fs/ext2/cache.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024 Oracle. All rights reserved.
+ */
+
+#include "ext2.h"
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/rhashtable.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+
+static const struct rhashtable_params buffer_cache_params = {
+ .key_len = sizeof(sector_t),
+ .key_offset = offsetof(struct ext2_buffer, b_block),
+ .head_offset = offsetof(struct ext2_buffer, b_rhash),
+ .automatic_shrinking = true,
+};
+
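+/*
+ * Insert @new_buf into the per-superblock cache. Returns the buffer that
+ * ended up in the cache: @new_buf itself, an existing buffer for the same
+ * block if another task won the insertion race, or an ERR_PTR on failure.
+ */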
+static struct ext2_buffer *insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+ struct rhashtable *buffer_cache = &sbi->buffer_cache;
+ struct ext2_buffer *old_buf;
+
+ spin_lock(&sbi->buffer_cache_lock);
+ old_buf = rhashtable_lookup_get_insert_fast(buffer_cache,
+ &new_buf->b_rhash, buffer_cache_params);
+ spin_unlock(&sbi->buffer_cache_lock);
+
+ if (old_buf)
+ return old_buf;
+
+ return new_buf;
+}
+
+static void buf_write_end_io(struct bio *bio)
+{
+	struct ext2_buffer *buf = bio->bi_private;
+
+	/* Keep the buffer dirty on I/O error so a later sync can retry it. */
+	if (!bio->bi_status)
+		clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
+	bio_put(bio);
+}
+
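+/* Synchronously read the buffer's block using an on-stack bio. */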
+static int submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf)
+{
+ struct bio_vec bio_vec;
+ struct bio bio;
+ sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
+
+ bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = sector;
+
+ __bio_add_page(&bio, buf->b_page, buf->b_size, 0);
+
+ return submit_bio_wait(&bio);
+}
+
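+/*
+ * Submit an asynchronous write of the buffer's block; the dirty bit is
+ * cleared from buf_write_end_io() once the write completes.
+ */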
+static void submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf)
+{
+ struct bio *bio;
+ sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
+
+ bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
+
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_end_io = buf_write_end_io;
+ bio->bi_private = buf;
+
+ __bio_add_page(bio, buf->b_page, buf->b_size, 0);
+
+ submit_bio(bio);
+}
+
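+/*
+ * Walk the cache and submit a write for every dirty buffer. Writes are
+ * only submitted here, not waited on; completion is handled in the bio
+ * end_io path.
+ */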
+int sync_buffers(struct super_block *sb)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+ struct rhashtable *buffer_cache = &sbi->buffer_cache;
+ struct rhashtable_iter iter;
+ struct ext2_buffer *buf;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ rhashtable_walk_enter(buffer_cache, &iter);
+ rhashtable_walk_start(&iter);
+ while ((buf = rhashtable_walk_next(&iter)) != NULL) {
+ if (IS_ERR(buf))
+ continue;
+		if (test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags))
+			submit_buffer_write(sb, buf);
+ }
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+ blk_finish_plug(&plug);
+
+ return 0;
+}
+
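+/* Look up a cached buffer by block number; returns NULL if none is cached. */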
+static struct ext2_buffer *lookup_buffer_cache(struct super_block *sb, sector_t block)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+ struct rhashtable *buffer_cache = &sbi->buffer_cache;
+
+	return rhashtable_lookup_fast(buffer_cache, &block, buffer_cache_params);
+}
+
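+/*
+ * Allocate a new buffer for @block backed by a freshly allocated page.
+ * The buffer starts with a single reference owned by the caller.
+ */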
+static int init_buffer(struct super_block *sb, sector_t block, struct ext2_buffer **buf_ptr)
+{
+ struct ext2_buffer *buf;
+
+	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ buf->b_block = block;
+ buf->b_size = sb->s_blocksize;
+ buf->b_flags = 0;
+
+ mutex_init(&buf->b_lock);
+ refcount_set(&buf->b_refcount, 1);
+
+	buf->b_page = alloc_page(GFP_KERNEL);
+	if (!buf->b_page) {
+		kfree(buf);
+		return -ENOMEM;
+	}
+
+ buf->b_data = page_address(buf->b_page);
+
+ *buf_ptr = buf;
+
+ return 0;
+}
+
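+/* Release the reference and the lock taken by get_buffer(). */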
+void put_buffer(struct ext2_buffer *buf)
+{
+ refcount_dec(&buf->b_refcount);
+ mutex_unlock(&buf->b_lock);
+}
+
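+/*
+ * Return the cached buffer for @block, creating (and, if @need_uptodate is
+ * set, reading) it first if necessary. On success the buffer is returned
+ * locked with an extra reference; release both with put_buffer().
+ */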
+struct ext2_buffer *get_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
+{
+ int err;
+ struct ext2_buffer *buf;
+ struct ext2_buffer *new_buf;
+
+ buf = lookup_buffer_cache(sb, block);
+
+ if (!buf) {
+ err = init_buffer(sb, block, &new_buf);
+ if (err)
+ return ERR_PTR(err);
+
+		if (need_uptodate) {
+			err = submit_buffer_read(sb, new_buf);
+			if (err) {
+				__free_page(new_buf->b_page);
+				kfree(new_buf);
+				return ERR_PTR(err);
+			}
+		}
+
+		buf = insert_buffer_cache(sb, new_buf);
+		if (buf != new_buf) {
+			/* Raced with another insertion, or rhashtable error. */
+			__free_page(new_buf->b_page);
+			kfree(new_buf);
+			if (IS_ERR(buf))
+				return buf;
+		}
+	}
+
+ mutex_lock(&buf->b_lock);
+ refcount_inc(&buf->b_refcount);
+
+ return buf;
+}
+
+void buffer_set_dirty(struct ext2_buffer *buf)
+{
+ set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
+}
+
+int init_buffer_cache(struct rhashtable *buffer_cache)
+{
+ return rhashtable_init(buffer_cache, &buffer_cache_params);
+}
+
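+/* Free a buffer at teardown, warning if dirty data is being discarded. */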
+static void destroy_buffer(void *ptr, void *arg)
+{
+ struct ext2_buffer *buf = ptr;
+
+ WARN_ON(test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags));
+ __free_page(buf->b_page);
+ kfree(buf);
+}
+
+void destroy_buffer_cache(struct rhashtable *buffer_cache)
+{
+ rhashtable_free_and_destroy(buffer_cache, destroy_buffer, NULL);
+}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -18,6 +18,7 @@
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/rhashtable.h>
/* XXX Here for now... not interested in restructing headers JUST now */
@@ -116,6 +117,8 @@ struct ext2_sb_info {
struct mb_cache *s_ea_block_cache;
struct dax_device *s_daxdev;
u64 s_dax_part_off;
+ struct rhashtable buffer_cache;
+ spinlock_t buffer_cache_lock;
};
static inline spinlock_t *
@@ -683,6 +686,24 @@ struct ext2_inode_info {
*/
#define EXT2_STATE_NEW 0x00000001 /* inode is newly created */
+/*
+ * ext2 buffer
+ */
+struct ext2_buffer {
+ sector_t b_block;
+ struct rhash_head b_rhash;
+ struct page *b_page;
+ size_t b_size;
+ char *b_data;
+ unsigned long b_flags;
+ refcount_t b_refcount;
+ struct mutex b_lock;
+};
+
+/*
+ * Buffer flags
+ */
+#define EXT2_BUF_DIRTY_BIT	0
/*
* Function prototypes
@@ -716,6 +737,14 @@ extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
extern void ext2_init_block_alloc_info(struct inode *);
extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
+/* cache.c */
+extern int init_buffer_cache(struct rhashtable *buffer_cache);
+extern void destroy_buffer_cache(struct rhashtable *buffer_cache);
+extern int sync_buffers(struct super_block *sb);
+extern struct ext2_buffer *get_buffer(struct super_block *sb, sector_t block,
+				      bool need_uptodate);
+extern void put_buffer(struct ext2_buffer *buf);
+extern void buffer_set_dirty(struct ext2_buffer *buf);
+
+
/* dir.c */
int ext2_add_link(struct dentry *, struct inode *);
int ext2_inode_by_name(struct inode *dir,
@@ -741,6 +770,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
void ext2_write_failed(struct address_space *mapping, loff_t to);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int ext2_get_block_bno(struct inode *inode, sector_t iblock, int create,
+			      u32 *bno, bool *mapped);
extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *);
extern int ext2_getattr (struct mnt_idmap *, const struct path *,
struct kstat *, u32, unsigned int);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -803,6 +803,26 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
}
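+
+/*
+ * Resolve logical block @iblock of @inode to an on-disk block number without
+ * exposing a buffer head to the caller. @mapped is set to false if the block
+ * is a hole.
+ */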
+int ext2_get_block_bno(struct inode *inode, sector_t iblock,
+ int create, u32 *bno, bool *mapped)
+{
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head tmp_bh;
+ int err;
+
+ tmp_bh.b_state = 0;
+ tmp_bh.b_size = sb->s_blocksize;
+
+ err = ext2_get_block(inode, iblock, &tmp_bh, 0);
+ if (err)
+ return err;
+
+ *mapped = buffer_mapped(&tmp_bh);
+ *bno = tmp_bh.b_blocknr;
+
+ return 0;
+}
+
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -152,6 +152,8 @@ static void ext2_put_super (struct super_block * sb)
ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
+ destroy_buffer_cache(&sbi->buffer_cache);
+
if (!sb_rdonly(sb)) {
struct ext2_super_block *es = sbi->s_es;
@@ -835,6 +837,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
NULL, NULL);
+ spin_lock_init(&sbi->buffer_cache_lock);
+ ret = init_buffer_cache(&sbi->buffer_cache);
+ if (ret) {
+ ext2_msg(sb, KERN_ERR, "error: unable to create buffer cache");
+ goto failed_sbi;
+ }
+
spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1278,6 +1287,8 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
*/
dquot_writeback_dquots(sb, -1);
+ sync_buffers(sb);
+
spin_lock(&sbi->s_lock);
if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
ext2_debug("setting valid to 0\n");
@@ -1491,9 +1502,10 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
int offset = off & (sb->s_blocksize - 1);
int tocopy;
size_t toread;
- struct buffer_head tmp_bh;
- struct buffer_head *bh;
loff_t i_size = i_size_read(inode);
+ struct ext2_buffer *buf;
+ u32 bno;
+ bool mapped;
if (off > i_size)
return 0;
@@ -1503,20 +1515,19 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
while (toread > 0) {
tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
- tmp_bh.b_state = 0;
- tmp_bh.b_size = sb->s_blocksize;
- err = ext2_get_block(inode, blk, &tmp_bh, 0);
+ err = ext2_get_block_bno(inode, blk, 0, &bno, &mapped);
if (err < 0)
return err;
- if (!buffer_mapped(&tmp_bh)) /* A hole? */
+ if (!mapped) /* A hole? */
memset(data, 0, tocopy);
else {
- bh = sb_bread(sb, tmp_bh.b_blocknr);
- if (!bh)
- return -EIO;
- memcpy(data, bh->b_data+offset, tocopy);
- brelse(bh);
+			buf = get_buffer(sb, bno, true);
+			if (IS_ERR(buf))
+				return PTR_ERR(buf);
+			memcpy(data, buf->b_data + offset, tocopy);
+			put_buffer(buf);
}
+
offset = 0;
toread -= tocopy;
data += tocopy;
@@ -1535,32 +1546,29 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
int offset = off & (sb->s_blocksize - 1);
int tocopy;
size_t towrite = len;
- struct buffer_head tmp_bh;
- struct buffer_head *bh;
+ struct ext2_buffer *buf;
+ u32 bno;
+ bool mapped;
while (towrite > 0) {
tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
- tmp_bh.b_state = 0;
- tmp_bh.b_size = sb->s_blocksize;
- err = ext2_get_block(inode, blk, &tmp_bh, 1);
+ err = ext2_get_block_bno(inode, blk, 1, &bno, &mapped);
if (err < 0)
goto out;
+
if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
- bh = sb_bread(sb, tmp_bh.b_blocknr);
+			buf = get_buffer(sb, bno, true);
else
- bh = sb_getblk(sb, tmp_bh.b_blocknr);
- if (unlikely(!bh)) {
- err = -EIO;
+			buf = get_buffer(sb, bno, false);
+ if (IS_ERR(buf)) {
+ err = PTR_ERR(buf);
goto out;
}
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- unlock_buffer(bh);
- brelse(bh);
+		memcpy(buf->b_data + offset, data, tocopy);
+ buffer_set_dirty(buf);
+ put_buffer(buf);
+
offset = 0;
towrite -= tocopy;
data += tocopy;
This patch removes the use of buffer heads from the quota read and write
paths. To do so, we implement a new buffer cache using an rhashtable. Each
buffer stores data from an associated block, and can be read or written as
needed. Ultimately, we want to completely remove buffer heads from ext2.
This patch serves as an example that can be applied to other parts of the
filesystem.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/ext2/Makefile |   2 +-
 fs/ext2/cache.c  | 195 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/ext2.h   |  30 ++++++++
 fs/ext2/inode.c  |  20 +++++
 fs/ext2/super.c  |  62 ++++++++-------
 5 files changed, 281 insertions(+), 28 deletions(-)
 create mode 100644 fs/ext2/cache.c
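A note for reviewers converting other buffer-head call sites: the pattern
mirrors the ext2_quota_write() hunk above. A minimal write-side sketch,
assuming the caller has already resolved the on-disk block number into bno
(sb, offset, src and len are placeholder variables, not code from this
patch):

	struct ext2_buffer *buf;

	buf = get_buffer(sb, bno, true);	/* returns locked, with a ref */
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	memcpy(buf->b_data + offset, src, len);
	buffer_set_dirty(buf);			/* picked up by sync_buffers() */
	put_buffer(buf);			/* drops b_lock and the ref */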