[3/4] Memory-mapped writing and reading of a graded file spanning over two media types

Message ID CAExFE6kRn+a5ShWNF6nxjHz2m2Z+oonMHrLm59s_rxZ+8QYY7Q@mail.gmail.com
State New
Headers show
Series
  • RFC : Support for data gradation of a single file.
Related show

Commit Message

Sayan Ghosh April 6, 2018, 11:41 a.m.
This patch adds memory-mapped write and read support for a single graded
file whose blocks stretch across two media types - Persistent Memory
(faster) and HDD (slower). Read and write page faults and page-mkwrite
requests are redirected to the appropriate tier, based on the grade of
the faulting block, by introducing modified versions of the existing
handlers. The new function pointers substitute the existing ones for
page fault handling and page_mkwrite. To access the data present in
persistent memory we can use the DAX path (to increase efficiency by
reducing access times). Thus the new handlers are inherently a mix of
the existing fault handlers for the DAX and non-DAX paths.
Changes are made in ext4_file_mmap(). The new functions introduced are
graded_ext4_fault() and graded_ext4_mkwrite().
The patch is on top of Linux Kernel 4.7.2.

Signed-off-by: Sayan Ghosh <sgdgp.2014@gmail.com>
---
 fs/ext4/file.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 174 insertions(+), 4 deletions(-)

+
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
     struct inode *inode = file->f_mapping->host;
@@ -310,11 +468,23 @@ static int ext4_file_mmap(struct file *file,
struct vm_area_struct *vma)
             return -ENOKEY;
     }
     file_accessed(file);
-    if (IS_DAX(file_inode(file))) {
-        vma->vm_ops = &ext4_dax_vm_ops;
+
+    /*
+     * For graded file new function pointers for
+     * fault and page write are assigned.
+     */
+    if(is_file_graded(file_inode(file))){
+        vma->vm_ops = &graded_ext4_vm_ops;
         vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
-    } else {
-        vma->vm_ops = &ext4_file_vm_ops;
+    }
+    else{
+        if (IS_DAX(file_inode(file))) {
+            vma->vm_ops = &ext4_dax_vm_ops;
+            vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+        }
+        else {
+            vma->vm_ops = &ext4_file_vm_ops;
+        }
     }
     return 0;
 }
‌

Comments

Randy Dunlap April 6, 2018, 5:25 p.m. | #1
On 04/06/2018 04:41 AM, Sayan Ghosh wrote:
> The patch is on top of Linux Kernel 4.7.2.
> 
> Signed-off-by: Sayan Ghosh <sgdgp.2014@gmail.com>
> ---
>  fs/ext4/file.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 174 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index df44c87..368cf53 100755
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -298,6 +298,164 @@ static const struct vm_operations_struct
> ext4_file_vm_ops = {
>      .page_mkwrite   = ext4_page_mkwrite,
>  };
> 
> +/*
> + * This function is the fault function for our case to
> + * redirect the high grade blocks through DAX path (since the
> + * higher tier we chose is Persistent Memory) and the lower
> + * grade blocks via the normal ext4 fault.
> + * Some parts of the code are copied from ext4_dax_fault.
> + * The parts where the redirection is done is added by additional
> + * comments.
> + */
> +static int graded_ext4_fault(struct vm_area_struct *vma, struct vm_fault *vmf){

Put '{' on separate line.

> +    int result;
> +    sector_t block;
> +    handle_t *handle = NULL;
> +    struct file *file = vma->vm_file;
> +    struct address_space *mapping = file->f_mapping;
> +    struct inode *inode = file_inode(vma->vm_file);
> +    struct super_block *sb = inode->i_sb;
> +    bool write = vmf->flags & FAULT_FLAG_WRITE;

Indentation. (many)

> +
> +    struct grade_struct *grade_array = NULL;
> +    unsigned long long total;
> +    if (is_file_graded(inode)){
> +        total = read_count_xattr(inode);
> +        grade_array = (struct grade_struct
> *)kmalloc(total*sizeof(struct grade_struct), GFP_USER);
> +        read_grade_xattr(inode,grade_array);
> +    }
> +
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
> +    if (write) {
> +        sb_start_pagefault(sb);
> +        file_update_time(vma->vm_file);
> +        down_read(&EXT4_I(inode)->i_mmap_sem);
> +        handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
> +                        EXT4_DATA_TRANS_BLOCKS(sb));
> +    } else
> +        down_read(&EXT4_I(inode)->i_mmap_sem);
> +
> +    if (IS_ERR(handle))
> +        result = VM_FAULT_SIGBUS;
> +    else
> +    {
> +        if(write){

		if (write) {

> +            unsigned long long temp;
> +            if(find_grade(grade_array,total,block,&temp) == 1){

		if (                                           1) {

> +                result = __dax_fault(vma, vmf, ext4_dax_get_block);
> +            }
> +            else if(find_grade(grade_array,total,block,&temp) == 0){

same.

> +                result = ext4_filemap_fault(vma,vmf);
> +            }
> +        }
> +        else{

	else {

> +            /*
> +             * Here the higher graded blocks are redirected via DAX path
> +             * since we consider Persistent Memory as higher tier.
> +             *
> +             * ** TODO **
> +             * To take care of the case when the higher tier is not
> +             * persistent memory (can be HDD-SSD combination), a check
> +             * of the same needs to be provided before re-direction.
> +             */
> +            unsigned long long temp;
> +            if(find_grade(grade_array,total,block,&temp) == 1){
> +                result = __dax_fault(vma, vmf, ext4_dax_get_block);
> +            }
> +            else if(find_grade(grade_array,total,block,&temp) == 0){
> +                result = ext4_filemap_fault(vma,vmf);
> +            }
> +        }
> +    }
> + out:
> +    if (write) {
> +        if (!IS_ERR(handle))
> +            ext4_journal_stop(handle);
> +        up_read(&EXT4_I(inode)->i_mmap_sem);
> +        sb_end_pagefault(sb);
> +    } else
> +        up_read(&EXT4_I(inode)->i_mmap_sem);
> +
> +    return result;
> +}
> +
> +/*
> + * This is the new page write function for our scenario.
> + * This also takes care of the grade and redirects
> + * through the correct path, DAX for higer tier
> + * (Persistent Memory) and ext4 path for lower tier.
> + * To take care of the cases when the higher tier
> + * is not Persistent Memory a TODO has been added
> + */
> +static int graded_ext4_mkwrite(struct vm_area_struct *vma, struct
> vm_fault *vmf){

{ on separate line.

> +    int result;
> +    sector_t block;
> +    handle_t *handle = NULL;
> +    struct file *file = vma->vm_file;
> +    struct address_space *mapping = file->f_mapping;
> +    struct inode *inode = file_inode(vma->vm_file);
> +    struct super_block *sb = inode->i_sb;
> +    bool write = vmf->flags & FAULT_FLAG_WRITE;
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
> +
> +    struct grade_struct *grade_array = NULL;
> +    unsigned long long total;
> +    if (is_file_graded(inode)){
> +        total = read_count_xattr(inode);
> +        grade_array = (struct grade_struct
> *)kmalloc(total*sizeof(struct grade_struct), GFP_USER);
> +        read_grade_xattr(inode,grade_array);
> +    }
> +
> +    if (write) {
> +        sb_start_pagefault(sb);
> +        file_update_time(vma->vm_file);
> +        down_read(&EXT4_I(inode)->i_mmap_sem);
> +        handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
> +                        EXT4_DATA_TRANS_BLOCKS(sb));
> +    } else
> +        down_read(&EXT4_I(inode)->i_mmap_sem);
> +
> +    if (IS_ERR(handle))
> +        result = VM_FAULT_SIGBUS;
> +    else{

	else {

> +          /*
> +         * Here the higher graded blocks are redirected via DAX path
> +         * since we consider Persistent Memory as higher tier.
> +         *
> +         * ** TODO **
> +         * To take care of the case when the higher tier is not
> +         * persistent memory (can be HDD-SSD combination), a check
> +         * of the same needs to be provided before re-direction.
> +         */
> +          unsigned long long temp;
> +        if(find_grade(grade_array,total,block,&temp)==1){
> +            result = __dax_fault(vma, vmf, ext4_dax_get_block);
> +        }
> +        else if(find_grade(grade_array,total,block,&temp)==0){
> +            filemap_map_pages(vma,vmf);
> +            result = ext4_page_mkwrite(vma,vmf);
> +        }
> +    }
> +    if (write) {
> +        if (!IS_ERR(handle))
> +            ext4_journal_stop(handle);
> +        up_read(&EXT4_I(inode)->i_mmap_sem);
> +        sb_end_pagefault(sb);
> +    } else
> +        up_read(&EXT4_I(inode)->i_mmap_sem);
> +
> +    return result;
> +}
> +
> +/*
> + * New function pointers for page fault handling and page writes.
> + */
> +static const struct vm_operations_struct graded_ext4_vm_ops = {
> +    .fault = graded_ext4_fault,
> +    .page_mkwrite = graded_ext4_mkwrite,
> +
> +};
> +
>  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>      struct inode *inode = file->f_mapping->host;
> @@ -310,11 +468,23 @@ static int ext4_file_mmap(struct file *file,
> struct vm_area_struct *vma)
>              return -ENOKEY;
>      }
>      file_accessed(file);
> -    if (IS_DAX(file_inode(file))) {
> -        vma->vm_ops = &ext4_dax_vm_ops;
> +
> +    /*
> +     * For graded file new function pointers for
> +     * fault and page write are assigned.
> +     */
> +    if(is_file_graded(file_inode(file))){

	if (                            ))) {

> +        vma->vm_ops = &graded_ext4_vm_ops;
>          vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> -    } else {
> -        vma->vm_ops = &ext4_file_vm_ops;
> +    }
> +    else{

	else {

> +        if (IS_DAX(file_inode(file))) {
> +            vma->vm_ops = &ext4_dax_vm_ops;
> +            vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> +        }
> +        else {
> +            vma->vm_ops = &ext4_file_vm_ops;
> +        }
>      }
>      return 0;
>  }
> ‌
>

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index df44c87..368cf53 100755
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -298,6 +298,164 @@  static const struct vm_operations_struct
ext4_file_vm_ops = {
     .page_mkwrite   = ext4_page_mkwrite,
 };

+/*
+ * graded_ext4_fault - page-fault handler for graded files
+ * @vma: VMA in which the fault occurred
+ * @vmf: description of the fault (page offset and flags)
+ *
+ * Looks up the grade of the faulting block and redirects the fault:
+ * grade 1 (the higher tier, which we chose to be Persistent Memory) goes
+ * through the DAX path, grade 0 goes through the normal ext4 fault.
+ * The locking and journalling around the redirection are copied from
+ * ext4_dax_fault.
+ *
+ * ** TODO **
+ * To take care of the case when the higher tier is not persistent memory
+ * (can be HDD-SSD combination), a check of the same needs to be provided
+ * before re-direction.
+ *
+ * Returns VM_FAULT_OOM if the grade table cannot be allocated,
+ * VM_FAULT_SIGBUS if the journal handle cannot be started, if the file is
+ * not graded, or if the block's grade is neither 0 nor 1, and otherwise
+ * the result of the selected fault path.
+ */
+static int graded_ext4_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	struct grade_struct *grade_array = NULL;
+	unsigned long long total = 0;
+	unsigned long long temp;	/* out-parameter of find_grade(), unused */
+	handle_t *handle = NULL;
+	sector_t block;
+	/* Stays SIGBUS unless one of the two fault paths is actually taken. */
+	int result = VM_FAULT_SIGBUS;
+
+	/*
+	 * This handler is only installed for graded files (see
+	 * ext4_file_mmap), but the check is kept so that the grade table is
+	 * never read for a file that has none.
+	 * NOTE(review): the return values of read_count_xattr() and
+	 * read_grade_xattr() are not checked here - confirm whether they can
+	 * fail.
+	 */
+	if (is_file_graded(inode)) {
+		total = read_count_xattr(inode);
+		grade_array = kmalloc_array(total, sizeof(*grade_array),
+					    GFP_KERNEL);
+		if (!grade_array)
+			return VM_FAULT_OOM;
+		read_grade_xattr(inode, grade_array);
+	}
+
+	/* Convert the faulting page offset into a filesystem block number. */
+	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
+
+	if (write) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
+		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+					       EXT4_DATA_TRANS_BLOCKS(sb));
+	} else {
+		down_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+
+	/*
+	 * Read and write faults are redirected identically; only the
+	 * journalling above and below differs between them.
+	 */
+	if (!IS_ERR(handle) && grade_array) {
+		switch (find_grade(grade_array, total, block, &temp)) {
+		case 1:
+			/* Higher tier: Persistent Memory, use the DAX path. */
+			result = __dax_fault(vma, vmf, ext4_dax_get_block);
+			break;
+		case 0:
+			/*
+			 * Lower tier: normal ext4 fault.
+			 * NOTE(review): i_mmap_sem is already held for read
+			 * here - confirm ext4_filemap_fault() does not take
+			 * it again.
+			 */
+			result = ext4_filemap_fault(vma, vmf);
+			break;
+		default:
+			/* Unknown grade: report SIGBUS. */
+			break;
+		}
+	}
+
+	if (write) {
+		if (!IS_ERR(handle))
+			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
+		sb_end_pagefault(sb);
+	} else {
+		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+
+	/* The grade table is only needed for this one lookup. */
+	kfree(grade_array);
+	return result;
+}
+
+/*
+ * graded_ext4_mkwrite - page_mkwrite handler for graded files
+ * @vma: VMA in which the write fault occurred
+ * @vmf: description of the fault (page offset and flags)
+ *
+ * This is the new page write function for our scenario.  It looks up the
+ * grade of the faulting block and redirects through the matching path:
+ * DAX for the higher tier (grade 1, Persistent Memory) and the ext4
+ * page_mkwrite path for the lower tier (grade 0).
+ *
+ * ** TODO **
+ * To take care of the case when the higher tier is not persistent memory
+ * (can be HDD-SSD combination), a check of the same needs to be provided
+ * before re-direction.
+ *
+ * Returns VM_FAULT_OOM if the grade table cannot be allocated,
+ * VM_FAULT_SIGBUS if the journal handle cannot be started, if the file is
+ * not graded, or if the block's grade is neither 0 nor 1, and otherwise
+ * the result of the selected path.
+ */
+static int graded_ext4_mkwrite(struct vm_area_struct *vma,
+			       struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	struct grade_struct *grade_array = NULL;
+	unsigned long long total = 0;
+	unsigned long long temp;	/* out-parameter of find_grade(), unused */
+	handle_t *handle = NULL;
+	sector_t block;
+	/* Stays SIGBUS unless one of the two paths is actually taken. */
+	int result = VM_FAULT_SIGBUS;
+
+	/* Convert the faulting page offset into a filesystem block number. */
+	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
+
+	/*
+	 * This handler is only installed for graded files (see
+	 * ext4_file_mmap), but the check is kept so that the grade table is
+	 * never read for a file that has none.
+	 * NOTE(review): the return values of read_count_xattr() and
+	 * read_grade_xattr() are not checked here - confirm whether they can
+	 * fail.
+	 */
+	if (is_file_graded(inode)) {
+		total = read_count_xattr(inode);
+		grade_array = kmalloc_array(total, sizeof(*grade_array),
+					    GFP_KERNEL);
+		if (!grade_array)
+			return VM_FAULT_OOM;
+		read_grade_xattr(inode, grade_array);
+	}
+
+	if (write) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
+		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+					       EXT4_DATA_TRANS_BLOCKS(sb));
+	} else {
+		down_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+
+	if (!IS_ERR(handle) && grade_array) {
+		switch (find_grade(grade_array, total, block, &temp)) {
+		case 1:
+			/* Higher tier: Persistent Memory, use the DAX path. */
+			result = __dax_fault(vma, vmf, ext4_dax_get_block);
+			break;
+		case 0:
+			/*
+			 * Lower tier: map the page, then run the ext4
+			 * page_mkwrite path.
+			 * NOTE(review): the freeze protection, i_mmap_sem and
+			 * a journal handle are already held here - confirm
+			 * ext4_page_mkwrite() may be called in that state.
+			 */
+			filemap_map_pages(vma, vmf);
+			result = ext4_page_mkwrite(vma, vmf);
+			break;
+		default:
+			/* Unknown grade: report SIGBUS. */
+			break;
+		}
+	}
+
+	if (write) {
+		if (!IS_ERR(handle))
+			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
+		sb_end_pagefault(sb);
+	} else {
+		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+
+	/* The grade table is only needed for this one lookup. */
+	kfree(grade_array);
+	return result;
+}
+
+/*
+ * VM operations installed by ext4_file_mmap() for graded files: both the
+ * fault hook and the page_mkwrite hook point at the grade-aware handlers.
+ */
+static const struct vm_operations_struct graded_ext4_vm_ops = {
+	.page_mkwrite	= graded_ext4_mkwrite,
+	.fault		= graded_ext4_fault,
+};