[4/4] Support for obtaining reduced view of a graded file

Message ID CAExFE6=3VcVmofT0zNDP_PYMZ0S+DcgJO=b4BaZH5dG_c_QkfQ@mail.gmail.com
State Rejected, archived
Headers show
Series
  • RFC : Support for data gradation of a single file.
Related show

Commit Message

Sayan Ghosh April 6, 2018, 11:42 a.m.
In this patch, we change the newly introduced page fault handler to
provide a reduced view of the file that contains only the high graded
blocks. An example is accessing only the annotated part of an indexed
video, as described in the overall patch description. Whether to give
a reduced view of the file is controlled by an extended attribute,
“read_high”, which the user has to set from user space before reading
the file. We make further changes in dax.c so that we can skip over
the lower graded blocks.
In dax.c the new functions introduced are __skip_dax_fault() and
skip_dax_insert_mapping().
The patch is on top of Linux Kernel 4.7.2.

Signed-off-by: Sayan Ghosh <sgdgp.2014@gmail.com>
---
 fs/dax.c       | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h |   1 +
 fs/ext4/file.c |  79 +++++++++++++++++++++++++-------
 3 files changed, 203 insertions(+), 16 deletions(-)

  * gets called only when /all/ the files are closed.
@@ -349,22 +363,55 @@ static int graded_ext4_fault(struct
vm_area_struct *vma, struct vm_fault *vmf){
             }
         }
         else{
-            /*
-             * Here the higher graded blocks are redirected via DAX path
-             * since we consider Persistent Memory as higher tier.
-             *
-             * ** TODO **
-             * To take care of the case when the higher tier is not
-             * persistent memory (can be HDD-SSD combination), a check
-             * of the same needs to be provided before re-direction.
-             */
-            unsigned long long temp;
-            if(find_grade(grade_array,total,block,&temp) == 1){
-                result = __dax_fault(vma, vmf, ext4_dax_get_block);
-            }
-            else if(find_grade(grade_array,total,block,&temp) == 0){
-                result = ext4_filemap_fault(vma,vmf);
-            }
+            /*
+             * If read_high is enabled then read the higher
+             * grade blocks only.
+             * It uses a modified dax_fault handler with
+             * the assumption that high grade blocks are
+             * in Persistent Memory.
+             *
+             * ** TODO 1**
+             * To take care when high grade blocks are allocated elsewhere.
+             * Checking of allocated space of each high graded block needs
+             * to be done.
+             *
+             * ** TODO 2**
+             * Modifying vmf according to the target_block in order to
+             * use the existing dax_fault handler needs to be done.
+             */
+            if(read_high(inode) == 1)
+            {
+                ext4_lblk_t target_block;
+                if(block >= total)
+                {
+                    goto out;
+                }
+                else{
+                    target_block = block;
+                    goto pm_fault_handler;
+                }
+            pm_fault_handler:
+                result = __skip_dax_fault(vma, vmf,
ext4_dax_get_block,target_block);
+            }
+            else
+            {
+                /*
+                 * Here the higher graded blocks are redirected via DAX path
+                 * since we consider Persistent Memory as higher tier.
+                 *
+                 * ** TODO **
+                 * To take care of the case when the higher tier is not
+                 * persistent memory (can be HDD-SSD combination), a check
+                 * of the same needs to be provided before re-direction.
+                 */
+                unsigned long long temp;
+                if(find_grade(grade_array,total,block,&temp) == 1){
+                    result = __dax_fault(vma, vmf, ext4_dax_get_block);
+                }
+                else if(find_grade(grade_array,total,block,&temp) == 0){
+                    result = ext4_filemap_fault(vma,vmf);
+                }
+            }
         }
     }
  out:
‌

Comments

Randy Dunlap April 6, 2018, 5:34 p.m. | #1
On 04/06/2018 04:42 AM, Sayan Ghosh wrote:
> The patch is on top of Linux Kernel 4.7.2.
> 
> Signed-off-by: Sayan Ghosh <sgdgp.2014@gmail.com>
> ---
>  fs/dax.c       | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/ext4.h |   1 +
>  fs/ext4/file.c |  79 +++++++++++++++++++++++++-------
>  3 files changed, 203 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index e207f8f..1930307 100755
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -793,6 +793,41 @@ int dax_writeback_mapping_range(struct
> address_space *mapping,
>  }
>  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> 
> +/*
> + * This function is a copy of dax_insert_mapping.
> + * It is called in skip_dax_fault_handler.
> + */
> +static int skip_dax_insert_mapping(struct address_space *mapping,
> +            struct buffer_head *bh, void **entryp,
> +            struct vm_area_struct *vma, struct vm_fault *vmf, sector_t blknum)
> +{
> +    unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +    struct inode *inode = mapping->host;
> +    struct block_device *bdev = bh->b_bdev;
> +    bdev->bd_inode->i_ino=mapping->host->i_ino;
> +    struct blk_dax_ctl dax = {
> +        .sector = to_sector(bh, mapping->host),
> +        .size = bh->b_size,
> +    };
> +    int error;
> +    sector_t block;
> +    void *ret;
> +    void *entry = *entryp;
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
> +    dax.sector = blknum << (mapping->host->i_blkbits - 9);
> +    if (dax_map_atomic(bdev, &dax) < 0){
> +        return PTR_ERR(dax.addr);
> +    }

Indentation size.

Use tabs instead of spaces.

> +    dax_unmap_atomic(bdev, &dax);
> +    ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
> +    if (IS_ERR(ret)){
> +        return PTR_ERR(ret);
> +    }
> +    *entryp = ret;
> +
> +    vm_insert_mixed(vma, vaddr, dax.pfn);
> +}
> +
>  static int dax_insert_mapping(struct address_space *mapping,
>              struct buffer_head *bh, void **entryp,
>              struct vm_area_struct *vma, struct vm_fault *vmf)
> @@ -915,6 +950,110 @@ int __dax_fault(struct vm_area_struct *vma,
> struct vm_fault *vmf,
>  }
>  EXPORT_SYMBOL(__dax_fault);
> 
> +/*
> + * This is the modified __dax_fault handler.
> + * Most of the code is copied from __dax_fault function.
> + * One more parameter is passed here, namely skip_dax.
> + */
> +int __skip_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +            get_block_t get_block,long skip_dax)
> +{
> +    struct file *file = vma->vm_file;
> +    struct address_space *mapping = file->f_mapping;
> +    struct inode *inode = mapping->host;
> +    void *entry;
> +    struct buffer_head bh;
> +    unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +    unsigned blkbits = inode->i_blkbits;
> +    sector_t block;
> +    sector_t corrected_sector,corrected_new_block;
> +    pgoff_t size;
> +    int error;
> +    int new_error;
> +    int major = 0;
> +
> +    /*
> +     * Check whether offset isn't beyond end of file now. Caller is supposed
> +     * to hold locks serializing us with truncate / punch hole so this is
> +     * a reliable test.
> +     */
> +    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +    if (vmf->pgoff >= size)
> +        return VM_FAULT_SIGBUS;
> +
> +    memset(&bh, 0, sizeof(bh));
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
> +    bh.b_bdev = inode->i_sb->s_bdev;
> +    bh.b_size = PAGE_SIZE;
> +
> +    entry = grab_mapping_entry(mapping, vmf->pgoff);
> +    if (IS_ERR(entry)) {
> +        error = PTR_ERR(entry);
> +        goto out;
> +    }
> +
> +    error = get_block(inode, block, &bh, 0);
> +    if (!error && (bh.b_size < PAGE_SIZE))
> +        error = -EIO;        /* fs corruption? */
> +    if (error){
> +        goto unlock_entry;
> +    }
> +
> +    if (vmf->cow_page) {
> +        struct page *new_page = vmf->cow_page;
> +        if (buffer_written(&bh))
> +            error = copy_user_bh(new_page, inode, &bh, vaddr);
> +        else
> +            clear_user_highpage(new_page, vaddr);
> +        if (error){
> +            goto unlock_entry;
> +        }
> +        if (!radix_tree_exceptional_entry(entry)) {
> +            vmf->page = entry;
> +            return VM_FAULT_LOCKED;
> +        }
> +        vmf->entry = entry;
> +        return VM_FAULT_DAX_LOCKED;
> +    }
> +
> +    if (!buffer_mapped(&bh)) {
> +        if (vmf->flags & FAULT_FLAG_WRITE) {
> +            error = get_block(inode, block, &bh, 1);
> +            count_vm_event(PGMAJFAULT);
> +            mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> +            major = VM_FAULT_MAJOR;
> +            if (!error && (bh.b_size < PAGE_SIZE))
> +                error = -EIO;
> +            if (error)
> +                goto unlock_entry;
> +        } else {
> +            goto out2;
> +        }
> +    }
> +
> +    /* Filesystem should not return unwritten buffers to us! */
> +    WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
> +out2:
> +    /* We take the new block here, the next higher
> +     * graded block
> +     */
> +    corrected_sector = skip_dax;
> +    new_error = get_block(inode, corrected_sector, &bh, 0);
> +    corrected_new_block = bh.b_blocknr;
> +    error = get_block(inode, block, &bh, 0);
> +    error = skip_dax_insert_mapping(mapping, &bh, &entry, vma, vmf,
> corrected_new_block);
> + unlock_entry:
> +    put_locked_mapping_entry(mapping, vmf->pgoff, entry);
> + out:
> +    if (error == -ENOMEM)
> +        return VM_FAULT_OOM | major;
> +    /* -EBUSY is fine, somebody else faulted on the same PTE */
> +    if ((error < 0) && (error != -EBUSY))
> +        return VM_FAULT_SIGBUS | major;
> +    return VM_FAULT_NOPAGE | major;
> +}
> +EXPORT_SYMBOL(__skip_dax_fault);
> +
>  /**
>   * dax_fault - handle a page fault on a DAX file
>   * @vma: The virtual memory area where the fault occurred
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 368cf53..5dafd52 100755
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -32,6 +32,20 @@
>  #include "acl.h"
> 
>  /*
> + * read_high() returns 0 or 1 depending whether we want to read all the file
> + * blocks or only high graded, respectively.
> + * It gets this information from the extended attribute set by user beforehand.
> + */
> +int read_high(struct inode *inode)
> +{
> +    const char *xattr_name = "read_high";
> +    int read_high = 0;
> +    int xattr_size = sizeof(int);
> +    xattr_size = ext4_xattr_get(inode,
> EXT4_XATTR_INDEX_USER,xattr_name, (void *)&read_high,xattr_size);

line too long.

> +    return read_high;
> +}
> +
> +/*
>   * Called when an inode is released. Note that this is different
>   * from ext4_file_open: open gets called at every open, but release
>   * gets called only when /all/ the files are closed.
> @@ -349,22 +363,55 @@ static int graded_ext4_fault(struct
> vm_area_struct *vma, struct vm_fault *vmf){
>              }
>          }
>          else{
> -            /*
> -             * Here the higher graded blocks are redirected via DAX path
> -             * since we consider Persistent Memory as higher tier.
> -             *
> -             * ** TODO **
> -             * To take care of the case when the higher tier is not
> -             * persistent memory (can be HDD-SSD combination), a check
> -             * of the same needs to be provided before re-direction.
> -             */
> -            unsigned long long temp;
> -            if(find_grade(grade_array,total,block,&temp) == 1){
> -                result = __dax_fault(vma, vmf, ext4_dax_get_block);
> -            }
> -            else if(find_grade(grade_array,total,block,&temp) == 0){
> -                result = ext4_filemap_fault(vma,vmf);
> -            }
> +            /*
> +             * If read_high is enabled then read the higher
> +             * grade blocks only.
> +             * It uses a modified dax_fault handler with
> +             * the assumption that high grade blocks are
> +             * in Persistent Memory.
> +             *
> +             * ** TODO 1**
> +             * To take care when high grade blocks are allocated elsewhere.
> +             * Checking of allocated space of each high graded block needs
> +             * to be done.
> +             *
> +             * ** TODO 2**
> +             * Modifying vmf according to the target_block in order to
> +             * use the existing dax_fault handler needs to be done.
> +             */
> +            if(read_high(inode) == 1)
> +            {

		if (read_high(inode) == 1) {

> +                ext4_lblk_t target_block;
> +                if(block >= total)

		if (block >= total) {

> +                {
> +                    goto out;
> +                }
> +                else{
> +                    target_block = block;
> +                    goto pm_fault_handler;
> +                }
> +            pm_fault_handler:
> +                result = __skip_dax_fault(vma, vmf,
> ext4_dax_get_block,target_block);
> +            }
> +            else
> +            {
> +                /*
> +                 * Here the higher graded blocks are redirected via DAX path
> +                 * since we consider Persistent Memory as higher tier.
> +                 *
> +                 * ** TODO **
> +                 * To take care of the case when the higher tier is not
> +                 * persistent memory (can be HDD-SSD combination), a check
> +                 * of the same needs to be provided before re-direction.
> +                 */
> +                unsigned long long temp;
> +                if(find_grade(grade_array,total,block,&temp) == 1){
> +                    result = __dax_fault(vma, vmf, ext4_dax_get_block);
> +                }
> +                else if(find_grade(grade_array,total,block,&temp) == 0){
> +                    result = ext4_filemap_fault(vma,vmf);
> +                }
> +            }
>          }
>      }
>   out:
> ‌
>

Patch

diff --git a/fs/dax.c b/fs/dax.c
index e207f8f..1930307 100755
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -793,6 +793,41 @@  int dax_writeback_mapping_range(struct
address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

+/*
+ * This function is a copy of dax_insert_mapping.
+ * It is called in skip_dax_fault_handler.
+ */
+static int skip_dax_insert_mapping(struct address_space *mapping,
+            struct buffer_head *bh, void **entryp,
+            struct vm_area_struct *vma, struct vm_fault *vmf, sector_t blknum)
+{
+    unsigned long vaddr = (unsigned long)vmf->virtual_address;
+    struct inode *inode = mapping->host;
+    struct block_device *bdev = bh->b_bdev;
+    bdev->bd_inode->i_ino=mapping->host->i_ino;
+    struct blk_dax_ctl dax = {
+        .sector = to_sector(bh, mapping->host),
+        .size = bh->b_size,
+    };
+    int error;
+    sector_t block;
+    void *ret;
+    void *entry = *entryp;
+    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
+    dax.sector = blknum << (mapping->host->i_blkbits - 9);
+    if (dax_map_atomic(bdev, &dax) < 0){
+        return PTR_ERR(dax.addr);
+    }
+    dax_unmap_atomic(bdev, &dax);
+    ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+    if (IS_ERR(ret)){
+        return PTR_ERR(ret);
+    }
+    *entryp = ret;
+
+    vm_insert_mixed(vma, vaddr, dax.pfn);
+}
+
 static int dax_insert_mapping(struct address_space *mapping,
             struct buffer_head *bh, void **entryp,
             struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -915,6 +950,110 @@  int __dax_fault(struct vm_area_struct *vma,
struct vm_fault *vmf,
 }
 EXPORT_SYMBOL(__dax_fault);

+/*
+ * This is the modified __dax_fault handler.
+ * Most of the code is copied from __dax_fault function.
+ * One more parameter is passed here, namely skip_dax.
+ */
+int __skip_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+            get_block_t get_block,long skip_dax)
+{
+    struct file *file = vma->vm_file;
+    struct address_space *mapping = file->f_mapping;
+    struct inode *inode = mapping->host;
+    void *entry;
+    struct buffer_head bh;
+    unsigned long vaddr = (unsigned long)vmf->virtual_address;
+    unsigned blkbits = inode->i_blkbits;
+    sector_t block;
+    sector_t corrected_sector,corrected_new_block;
+    pgoff_t size;
+    int error;
+    int new_error;
+    int major = 0;
+
+    /*
+     * Check whether offset isn't beyond end of file now. Caller is supposed
+     * to hold locks serializing us with truncate / punch hole so this is
+     * a reliable test.
+     */
+    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+    if (vmf->pgoff >= size)
+        return VM_FAULT_SIGBUS;
+
+    memset(&bh, 0, sizeof(bh));
+    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+    bh.b_bdev = inode->i_sb->s_bdev;
+    bh.b_size = PAGE_SIZE;
+
+    entry = grab_mapping_entry(mapping, vmf->pgoff);
+    if (IS_ERR(entry)) {
+        error = PTR_ERR(entry);
+        goto out;
+    }
+
+    error = get_block(inode, block, &bh, 0);
+    if (!error && (bh.b_size < PAGE_SIZE))
+        error = -EIO;        /* fs corruption? */
+    if (error){
+        goto unlock_entry;
+    }
+
+    if (vmf->cow_page) {
+        struct page *new_page = vmf->cow_page;
+        if (buffer_written(&bh))
+            error = copy_user_bh(new_page, inode, &bh, vaddr);
+        else
+            clear_user_highpage(new_page, vaddr);
+        if (error){
+            goto unlock_entry;
+        }
+        if (!radix_tree_exceptional_entry(entry)) {
+            vmf->page = entry;
+            return VM_FAULT_LOCKED;
+        }
+        vmf->entry = entry;
+        return VM_FAULT_DAX_LOCKED;
+    }
+
+    if (!buffer_mapped(&bh)) {
+        if (vmf->flags & FAULT_FLAG_WRITE) {
+            error = get_block(inode, block, &bh, 1);
+            count_vm_event(PGMAJFAULT);
+            mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+            major = VM_FAULT_MAJOR;
+            if (!error && (bh.b_size < PAGE_SIZE))
+                error = -EIO;
+            if (error)
+                goto unlock_entry;
+        } else {
+            goto out2;
+        }
+    }
+
+    /* Filesystem should not return unwritten buffers to us! */
+    WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+out2:
+    /* We take the new block here, the next higher
+     * graded block
+     */
+    corrected_sector = skip_dax;
+    new_error = get_block(inode, corrected_sector, &bh, 0);
+    corrected_new_block = bh.b_blocknr;
+    error = get_block(inode, block, &bh, 0);
+    error = skip_dax_insert_mapping(mapping, &bh, &entry, vma, vmf,
corrected_new_block);
+ unlock_entry:
+    put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+    if (error == -ENOMEM)
+        return VM_FAULT_OOM | major;
+    /* -EBUSY is fine, somebody else faulted on the same PTE */
+    if ((error < 0) && (error != -EBUSY))
+        return VM_FAULT_SIGBUS | major;
+    return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL(__skip_dax_fault);
+
 /**
  * dax_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c7d2eed..3cf44dd 100755
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3062,6 +3062,7 @@  extern const struct file_operations ext4_dir_operations;
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern int read_high(struct inode *inode);

 /* inline.c */
 extern int ext4_get_max_inline_size(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 368cf53..5dafd52 100755
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -32,6 +32,20 @@ 
 #include "acl.h"

 /*
+ * read_high() returns 0 or 1 depending whether we want to read all the file
+ * blocks or only high graded, respectively.
+ * It gets this information from the extended attribute set by user beforehand.
+ */
+int read_high(struct inode *inode)
+{
+    const char *xattr_name = "read_high";
+    int read_high = 0;
+    int xattr_size = sizeof(int);
+    xattr_size = ext4_xattr_get(inode,
EXT4_XATTR_INDEX_USER,xattr_name, (void *)&read_high,xattr_size);
+    return read_high;
+}
+
+/*
  * Called when an inode is released. Note that this is different
  * from ext4_file_open: open gets called at every open, but release