Patchwork [V8,4/8] mm/fs: add hooks to support cleancache

login
register
mail settings
Submitter Dan Magenheimer
Date April 14, 2011, 9:17 p.m.
Message ID <20110414211732.GA27761@ca-server1.us.oracle.com>
Download mbox | patch
Permalink /patch/91295/
State Not Applicable
Headers show

Comments

Dan Magenheimer - April 14, 2011, 9:17 p.m.
[PATCH V8 4/8] mm/fs: add hooks to support cleancache

This fourth patch of eight in this cleancache series provides the
core hooks in VFS for: initializing cleancache per filesystem;
capturing clean pages reclaimed by page cache; attempting to get
pages from cleancache before filesystem read; and ensuring coherency
between pagecache, disk, and cleancache.  Note that the placement
of these hooks was stable from 2.6.18 to 2.6.38; a minor semantic
change was required due to a patchset in 2.6.39.

All hooks become no-ops if CONFIG_CLEANCACHE is unset, or become
a check of a boolean global if CONFIG_CLEANCACHE is set but no
cleancache "backend" has claimed cleancache_ops.

Details and a FAQ can be found in Documentation/vm/cleancache.txt

[v8: minchan.kim@gmail.com: adapt to new remove_from_page_cache function]
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Jeremy Fitzhardinge <jeremy@goop.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik Van Riel <riel@redhat.com>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Nitin Gupta <ngupta@vflare.org>

---

Diffstat:
 fs/buffer.c                              |    5 +++++
 fs/mpage.c                               |    7 +++++++
 fs/super.c                               |    3 +++
 mm/filemap.c                             |   11 +++++++++++
 mm/truncate.c                            |    6 ++++++
 5 files changed, 32 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
MinChan Kim - April 14, 2011, 11:37 p.m.
Hi Dan,

On Fri, Apr 15, 2011 at 6:17 AM, Dan Magenheimer
<dan.magenheimer@oracle.com> wrote:
> [PATCH V8 4/8] mm/fs: add hooks to support cleancache
>
> This fourth patch of eight in this cleancache series provides the
> core hooks in VFS for: initializing cleancache per filesystem;
> capturing clean pages reclaimed by page cache; attempting to get
> pages from cleancache before filesystem read; and ensuring coherency
> between pagecache, disk, and cleancache.  Note that the placement
> of these hooks was stable from 2.6.18 to 2.6.38; a minor semantic
> change was required due to a patchset in 2.6.39.
>
> All hooks become no-ops if CONFIG_CLEANCACHE is unset, or become
> a check of a boolean global if CONFIG_CLEANCACHE is set but no
> cleancache "backend" has claimed cleancache_ops.
>
> Details and a FAQ can be found in Documentation/vm/cleancache.txt
>
> [v8: minchan.kim@gmail.com: adapt to new remove_from_page_cache function]
> Signed-off-by: Chris Mason <chris.mason@oracle.com>
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> Reviewed-by: Jeremy Fitzhardinge <jeremy@goop.org>
> Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Al Viro <viro@ZenIV.linux.org.uk>
> Cc: Matthew Wilcox <matthew@wil.cx>
> Cc: Nick Piggin <npiggin@kernel.dk>
> Cc: Mel Gorman <mel@csn.ul.ie>
> Cc: Rik Van Riel <riel@redhat.com>
> Cc: Jan Beulich <JBeulich@novell.com>
> Cc: Andreas Dilger <adilger@sun.com>
> Cc: Ted Ts'o <tytso@mit.edu>
> Cc: Mark Fasheh <mfasheh@suse.com>
> Cc: Joel Becker <joel.becker@oracle.com>
> Cc: Nitin Gupta <ngupta@vflare.org>
>
> ---
>
> Diffstat:
>  fs/buffer.c                              |    5 +++++
>  fs/mpage.c                               |    7 +++++++
>  fs/super.c                               |    3 +++
>  mm/filemap.c                             |   11 +++++++++++
>  mm/truncate.c                            |    6 ++++++
>  5 files changed, 32 insertions(+)
>
> --- linux-2.6.39-rc3/fs/super.c 2011-04-11 18:21:51.000000000 -0600
> +++ linux-2.6.39-rc3-cleancache/fs/super.c      2011-04-13 17:08:09.175853426 -0600
> @@ -31,6 +31,7 @@
>  #include <linux/mutex.h>
>  #include <linux/backing-dev.h>
>  #include <linux/rculist_bl.h>
> +#include <linux/cleancache.h>
>  #include "internal.h"
>
>
> @@ -112,6 +113,7 @@ static struct super_block *alloc_super(s
>                s->s_maxbytes = MAX_NON_LFS;
>                s->s_op = &default_op;
>                s->s_time_gran = 1000000000;
> +               s->cleancache_poolid = -1;
>        }
>  out:
>        return s;
> @@ -177,6 +179,7 @@ void deactivate_locked_super(struct supe
>  {
>        struct file_system_type *fs = s->s_type;
>        if (atomic_dec_and_test(&s->s_active)) {
> +               cleancache_flush_fs(s);
>                fs->kill_sb(s);
>                /*
>                 * We need to call rcu_barrier so all the delayed rcu free
> --- linux-2.6.39-rc3/fs/buffer.c        2011-04-11 18:21:51.000000000 -0600
> +++ linux-2.6.39-rc3-cleancache/fs/buffer.c     2011-04-13 17:07:24.700917174 -0600
> @@ -41,6 +41,7 @@
>  #include <linux/bitops.h>
>  #include <linux/mpage.h>
>  #include <linux/bit_spinlock.h>
> +#include <linux/cleancache.h>
>
>  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
>
> @@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device
>        invalidate_bh_lrus();
>        lru_add_drain_all();    /* make sure all lru add caches are flushed */
>        invalidate_mapping_pages(mapping, 0, -1);
> +       /* 99% of the time, we don't need to flush the cleancache on the bdev.
> +        * But, for the strange corners, lets be cautious
> +        */
> +       cleancache_flush_inode(mapping);
>  }
>  EXPORT_SYMBOL(invalidate_bdev);
>
> --- linux-2.6.39-rc3/fs/mpage.c 2011-04-11 18:21:51.000000000 -0600
> +++ linux-2.6.39-rc3-cleancache/fs/mpage.c      2011-04-13 17:07:24.706913410 -0600
> @@ -27,6 +27,7 @@
>  #include <linux/writeback.h>
>  #include <linux/backing-dev.h>
>  #include <linux/pagevec.h>
> +#include <linux/cleancache.h>
>
>  /*
>  * I/O completion handler for multipage BIOs.
> @@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struc
>                SetPageMappedToDisk(page);
>        }
>
> +       if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
> +           cleancache_get_page(page) == 0) {
> +               SetPageUptodate(page);
> +               goto confused;
> +       }
> +
>        /*
>         * This page will go to BIO.  Do we need to send this BIO off first?
>         */
> --- linux-2.6.39-rc3/mm/filemap.c       2011-04-11 18:21:51.000000000 -0600
> +++ linux-2.6.39-rc3-cleancache/mm/filemap.c    2011-04-13 17:09:46.367852002 -0600
> @@ -34,6 +34,7 @@
>  #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
>  #include <linux/memcontrol.h>
>  #include <linux/mm_inline.h> /* for page_is_file_cache() */
> +#include <linux/cleancache.h>
>  #include "internal.h"
>
>  /*
> @@ -118,6 +119,16 @@ void __delete_from_page_cache(struct pag
>  {
>        struct address_space *mapping = page->mapping;
>
> +       /*
> +        * if we're uptodate, flush out into the cleancache, otherwise
> +        * invalidate any existing cleancache entries.  We can't leave
> +        * stale data around in the cleancache once our page is gone
> +        */
> +       if (PageUptodate(page) && PageMappedToDisk(page))
> +               cleancache_put_page(page);
> +       else
> +               cleancache_flush_page(mapping, page);
> +

First of all, thanks for resolving conflict with my patch.

Before I suggested a thing about cleancache_flush_page, cleancache_flush_inode.

what's the meaning of flush's semantic?
I thought it means invalidation.
AFAIC, how about change flush with invalidate?
Dan Magenheimer - April 15, 2011, 2:47 p.m.
Hi Minchan --

> First of all, thanks for resolving conflict with my patch.

You're welcome!  As I pointed out offlist, yours was the first
change in MM that caused any semantic changes to the cleancache
core hooks patch since before 2.6.18.
 
> Before I suggested a thing about cleancache_flush_page,
> cleancache_flush_inode.
> 
> what's the meaning of flush's semantic?
> I thought it means invalidation.
> AFAIC, how about change flush with invalidate?

I'm not sure the words "flush" and "invalidate" are defined
precisely or used consistently everywhere in computer
science, but I think that "invalidate" is to destroy
a "pointer" to some data, but not necessarily destroy the
data itself.   And "flush" means to actually remove
the data.  So one would "invalidate a mapping" but one
would "flush a cache".

Since cleancache_flush_page and cleancache_flush_inode
semantically remove data from cleancache, I think flush
is a better name than invalidate.

Does that make sense?

Thanks,
Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton - April 15, 2011, 3:10 p.m.
On Fri, 15 Apr 2011 07:47:57 -0700 (PDT) Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> Hi Minchan --
> 
> > First of all, thanks for resolving conflict with my patch.
> 
> You're welcome!  As I pointed out offlist, yours was the first
> change in MM that caused any semantic changes to the cleancache
> core hooks patch since before 2.6.18.
>  
> > Before I suggested a thing about cleancache_flush_page,
> > cleancache_flush_inode.
> > 
> > what's the meaning of flush's semantic?
> > I thought it means invalidation.
> > AFAIC, how about change flush with invalidate?
> 
> I'm not sure the words "flush" and "invalidate" are defined
> precisely or used consistently everywhere in computer
> science, but I think that "invalidate" is to destroy
> a "pointer" to some data, but not necessarily destroy the
> data itself.   And "flush" means to actually remove
> the data.  So one would "invalidate a mapping" but one
> would "flush a cache".
> 
> Since cleancache_flush_page and cleancache_flush_inode
> semantically remove data from cleancache, I think flush
> is a better name than invalidate.
> 
> Does that make sense?
> 

nope ;)

Kernel code freely uses "flush" to refer to both invalidation and to
writeback, sometimes in confusing ways.  In this case,
cleancache_flush_inode and cleancache_flush_page rather sound like they
might write those things to backing store.  
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Magenheimer - April 15, 2011, 3:32 p.m.
> From: Andrew Morton [mailto:akpm@linux-foundation.org]
> On Fri, 15 Apr 2011 07:47:57 -0700 (PDT) Dan Magenheimer
> <dan.magenheimer@oracle.com> wrote:
> 
> > Hi Minchan --
> >
> > > Before I suggested a thing about cleancache_flush_page,
> > > cleancache_flush_inode.
> > >
> > > what's the meaning of flush's semantic?
> > > I thought it means invalidation.
> > > AFAIC, how about change flush with invalidate?
> >
> > I'm not sure the words "flush" and "invalidate" are defined
> > precisely or used consistently everywhere in computer
> > science, but I think that "invalidate" is to destroy
> > a "pointer" to some data, but not necessarily destroy the
> > data itself.   And "flush" means to actually remove
> > the data.  So one would "invalidate a mapping" but one
> > would "flush a cache".
> >
> > Since cleancache_flush_page and cleancache_flush_inode
> > semantically remove data from cleancache, I think flush
> > is a better name than invalidate.
> >
> > Does that make sense?
> 
> nope ;)
> 
> Kernel code freely uses "flush" to refer to both invalidation and to
> writeback, sometimes in confusing ways.  In this case,
> cleancache_flush_inode and cleancache_flush_page rather sound like they
> might write those things to backing store.

OK, I guess I am displaying my kernel-newbie-ness... though,
in this case, writeback of a cleancache page to backing store
doesn't make much sense either (since cleancache pages are
by definition "clean").

I'm happy to rename the hooks, though will probably not
repost a V9 unless/until more substantive changes collect...
unless someone considers this an unmergeable offense.

Thanks for the feedback, Minchan and Andrew!
Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
OGAWA Hirofumi - April 15, 2011, 3:37 p.m.
Andrew Morton <akpm@linux-foundation.org> writes:

>> > Before I suggested a thing about cleancache_flush_page,
>> > cleancache_flush_inode.
>> > 
>> > what's the meaning of flush's semantic?
>> > I thought it means invalidation.
>> > AFAIC, how about change flush with invalidate?
>> 
>> I'm not sure the words "flush" and "invalidate" are defined
>> precisely or used consistently everywhere in computer
>> science, but I think that "invalidate" is to destroy
>> a "pointer" to some data, but not necessarily destroy the
>> data itself.   And "flush" means to actually remove
>> the data.  So one would "invalidate a mapping" but one
>> would "flush a cache".
>> 
>> Since cleancache_flush_page and cleancache_flush_inode
>> semantically remove data from cleancache, I think flush
>> is a better name than invalidate.
>> 
>> Does that make sense?
>> 
>
> nope ;)
>
> Kernel code freely uses "flush" to refer to both invalidation and to
> writeback, sometimes in confusing ways.  In this case,
> cleancache_flush_inode and cleancache_flush_page rather sound like they
> might write those things to backing store.  

I'd like to mention about *_{get,put}_page too. In linux get/put is not
meaning read/write. There is {get,put}_page those are refcount stuff
(Yeah, and I felt those methods does refcount by quick read. But it
seems to be false. There is no xen codes, so I don't know actually
though.).

And I agree, I also think the needing thing is consistency on the linux
codes (term).

Thanks.
Dan Magenheimer - April 15, 2011, 6:53 p.m.
> From: OGAWA Hirofumi [mailto:hirofumi@mail.parknet.co.jp]
> 
> Andrew Morton <akpm@linux-foundation.org> writes:
> 
> >> > Before I suggested a thing about cleancache_flush_page,
> >> > cleancache_flush_inode.
> >> >
> >> > what's the meaning of flush's semantic?
> >> > I thought it means invalidation.
> >> > AFAIC, how about change flush with invalidate?
> >>
> >> I'm not sure the words "flush" and "invalidate" are defined
> >> precisely or used consistently everywhere in computer
> >> science, but I think that "invalidate" is to destroy
> >> a "pointer" to some data, but not necessarily destroy the
> >> data itself.   And "flush" means to actually remove
> >> the data.  So one would "invalidate a mapping" but one
> >> would "flush a cache".
> >>
> >> Since cleancache_flush_page and cleancache_flush_inode
> >> semantically remove data from cleancache, I think flush
> >> is a better name than invalidate.
> >>
> >> Does that make sense?
> >
> > nope ;)
> >
> > Kernel code freely uses "flush" to refer to both invalidation and to
> > writeback, sometimes in confusing ways.  In this case,
> > cleancache_flush_inode and cleancache_flush_page rather sound like
> they
> > might write those things to backing store.
> 
> I'd like to mention about *_{get,put}_page too. In linux get/put is not
> meaning read/write. There is {get,put}_page those are refcount stuff
> (Yeah, and I felt those methods does refcount by quick read. But it
> seems to be false. There is no xen codes, so I don't know actually
> though.).
> 
> And I agree, I also think the needing thing is consistency on the linux
> codes (term).
> 
> Thanks.
> --
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

Hmmm, yes, that's a point of confusion also.  No, cleancache put/get
do not have any relationship with reference counting.

Andrew, I wonder if you would be so kind as to read the following
and make a "ruling".  If you determine a preferable set of names,
I will abide by your decision and repost (if necessary).

The problem is this: The English language has a limited number
of words that can be used to represent data motion and mapping
and most/all of them are already used in the kernel, often,
to quote Andrew, "in confusing ways."  Complicating this, I
think the semantics of the cleancache operations are different
from the semantics of any other kernel operation... intentionally
so, because the value of cleancache is a direct result of those
differing semantics.  And the cleancache semantics
are fairly complex (again intentionally) so a single function
name can't possibly describe the semantics.

The cleancache operations are:
- put (page)
- get (page)
- flush page
- flush inode
- init fs
- flush fs

I think these names are reasonable representations of the
semantics of the operations performed... but I'm not a kernel
expert so there is certainly room for disagreement.  Though I
absolutely recognize the importance of a "name", I am primarily
interested in merging the semantics of the operations and
would happily accept any name that kernel developers could
agree on.  However, I fear that there will be NO name that
will satisfy all, so would prefer to keep the existing names.
If some renaming is eventually agreed upon, this could be done
post-merge.

Here's a brief description of the semantics:

The cleancache operation currently known as "put" has the
following semantics:  If *possible*, please take the data
contained in the pageframe referred to by this struct page
into cleancache and associate it with the filesystem-determined
"handle" derived from the struct page.

The cleancache operation currently known as "get" has the
following semantics:  Derive the filesystem-determined handle
from this struct page.  If cleancache contains a page matching
that handle, recreate the page of data from cleancache and
place the results in the pageframe referred to by the
struct page.  Then delete in cleancache any record of the
handle and any data associated with it, so that a
subsequent "get" will no longer find a match for the handle;
any space used for the data can also be freed.

(Note that "take the data" and "recreate the page of data" are
similar in semantics to "copy to" and "copy from", but since
the cleancache operation may perform an "inflight" transformation
on the data, and "copy" usually means a byte-for-byte replication,
the word "copy" is also misleading.)

The cleancache operation currently known as "flush" has the
following semantics:  Derive the filesystem-determined handle
from this struct page and struct mapping.  If cleancache
contains a page matching that handle, delete in cleancache any
record of the handle and any data associated with it, so that a
subsequent "get" will no longer find a match for the handle;
any space used for the data can also be freed

The cleancache operation currently known as "flush inode" has
the following semantics: Derive the filesystem-determined filekey
from this struct mapping.  If cleancache contains ANY handles
matching that filekey, delete in cleancache any record of
any matching handle and any data associated with those handles;
any space used for the data can also be freed.

The cleancache operation currently known as "init fs" has
the following semantics: Create a unique poolid to refer
to this filesystem and save it in the superblock's
cleancache_poolid field.

The cleancache operation currently known as "flush fs" has
the following semantics: Get the cleancache_poolid field
from this superblock.  If cleancache contains ANY handles
associated with that poolid, delete in cleancache any
record of any matching handles and any data associated with
those handles; any space used for the data can also be freed.
Also, set the superblock's cleancache_poolid to be invalid
and, in cleancache, recycle the poolid so a subsequent init_fs
operation can reuse it.

That's all!

Thanks,
Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
MinChan Kim - April 18, 2011, 5:32 a.m.
On Sat, Apr 16, 2011 at 3:53 AM, Dan Magenheimer
<dan.magenheimer@oracle.com> wrote:
>> From: OGAWA Hirofumi [mailto:hirofumi@mail.parknet.co.jp]
>>
>> Andrew Morton <akpm@linux-foundation.org> writes:
>>
>> >> > Before I suggested a thing about cleancache_flush_page,
>> >> > cleancache_flush_inode.
>> >> >
>> >> > what's the meaning of flush's semantic?
>> >> > I thought it means invalidation.
>> >> > AFAIC, how about change flush with invalidate?
>> >>
>> >> I'm not sure the words "flush" and "invalidate" are defined
>> >> precisely or used consistently everywhere in computer
>> >> science, but I think that "invalidate" is to destroy
>> >> a "pointer" to some data, but not necessarily destroy the
>> >> data itself.   And "flush" means to actually remove
>> >> the data.  So one would "invalidate a mapping" but one
>> >> would "flush a cache".
>> >>
>> >> Since cleancache_flush_page and cleancache_flush_inode
>> >> semantically remove data from cleancache, I think flush
>> >> is a better name than invalidate.
>> >>
>> >> Does that make sense?
>> >
>> > nope ;)
>> >
>> > Kernel code freely uses "flush" to refer to both invalidation and to
>> > writeback, sometimes in confusing ways.  In this case,
>> > cleancache_flush_inode and cleancache_flush_page rather sound like
>> they
>> > might write those things to backing store.
>>
>> I'd like to mention about *_{get,put}_page too. In linux get/put is not
>> meaning read/write. There is {get,put}_page those are refcount stuff
>> (Yeah, and I felt those methods does refcount by quick read. But it
>> seems to be false. There is no xen codes, so I don't know actually
>> though.).
>>
>> And I agree, I also think the needing thing is consistency on the linux
>> codes (term).
>>
>> Thanks.
>> --
>> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
>
> Hmmm, yes, that's a point of confusion also.  No, cleancache put/get
> do not have any relationship with reference counting.
>
> Andrew, I wonder if you would be so kind as to read the following
> and make a "ruling".  If you determine a preferable set of names,
> I will abide by your decision and repost (if necessary).
>
> The problem is this: The English language has a limited number
> of words that can be used to represent data motion and mapping
> and most/all of them are already used in the kernel, often,
> to quote Andrew, "in confusing ways."  Complicating this, I
> think the semantics of the cleancache operations are different
> from the semantics of any other kernel operation... intentionally
> so, because the value of cleancache is a direct result of those
> differing semantics.  And the cleancache semantics
> are fairly complex (again intentionally) so a single function
> name can't possibly describe the semantics.
>
> The cleancache operations are:
> - put (page)
> - get (page)
> - flush page
> - flush inode
> - init fs
> - flush fs
>
> I think these names are reasonable representations of the
> semantics of the operations performed... but I'm not a kernel
> expert so there is certainly room for disagreement.  Though I
> absolutely recognize the importance of a "name", I am primarily
> interested in merging the semantics of the operations and
> would happily accept any name that kernel developers could
> agree on.  However, I fear that there will be NO name that
> will satisfy all, so would prefer to keep the existing names.
> If some renaming is eventually agreed upon, this could be done
> post-merge.
>
> Here's a brief description of the semantics:
>
> The cleancache operation currently known as "put" has the
> following semantics:  If *possible*, please take the data
> contained in the pageframe referred to by this struct page
> into cleancache and associate it with the filesystem-determined
> "handle" derived from the struct page.
>
> The cleancache operation currently known as "get" has the
> following semantics:  Derive the filesystem-determined handle
> from this struct page.  If cleancache contains a page matching
> that handle, recreate the page of data from cleancache and
> place the results in the pageframe referred to by the
> struct page.  Then delete in cleancache any record of the
> handle and any data associated with it, so that a
> subsequent "get" will no longer find a match for the handle;
> any space used for the data can also be freed.
>
> (Note that "take the data" and "recreate the page of data" are
> similar in semantics to "copy to" and "copy from", but since
> the cleancache operation may perform an "inflight" transformation
> on the data, and "copy" usually means a byte-for-byte replication,
> the word "copy" is also misleading.)
>
> The cleancache operation currently known as "flush" has the
> following semantics:  Derive the filesystem-determined handle
> from this struct page and struct mapping.  If cleancache
> contains a page matching that handle, delete in cleancache any
> record of the handle and any data associated with it, so that a
> subsequent "get" will no longer find a match for the handle;
> any space used for the data can also be freed
>
> The cleancache operation currently known as "flush inode" has
> the following semantics: Derive the filesystem-determined filekey
> from this struct mapping.  If cleancache contains ANY handles
> matching that filekey, delete in cleancache any record of
> any matching handle and any data associated with those handles;
> any space used for the data can also be freed.
>
> The cleancache operation currently known as "init fs" has
> the following semantics: Create a unique poolid to refer
> to this filesystem and save it in the superblock's
> cleancache_poolid field.
>
> The cleancache operation currently known as "flush fs" has
> the following semantics: Get the cleancache_poolid field
> from this superblock.  If cleancache contains ANY handles
> associated with that poolid, delete in cleancache any
> record of any matching handles and any data associated with
> those handles; any space used for the data can also be freed.
> Also, set the superblock's cleancache_poolid to be invalid
> and, in cleancache, recycle the poolid so a subsequent init_fs
> operation can reuse it.
>
> That's all!
>
> Thanks,
> Dan
>

At least, I didn't confused your semantics except just flush. That's
why I suggested only flush but after seeing your explaining, there is
another thing I want to change. The get/put is common semantic of
reference counting in kernel but in POV your semantics, it makes sense
to me but get has a exclusive semantic so I want to represent it with
API name. Maybe cleancache_get_page_exclusive.

The summary is that I don't want to change all API name. Just two thing.
(I am not sure you and others agree on me. It's just suggestion).

1. cleancache_flush_page -> cleancache_[invalidate|remove]_page
2. cleancache_get_page -> cleancache_get_page_exclusive

BTW, Nice description.
Please include it in documentation if we can't reach the conclusion.
It will help others to understand semantic of cleancache.

Thanks, Dan.
Dan Magenheimer - April 26, 2011, 4 p.m.
> From: Minchan Kim [mailto:minchan.kim@gmail.com]
> On Sat, Apr 16, 2011 at 3:53 AM, Dan Magenheimer
> <dan.magenheimer@oracle.com> wrote:
> >> From: OGAWA Hirofumi [mailto:hirofumi@mail.parknet.co.jp]
> >>
> >> Andrew Morton <akpm@linux-foundation.org> writes:
> >>
> > Andrew, I wonder if you would be so kind as to read the following
> > and make a "ruling".  If you determine a preferable set of names,
> > I will abide by your decision and repost (if necessary).
> >
> > The problem is this: The English language has a limited number
> > of words that can be used to represent data motion and mapping
> > and most/all of them are already used in the kernel, often,
> > to quote Andrew, "in confusing ways."  Complicating this, I
> > think the semantics of the cleancache operations are different
> > from the semantics of any other kernel operation... intentionally
> > so, because the value of cleancache is a direct result of those
> > differing semantics.  And the cleancache semantics
> > are fairly complex (again intentionally) so a single function
> > name can't possibly describe the semantics.
> >
> > The cleancache operations are:
> > - put (page)
> > - get (page)
> > - flush page
> > - flush inode
> > - init fs
> > - flush fs
> >
> > I think these names are reasonable representations of the
> > semantics of the operations performed... but I'm not a kernel
> > expert so there is certainly room for disagreement.  Though I
> > absolutely recognize the importance of a "name", I am primarily
> > interested in merging the semantics of the operations and
> > would happily accept any name that kernel developers could
> > agree on.  However, I fear that there will be NO name that
> > will satisfy all, so would prefer to keep the existing names.
> > If some renaming is eventually agreed upon, this could be done
> > post-merge.
> >
> > Here's a brief description of the semantics:
> >     :
> >  <semantics for other operations elided>
> >     :
> > The cleancache operation currently known as "get" has the
> > following semantics:  Derive the filesystem-determined handle
> > from this struct page.  If cleancache contains a page matching
> > that handle, recreate the page of data from cleancache and
> > place the results in the pageframe referred to by the
> > struct page.  Then delete in cleancache any record of the
> > handle and any data associated with it, so that a
> > subsequent "get" will no longer find a match for the handle;
> > any space used for the data can also be freed.
> >     :
> >  <semantics for other operations elided>
> >     :
> 
> At least, I didn't confused your semantics except just flush. That's
> why I suggested only flush but after seeing your explaining, there is
> another thing I want to change. The get/put is common semantic of
> reference counting in kernel but in POV your semantics, it makes sense
> to me but get has a exclusive semantic so I want to represent it with
> API name. Maybe cleancache_get_page_exclusive.
> 
> The summary is that I don't want to change all API name. Just two
> thing.
> (I am not sure you and others agree on me. It's just suggestion).
> 
> 1. cleancache_flush_page -> cleancache_[invalidate|remove]_page
> 2. cleancache_get_page -> cleancache_get_page_exclusive
> 

Hi Minchan --

Thanks for continuing to be interested in this and sorry for my
delayed response.

Actually, your comment about "get_page_exclusive" points out an
incompleteness in my description of the semantics for
cleancache_get_page.

First, I forgot to list cleancache_init_shared_fs, which is
the equivalent of cleancache_init_fs but used for clustered
filesystems.  (Support is included in the patch for ocfs2 but
I haven't played with it in quite some time and my focus has
been on the other filesystems, so it slipped my mind :-}

The cleancache_get_page operation has a slightly different semantics
depending on which of the init_fs calls was used.  However, the
location of the cleancache_get_page hook is the same regardless
of the fs, so the name of the operation must represent both
semantics.  In the case of init_fs (non-shared), the behavior
of cleancache_get_page is that the get is "destructive"; the page
is removed from cleancache on a successful get.  In the case of
a init_shared_fs, however, the get is "non-destructive"; the
page is NOT removed from cleancache.  When cleancache contains
pages from multiple kernels (e.g. Xen guests or different machines
in a RAMster cluster), this semantic difference can make a big
performance difference for a clustered filesystem.  Since zcache
only contains pages for a single kernel, the difference is moot.

Because of this, I am hesitant to add "exclusive" to the
end of the name of the operation.

> BTW, Nice description.
> Please include it in documentation if we can't reach the conclusion.
> It will help others to understand semantic of cleancache.

Thanks!  Nearly all of the description already exists in various
places in the patch but I agree that it would be good if I add
a new section to the Documentation file with the exact semantics.

Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

--- linux-2.6.39-rc3/fs/super.c	2011-04-11 18:21:51.000000000 -0600
+++ linux-2.6.39-rc3-cleancache/fs/super.c	2011-04-13 17:08:09.175853426 -0600
@@ -31,6 +31,7 @@ 
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -112,6 +113,7 @@  static struct super_block *alloc_super(s
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->cleancache_poolid = -1;
 	}
 out:
 	return s;
@@ -177,6 +179,7 @@  void deactivate_locked_super(struct supe
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
+		cleancache_flush_fs(s);
 		fs->kill_sb(s);
 		/*
 		 * We need to call rcu_barrier so all the delayed rcu free
--- linux-2.6.39-rc3/fs/buffer.c	2011-04-11 18:21:51.000000000 -0600
+++ linux-2.6.39-rc3-cleancache/fs/buffer.c	2011-04-13 17:07:24.700917174 -0600
@@ -41,6 +41,7 @@ 
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@  void invalidate_bdev(struct block_device
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
--- linux-2.6.39-rc3/fs/mpage.c	2011-04-11 18:21:51.000000000 -0600
+++ linux-2.6.39-rc3-cleancache/fs/mpage.c	2011-04-13 17:07:24.706913410 -0600
@@ -27,6 +27,7 @@ 
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/cleancache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@  do_mpage_readpage(struct bio *bio, struc
 		SetPageMappedToDisk(page);
 	}
 
+	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+	    cleancache_get_page(page) == 0) {
+		SetPageUptodate(page);
+		goto confused;
+	}
+
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
--- linux-2.6.39-rc3/mm/filemap.c	2011-04-11 18:21:51.000000000 -0600
+++ linux-2.6.39-rc3-cleancache/mm/filemap.c	2011-04-13 17:09:46.367852002 -0600
@@ -34,6 +34,7 @@ 
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,16 @@  void __delete_from_page_cache(struct pag
 {
 	struct address_space *mapping = page->mapping;
 
+	/*
+	 * if we're uptodate, flush out into the cleancache, otherwise
+	 * invalidate any existing cleancache entries.  We can't leave
+	 * stale data around in the cleancache once our page is gone
+	 */
+	if (PageUptodate(page) && PageMappedToDisk(page))
+		cleancache_put_page(page);
+	else
+		cleancache_flush_page(mapping, page);
+
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
--- linux-2.6.39-rc3/mm/truncate.c	2011-04-11 18:21:51.000000000 -0600
+++ linux-2.6.39-rc3-cleancache/mm/truncate.c	2011-04-13 17:07:24.710911759 -0600
@@ -19,6 +19,7 @@ 
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -51,6 +52,7 @@  void do_invalidatepage(struct page *page
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+	cleancache_flush_page(page->mapping, page);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }
@@ -214,6 +216,7 @@  void truncate_inode_pages_range(struct a
 	pgoff_t next;
 	int i;
 
+	cleancache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -291,6 +294,7 @@  void truncate_inode_pages_range(struct a
 		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 	}
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -440,6 +444,7 @@  int invalidate_inode_pages2_range(struct
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
+	cleancache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@  int invalidate_inode_pages2_range(struct
 		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
+	cleancache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);