diff mbox

[01/11,RESEND] libe2p: Add new function get_fragment_score()

Message ID 4DF8522F.2020304@sx.jp.nec.com
State Superseded, archived
Headers show

Commit Message

Kazuya Mio June 15, 2011, 6:33 a.m. UTC
This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
the fragmentation score. It shows the percentage of extents whose size is
smaller than the input argument "threshold".

However, there are some cases that cannot be merged into a next extent.
The following extents are excepted from the calculation of fragmentation score:
- The extent whose initialize status is different from the next extent
- There is a hole between the extent and its next extent
- The extent is a tail

Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
---
 lib/e2p/Makefile.in      |    6 +
 lib/e2p/e2p.h            |    2 
 lib/e2p/fragment_score.c |  142 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 2 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Andreas Dilger June 16, 2011, 3:06 a.m. UTC | #1
On 2011-06-15, at 12:33 AM, Kazuya Mio <k-mio@sx.jp.nec.com> wrote:
> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
> the fragmentation score. It shows the percentage of extents whose size is
> smaller than the input argument "threshold".

> However, there are some cases that cannot be merged into a next extent.
> The following extents are excepted from the calculation of fragmentation score:
> - The extent whose initialize status is different from the next extent
> - There is a hole between the extent and its next extent
> - The extent is a tail

This description of the fragmentation score should also go into a comment at the function itself. Otherwise there is no description in the code of what the "score" is or what it's used for. 

> Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
> ---
> lib/e2p/Makefile.in      |    6 +
> lib/e2p/e2p.h            |    2 
> lib/e2p/fragment_score.c |  142 +++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 148 insertions(+), 2 deletions(-)
> diff --git a/lib/e2p/Makefile.in b/lib/e2p/Makefile.in
> index 9775a98..c62a81d 100644
> --- a/lib/e2p/Makefile.in
> +++ b/lib/e2p/Makefile.in
> @@ -19,7 +19,7 @@ all::    e2p.pc
> OBJS=        feature.o fgetflags.o fsetflags.o fgetversion.o fsetversion.o \
>        getflags.o getversion.o hashstr.o iod.o ls.o mntopts.o \
>        parse_num.o pe.o pf.o ps.o setflags.o setversion.o uuid.o \
> -        ostype.o percent.o
> +        ostype.o percent.o fragment_score.o
> 
> SRCS=        $(srcdir)/feature.c $(srcdir)/fgetflags.c \
>        $(srcdir)/fsetflags.c $(srcdir)/fgetversion.c \
> @@ -28,7 +28,7 @@ SRCS=        $(srcdir)/feature.c $(srcdir)/fgetflags.c \
>        $(srcdir)/ls.c $(srcdir)/mntopts.c $(srcdir)/parse_num.c \
>        $(srcdir)/pe.c $(srcdir)/pf.c $(srcdir)/ps.c \
>        $(srcdir)/setflags.c $(srcdir)/setversion.c $(srcdir)/uuid.c \
> -        $(srcdir)/ostype.c $(srcdir)/percent.c
> +        $(srcdir)/ostype.c $(srcdir)/percent.c $(srcdir)/fragment_score.c
> HFILES= e2p.h
> 
> LIBRARY= libe2p
> @@ -162,3 +162,5 @@ ostype.o: $(srcdir)/ostype.c $(srcdir)/e2p.h \
>  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
> percent.o: $(srcdir)/percent.c $(srcdir)/e2p.h \
>  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
> +fragment_score.o: $(srcdir)/fragment_score.c $(srcdir)/e2p.h \
> + $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
> diff --git a/lib/e2p/e2p.h b/lib/e2p/e2p.h
> index 4a68dd9..52a8e51 100644
> --- a/lib/e2p/e2p.h
> +++ b/lib/e2p/e2p.h
> @@ -72,3 +72,5 @@ char *e2p_os2string(int os_type);
> int e2p_string2os(char *str);
> 
> unsigned int e2p_percent(int percent, unsigned int base);
> +
> +int get_fragment_score(int fd, size_t threshold);

This function name should be prefixed with e2p_ to avoid name clashes and make it clear that it us part of the e2p library. 

> diff --git a/lib/e2p/fragment_score.c b/lib/e2p/fragment_score.c
> new file mode 100644
> index 0000000..3ad21b9
> --- /dev/null
> +++ b/lib/e2p/fragment_score.c
> @@ -0,0 +1,142 @@
> +/*
> + * fragment_score.c --- Get file fragmentation score on ext4 filesystem.
> + *
> + * Copyright (C) 2011    Kazuya Mio <k-mio@sx.jp.nec.com>
> + *            NEC Software Tohoku, Ltd.
> + *
> + * %Begin-Header%
> + * This file may be redistributed under the terms of the GNU Library
> + * General Public License, version 2.
> + * %End-Header%
> + */
> +
> +#define _LARGEFILE64_SOURCE
> +
> +#if HAVE_ERRNO_H
> +#include <errno.h>
> +#endif
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/vfs.h>
> +
> +#include <ext2fs/ext2_types.h>
> +#include <ext2fs/fiemap.h>
> +
> +#include "e2p.h"
> +
> +#define    EXT3_IOC_GETFLAGS        _IOR('f', 1, long)
> +
> +#ifdef HAVE_FSTAT64
> +#define FSTAT        fstat64
> +#define STRUCT_STAT    struct stat64
> +#else
> +#define FSTAT        fstat
> +#define STRUCT_STAT    struct stat
> +#endif
> +
> +static int is_target_extent(struct fiemap_extent *cur_fm_ext,
> +                 struct fiemap_extent *next_fm_ext)
> +{
> +    /* Check a hole between the extent */
> +    if ((cur_fm_ext->fe_logical + cur_fm_ext->fe_length)
> +        < next_fm_ext->fe_logical)
> +        return 0;
> +    /* Check a defference of unwritten flag */
> +    if ((cur_fm_ext->fe_flags & FIEMAP_EXTENT_UNWRITTEN)
> +        != (next_fm_ext->fe_flags & FIEMAP_EXTENT_UNWRITTEN))
> +        return 0;
> +
> +    /* target extent */
> +    return 1;
> +}
> +
> +int get_fragment_score(int fd, size_t threshold)
> +{
> +    unsigned int    flags = 0;
> +    STRUCT_STAT    fileinfo;
> +    struct statfs    fsinfo;
> +    char buf[4096] = "";
> +    struct fiemap *fiemap = (struct fiemap *)buf;
> +    struct fiemap_extent *fm_ext = &fiemap->fm_extents[0];
> +    struct fiemap_extent prev_fm_ext;
> +    int count = (sizeof(buf) - sizeof(*fiemap)) /
> +            sizeof(struct fiemap_extent);
> +    int tot_extents = 0;
> +    int frag_extents = 0;
> +    unsigned int i;
> +    int first = 1, last = 0;
> +
> +    if (FSTAT(fd, &fileinfo) < 0 ||
> +        fstatfs(fd, &fsinfo) < 0)
> +        return -1;
> +    if (ioctl(fd, EXT3_IOC_GETFLAGS, &flags) < 0)
> +        flags = 0;
> +
> +    /*
> +     * Return an error if the target file is not the following cases.
> +     * - regular file
> +     * - extent format file on ext4 filesystem
> +     */
> +    if (!S_ISREG(fileinfo.st_mode) ||
> +        fsinfo.f_type != EXT2_SUPER_MAGIC ||
> +        !(flags & EXT4_EXTENTS_FL)) {
> +        errno = EOPNOTSUPP;
> +        return -1;
> +    }

I don't think there is a particular reason why this function shouldn't be usable for any filesytem that can return FIEMAP data. 

The fragmentation score should be independent of the underlying implementation, and it is the job of the caller to decide what to do with this information.  In the case of e4defrag it couldn't defrag a directory (ioctl should fail) and it will likely fail on other filesystem types but that is fine also. If other filesystems begin to support this ioctl they may want to begin using the same defrag tool. 

> +    memset(fiemap, 0, sizeof(struct fiemap));
> +    fiemap->fm_start = 0;
> +    fiemap->fm_length = ~0ULL;
> +    fiemap->fm_extent_count = count;
> +
> +    do {
> +        fiemap->fm_flags = 0;
> +        if (ioctl(fd, FS_IOC_FIEMAP, (unsigned long) fiemap) < 0)
> +            return -1;
> +
> +        /* If 0 extents are returned, then more ioctls are not needed */
> +        if (fiemap->fm_mapped_extents == 0)
> +            break;
> +
> +        if (first != 0)
> +            first = 0;
> +        else {
> +            /* Check the last extent gotten by previous FIEMAP */
> +            if (is_target_extent(&prev_fm_ext, &fm_ext[0])) {
> +                tot_extents++;
> +                if (prev_fm_ext.fe_length < threshold)
> +                    frag_extents++;
> +            }
> +        }
> +
> +        for (i = 0; i < fiemap->fm_mapped_extents; i++) {
> +            if (fm_ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
> +                last = 1;
> +                continue;
> +            }
> +
> +            if (fiemap->fm_mapped_extents - 1 == i) {
> +                memcpy(&prev_fm_ext, &fm_ext[i],
> +                    sizeof(struct fiemap_extent));
> +                continue;
> +            }
> +
> +            /* Check target extent */
> +            if (!is_target_extent(&fm_ext[i], &fm_ext[i+1]))
> +                continue;
> +
> +            tot_extents++;
> +
> +            if (fm_ext[i].fe_length < threshold)
> +                frag_extents++;
> +        }
> +
> +        fiemap->fm_start = (fm_ext[i-1].fe_logical +
> +                    fm_ext[i-1].fe_length);
> +    } while (last == 0);
> +
> +    return tot_extents == 0 ? 0 : (100 * frag_extents) / tot_extents;
> +}

Cheers, Andreas--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 17, 2011, 3:01 a.m. UTC | #2
Hi Andreas,

Thanks for your comments.
I'm fixing my patchset now.
I will send it next week.

Regards,
Kazuya Mio

2011/06/16 12:06, Andreas Dilger wrote:
> On 2011-06-15, at 12:33 AM, Kazuya Mio<k-mio@sx.jp.nec.com>  wrote:
>> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
>> the fragmentation score. It shows the percentage of extents whose size is
>> smaller than the input argument "threshold".
>
>> However, there are some cases that cannot be merged into a next extent.
>> The following extents are excepted from the calculation of fragmentation score:
>> - The extent whose initialize status is different from the next extent
>> - There is a hole between the extent and its next extent
>> - The extent is a tail
>
> This description of the fragmentation score should also go into a comment at the function itself. Otherwise there is no description in the code of what the "score" is or what it's used for.
>
>> Signed-off-by: Kazuya Mio<k-mio@sx.jp.nec.com>
>> ---
>> lib/e2p/Makefile.in      |    6 +
>> lib/e2p/e2p.h            |    2
>> lib/e2p/fragment_score.c |  142 +++++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 148 insertions(+), 2 deletions(-)
>> diff --git a/lib/e2p/Makefile.in b/lib/e2p/Makefile.in
>> index 9775a98..c62a81d 100644
>> --- a/lib/e2p/Makefile.in
>> +++ b/lib/e2p/Makefile.in
>> @@ -19,7 +19,7 @@ all::    e2p.pc
>> OBJS=        feature.o fgetflags.o fsetflags.o fgetversion.o fsetversion.o \
>>         getflags.o getversion.o hashstr.o iod.o ls.o mntopts.o \
>>         parse_num.o pe.o pf.o ps.o setflags.o setversion.o uuid.o \
>> -        ostype.o percent.o
>> +        ostype.o percent.o fragment_score.o
>>
>> SRCS=        $(srcdir)/feature.c $(srcdir)/fgetflags.c \
>>         $(srcdir)/fsetflags.c $(srcdir)/fgetversion.c \
>> @@ -28,7 +28,7 @@ SRCS=        $(srcdir)/feature.c $(srcdir)/fgetflags.c \
>>         $(srcdir)/ls.c $(srcdir)/mntopts.c $(srcdir)/parse_num.c \
>>         $(srcdir)/pe.c $(srcdir)/pf.c $(srcdir)/ps.c \
>>         $(srcdir)/setflags.c $(srcdir)/setversion.c $(srcdir)/uuid.c \
>> -        $(srcdir)/ostype.c $(srcdir)/percent.c
>> +        $(srcdir)/ostype.c $(srcdir)/percent.c $(srcdir)/fragment_score.c
>> HFILES= e2p.h
>>
>> LIBRARY= libe2p
>> @@ -162,3 +162,5 @@ ostype.o: $(srcdir)/ostype.c $(srcdir)/e2p.h \
>>   $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
>> percent.o: $(srcdir)/percent.c $(srcdir)/e2p.h \
>>   $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
>> +fragment_score.o: $(srcdir)/fragment_score.c $(srcdir)/e2p.h \
>> + $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
>> diff --git a/lib/e2p/e2p.h b/lib/e2p/e2p.h
>> index 4a68dd9..52a8e51 100644
>> --- a/lib/e2p/e2p.h
>> +++ b/lib/e2p/e2p.h
>> @@ -72,3 +72,5 @@ char *e2p_os2string(int os_type);
>> int e2p_string2os(char *str);
>>
>> unsigned int e2p_percent(int percent, unsigned int base);
>> +
>> +int get_fragment_score(int fd, size_t threshold);
>
> This function name should be prefixed with e2p_ to avoid name clashes and make it clear that it us part of the e2p library.
>
>> diff --git a/lib/e2p/fragment_score.c b/lib/e2p/fragment_score.c
>> new file mode 100644
>> index 0000000..3ad21b9
>> --- /dev/null
>> +++ b/lib/e2p/fragment_score.c
>> @@ -0,0 +1,142 @@
>> +/*
>> + * fragment_score.c --- Get file fragmentation score on ext4 filesystem.
>> + *
>> + * Copyright (C) 2011    Kazuya Mio<k-mio@sx.jp.nec.com>
>> + *            NEC Software Tohoku, Ltd.
>> + *
>> + * %Begin-Header%
>> + * This file may be redistributed under the terms of the GNU Library
>> + * General Public License, version 2.
>> + * %End-Header%
>> + */
>> +
>> +#define _LARGEFILE64_SOURCE
>> +
>> +#if HAVE_ERRNO_H
>> +#include<errno.h>
>> +#endif
>> +#include<unistd.h>
>> +#include<string.h>
>> +#include<sys/ioctl.h>
>> +#include<sys/types.h>
>> +#include<sys/stat.h>
>> +#include<sys/vfs.h>
>> +
>> +#include<ext2fs/ext2_types.h>
>> +#include<ext2fs/fiemap.h>
>> +
>> +#include "e2p.h"
>> +
>> +#define    EXT3_IOC_GETFLAGS        _IOR('f', 1, long)
>> +
>> +#ifdef HAVE_FSTAT64
>> +#define FSTAT        fstat64
>> +#define STRUCT_STAT    struct stat64
>> +#else
>> +#define FSTAT        fstat
>> +#define STRUCT_STAT    struct stat
>> +#endif
>> +
>> +static int is_target_extent(struct fiemap_extent *cur_fm_ext,
>> +                 struct fiemap_extent *next_fm_ext)
>> +{
>> +    /* Check a hole between the extent */
>> +    if ((cur_fm_ext->fe_logical + cur_fm_ext->fe_length)
>> +<  next_fm_ext->fe_logical)
>> +        return 0;
>> +    /* Check a defference of unwritten flag */
>> +    if ((cur_fm_ext->fe_flags&  FIEMAP_EXTENT_UNWRITTEN)
>> +        != (next_fm_ext->fe_flags&  FIEMAP_EXTENT_UNWRITTEN))
>> +        return 0;
>> +
>> +    /* target extent */
>> +    return 1;
>> +}
>> +
>> +int get_fragment_score(int fd, size_t threshold)
>> +{
>> +    unsigned int    flags = 0;
>> +    STRUCT_STAT    fileinfo;
>> +    struct statfs    fsinfo;
>> +    char buf[4096] = "";
>> +    struct fiemap *fiemap = (struct fiemap *)buf;
>> +    struct fiemap_extent *fm_ext =&fiemap->fm_extents[0];
>> +    struct fiemap_extent prev_fm_ext;
>> +    int count = (sizeof(buf) - sizeof(*fiemap)) /
>> +            sizeof(struct fiemap_extent);
>> +    int tot_extents = 0;
>> +    int frag_extents = 0;
>> +    unsigned int i;
>> +    int first = 1, last = 0;
>> +
>> +    if (FSTAT(fd,&fileinfo)<  0 ||
>> +        fstatfs(fd,&fsinfo)<  0)
>> +        return -1;
>> +    if (ioctl(fd, EXT3_IOC_GETFLAGS,&flags)<  0)
>> +        flags = 0;
>> +
>> +    /*
>> +     * Return an error if the target file is not the following cases.
>> +     * - regular file
>> +     * - extent format file on ext4 filesystem
>> +     */
>> +    if (!S_ISREG(fileinfo.st_mode) ||
>> +        fsinfo.f_type != EXT2_SUPER_MAGIC ||
>> +        !(flags&  EXT4_EXTENTS_FL)) {
>> +        errno = EOPNOTSUPP;
>> +        return -1;
>> +    }
>
> I don't think there is a particular reason why this function shouldn't be usable for any filesytem that can return FIEMAP data.
>
> The fragmentation score should be independent of the underlying implementation, and it is the job of the caller to decide what to do with this information.  In the case of e4defrag it couldn't defrag a directory (ioctl should fail) and it will likely fail on other filesystem types but that is fine also. If other filesystems begin to support this ioctl they may want to begin using the same defrag tool.
>
>> +    memset(fiemap, 0, sizeof(struct fiemap));
>> +    fiemap->fm_start = 0;
>> +    fiemap->fm_length = ~0ULL;
>> +    fiemap->fm_extent_count = count;
>> +
>> +    do {
>> +        fiemap->fm_flags = 0;
>> +        if (ioctl(fd, FS_IOC_FIEMAP, (unsigned long) fiemap)<  0)
>> +            return -1;
>> +
>> +        /* If 0 extents are returned, then more ioctls are not needed */
>> +        if (fiemap->fm_mapped_extents == 0)
>> +            break;
>> +
>> +        if (first != 0)
>> +            first = 0;
>> +        else {
>> +            /* Check the last extent gotten by previous FIEMAP */
>> +            if (is_target_extent(&prev_fm_ext,&fm_ext[0])) {
>> +                tot_extents++;
>> +                if (prev_fm_ext.fe_length<  threshold)
>> +                    frag_extents++;
>> +            }
>> +        }
>> +
>> +        for (i = 0; i<  fiemap->fm_mapped_extents; i++) {
>> +            if (fm_ext[i].fe_flags&  FIEMAP_EXTENT_LAST) {
>> +                last = 1;
>> +                continue;
>> +            }
>> +
>> +            if (fiemap->fm_mapped_extents - 1 == i) {
>> +                memcpy(&prev_fm_ext,&fm_ext[i],
>> +                    sizeof(struct fiemap_extent));
>> +                continue;
>> +            }
>> +
>> +            /* Check target extent */
>> +            if (!is_target_extent(&fm_ext[i],&fm_ext[i+1]))
>> +                continue;
>> +
>> +            tot_extents++;
>> +
>> +            if (fm_ext[i].fe_length<  threshold)
>> +                frag_extents++;
>> +        }
>> +
>> +        fiemap->fm_start = (fm_ext[i-1].fe_logical +
>> +                    fm_ext[i-1].fe_length);
>> +    } while (last == 0);
>> +
>> +    return tot_extents == 0 ? 0 : (100 * frag_extents) / tot_extents;
>> +}
>
> Cheers, Andreas
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o June 17, 2011, 3:18 a.m. UTC | #3
On Wed, Jun 15, 2011 at 03:33:19PM +0900, Kazuya Mio wrote:
> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
> the fragmentation score. It shows the percentage of extents whose size is
> smaller than the input argument "threshold".

It perhaps might be useful to also articulate what are the goals of
this metric.  Is just just to decide which files should be
defragmented, and which should be left alone?  Or do you want to be
able to compare which file is "worse off"?

I can imagine two files that have a score of 100%, but one is much
worse off than the other.  Does that matter?  It may or might not,
depending how you plan to use the fragmentation score, both now and in
the future.  So it might be good to explicitly declare what are the
goals for this metrics, and its planned use cases.

Regards,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen June 17, 2011, 2:20 p.m. UTC | #4
On 6/16/11 10:18 PM, Ted Ts'o wrote:
> On Wed, Jun 15, 2011 at 03:33:19PM +0900, Kazuya Mio wrote:
>> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
>> the fragmentation score. It shows the percentage of extents whose size is
>> smaller than the input argument "threshold".
> 
> It perhaps might be useful to also articulate what are the goals of
> this metric.  Is just just to decide which files should be
> defragmented, and which should be left alone?  Or do you want to be
> able to compare which file is "worse off"?
> 
> I can imagine two files that have a score of 100%, but one is much
> worse off than the other.  Does that matter?  It may or might not,
> depending how you plan to use the fragmentation score, both now and in
> the future.  So it might be good to explicitly declare what are the
> goals for this metrics, and its planned use cases.
> 
> Regards,

Just as a random datapoint, the xfs_db "frag factor" has been a constant
source of misunderstanding and woe for us.  (Granted, it works differently;
it is an fs-wide number representing

	((actual - ideal) / actual)

extents in the fs.)

This "% of fragments smaller than threshold" is more easily understandable
and possibly more descriptive, but I think Ted makes good points;
think about how this will be used, and whether the metric is useful.

It's hard to make a single number a) make sense to the user, and b)
be usefully representative of fragmentation "badness" - so I am
feeling very cautious about this idea overall.

To really convey fragmentation "badness" you'd almost want a histogram
of fragment sizes, which is a bit hard to present concisely...


-Eric

> 						- Ted

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger June 18, 2011, 7:19 a.m. UTC | #5
I was thinking about this, and am wondering if it makes sense to have an absolute score for fragmentation instead of a relative one?

By absolute I mean something like fragments per MB or similar. A bad score might be anything > 1. For files smaller than 1 MB in size it would scale the ratio to the equivalent if the file was 1MB in size (e.g. a 16kB file with 4 fragments would have a score of 256, which is clearly bad).  Large files can have a score much less than 1, which is good. 

Cheers, Andreas

On 2011-06-17, at 8:20 AM, Eric Sandeen <sandeen@redhat.com> wrote:

> On 6/16/11 10:18 PM, Ted Ts'o wrote:
>> On Wed, Jun 15, 2011 at 03:33:19PM +0900, Kazuya Mio wrote:
>>> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
>>> the fragmentation score. It shows the percentage of extents whose size is
>>> smaller than the input argument "threshold".
>> 
>> It perhaps might be useful to also articulate what are the goals of
>> this metric.  Is just just to decide which files should be
>> defragmented, and which should be left alone?  Or do you want to be
>> able to compare which file is "worse off"?
>> 
>> I can imagine two files that have a score of 100%, but one is much
>> worse off than the other.  Does that matter?  It may or might not,
>> depending how you plan to use the fragmentation score, both now and in
>> the future.  So it might be good to explicitly declare what are the
>> goals for this metrics, and its planned use cases.
>> 
>> Regards,
> 
> Just as a random datapoint, the xfs_db "frag factor" has been a constant
> source of misunderstanding and woe for us.  (Granted, it works differently;
> it is an fs-wide number representing
> 
>    ((actual - ideal) / actual)
> 
> extents in the fs.)
> 
> This "% of fragments smaller than threshold" is more easily understandable
> and possibly more descriptive, but I think Ted makes good points;
> think about how this will be used, and whether the metric is useful.
> 
> It's hard to make a single number a) make sense to the user, and b)
> be usefully representative of fragmentation "badness" - so I am
> feeling very cautious about this idea overall.
> 
> To really convey fragmentation "badness" you'd almost want a histogram
> of fragment sizes, which is a bit hard to present concisely...
> 
> 
> -Eric
> 
>>                        - Ted
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 18, 2011, 5 p.m. UTC | #6
On Sat, Jun 18, 2011 at 3:19 AM, Andreas Dilger <aedilger@gmail.com> wrote:
> I was thinking about this, and am wondering if it makes sense to have an absolute score for fragmentation instead of a relative one?
>
> By absolute I mean something like fragments per MB or similar. A bad score might be anything > 1. For files smaller than 1 MB in size it would scale the ratio to the equivalent if the file was 1MB in size (e.g. a 16kB file with 4 fragments would have a score of 256, which is clearly bad).  Large files can have a score much less than 1, which is good.
>
> Cheers, Andreas

Shouldn't be based on fragments per max extent size for ext4?

And I think the max extent size for a 4KB page is 128 MB, right?

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Dilger June 18, 2011, 5:15 p.m. UTC | #7
On 2011-06-18, at 11:00 AM, Greg Freemyer <greg.freemyer@gmail.com> wrote:
> On Sat, Jun 18, 2011 at 3:19 AM, Andreas Dilger <aedilger@gmail.com> wrote:
>> I was thinking about this, and am wondering if it makes sense to have an absolute score for fragmentation instead of a relative one?
>> 
>> By absolute I mean something like fragments per MB or similar. A bad score might be anything > 1. For files smaller than 1 MB in size it would scale the ratio to the equivalent if the file was 1MB in size (e.g. a 16kB file with 4 fragments would have a score of 256, which is clearly bad).  Large files can have a score much less than 1, which is good.
>> 
>> Cheers, Andreas
> 
> Shouldn't be based on fragments per max extent size for ext4?
> 
> And I think the max extent size for a 4KB page is 128 MB, right?

I was thinking about that, but in most cases it is unrealistic that all files have 128MB extents except on empty test filesystems, and I don't think that files with "only" 32MB extents should be considered that badly off.

I don't particularly care what the exact scale is, but I like the idea of an absolute measure instead of a relative measure (i.e. 33% fragmented).

Cheers, Andreas--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 19, 2011, 7:55 p.m. UTC | #8
2011/6/15 Kazuya Mio <k-mio@sx.jp.nec.com>:
> This patch adds get_fragment_score() to libe2p. get_fragment_score() returns
> the fragmentation score. It shows the percentage of extents whose size is
> smaller than the input argument "threshold".
>
> However, there are some cases that cannot be merged into a next extent.
> The following extents are excepted from the calculation of fragmentation score:
> - The extent whose initialize status is different from the next extent
> - There is a hole between the extent and its next extent
> - The extent is a tail
>
> Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
> ---

Kazuya,

If you're are going to stick with a percentage based approach I agree
with special casing the tail of the file or the tail of each
block_group in a sparse file.

But I think you should also special case the head extent of a block_group.

More clearly, whatever special treatment that is afforded to the
tail_extent of a block_group, should also be afforded to head_extent.

Maybe something like this for sparse files:

M_total=0
N_total=0
For (each block group) {
   M = extents in block group below threshold
   N = extents in block group above threshold

   if first_extent is below threshold  {  --M; }  # don't count it as
below threshold
   if last_extent is below threshold  {  --M; }  # don't count it as
below threshold

   M_total +=M
   N_totatl += N

   block_group_fragmentation_score = M / (N+M)
}
file_fragmentation_score = M_total / (N_total + M_total)

===> overall e4defrag design feedback

e4defrag makes the decision to defrag sparse files at the full file
level last I looked and I believe it still does.

I think it should treat each non-sparse file and each block_group of a
sparse file separately as a target for defragmentation.

The patch you are submitting just embeds the "file" defragmentation
methodology further into the code.  It very much seems to me that the
e4defrag code and scoring needs to work at the block_group size at the
largest.

That is for sparse files it should "score" a block_group at a time at
most.  And then it should compare the target's block_group layout vs.
the donor files block_group layout.

Once it is decided to defragment a block_group, then I think it should
fallocate the largest available extent (that may require a kernel
patch).

If that newly allocated extent is bigger than the first extent in the
block_group, then call into the kernel to swap out the target blocks
for the donor blocks.  Then move to the next target extent and repeat.

For a large file, making the overall defrag choice at the block_group
level just seems right, but for actually replacing target extents with
donor extents, I really think that needs to be decided one extent at a
time as I describe above.

After all, what is the value taking a block group with a 100 full
extents followed by two partial extents and moving all of that data
around in order to defrag it.  Only the last 2 extents need to be
consolidated.

Using the approach I described above, the code would quickly bypass
the first 100 full extents and then just defrag the last 2 partial
extents.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 21, 2011, 11:26 a.m. UTC | #9
2011/06/17 12:18, Ted Ts'o wrote:
> It perhaps might be useful to also articulate what are the goals of
> this metric.  Is just just to decide which files should be
> defragmented, and which should be left alone?  Or do you want to be
> able to compare which file is "worse off"?

I decided to implement a fragmentation score for the two purposes:
one is for filefrag that outputs the score to decide which files should be
defragmented, and the other is for e4defrag that compares two files'
fragmentation to prevent the worse fragmentation.

> I can imagine two files that have a score of 100%, but one is much
> worse off than the other.  Does that matter?  It may or might not,
> depending how you plan to use the fragmentation score, both now and in
> the future.  So it might be good to explicitly declare what are the
> goals for this metrics, and its planned use cases.

Certainly, the same fragmentation score doesn't always mean the same
fragmentation. Just as Andreas said, "fragments per MB" is a good idea. It's
easy to understand, and other filesystem also would be able to use it without
change. Moreover, there is no worry about what threshold we use to
the application.

What do you think about this idea?

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 21, 2011, 11:28 a.m. UTC | #10
2011/06/18 16:19, Andreas Dilger wrote:
> I was thinking about this, and am wondering if it makes sense to have an absolute score for fragmentation
> instead of a relative one?
>
> By absolute I mean something like fragments per MB or similar. A bad score might be anything>  1. For
> files smaller than 1 MB in size it would scale the ratio to the equivalent if the file was 1MB in size
> (e.g. a 16kB file with 4 fragments would have a score of 256, which is clearly bad).  Large files can
> have a score much less than 1, which is good.

I think fragments per MB is easy to understand. I will fix the library function
to "double e2p_get_fragscore(int fd)". To return fragments per MB, it will
get the number of extents and the total length of extents except the following
special cases:
- The extent whose initialize status is different from the next extent
- There is a hole between the extent and the next extent
- The extent is a tail

The output of filefrag would be as follows:

# filefrag /mnt/mp1/testfile
/mnt/mp1/testfile: 4 extents found, 0.75 fragments/MB

Regards,
Kazuya Mio

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Theodore Ts'o June 21, 2011, 1:56 p.m. UTC | #11
On Tue, Jun 21, 2011 at 08:26:25PM +0900, Kazuya Mio wrote:
> 
> I decided to implement a fragmentation score for the two purposes:
> one is for filefrag that outputs the score to decide which files should be
> defragmented, and the other is for e4defrag that compares two files'
> fragmentation to prevent the worse fragmentation.

I'm really nervous about having filefrag print a "fragmentation
score".  The problem is that the problem is invariably far more
complex than can be boiled into a single number, and so users look at
it and start worrying when they shouldn't.

And the statement, "so that e4defrag can compare two files'
fragmentation to prevent the worse fragmentation" begs the question of
what is "worse".  The real issue here is that it's a multidimensional
problem.

> Certainly, the same fragmentation score doesn't always mean the same
> fragmentation. Just as Andreas said, "fragments per MB" is a good idea. It's
> easy to understand, and other filesystem also would be able to use it without
> change. Moreover, there is no worry about what threshold we use to
> the application.

"fragments per megabyte" is definitely better, especially if you
disregard the tail.  It's worth consider how it works for files
smaller than a megabyte.  Do you round the file size up to the nearest
megabyte?  Is it an integer score, or does it need to be floating
point?  An integer score where the size is rounded up to the nearest
megabyte sounds like a best plan, but I'm sure we could still find
some interesting non-linearities that lead to surprising results.

	      	    	    	      	   - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 23, 2011, 8 a.m. UTC | #12
2011/06/21 22:56, Ted Ts'o wrote:
> I'm really nervous about having filefrag print a "fragmentation
> score".  The problem is that the problem is invariably far more
> complex than can be boiled into a single number, and so users look at
> it and start worrying when they shouldn't.

It's possible that could happen. I suppose filefrag should output
"fragmented" or "not fragmented" to understand when they shouldn't do e4defag.
But it would be difficult to implement this idea because the threshold for
the determination of which file is fragmented is different of each filesystem.
As it stands now, I think filefrag shouldn't output fragmentation score.

However, I think I add get_fragment_score() to libe2p because e4defrag will
still use it.

> And the statement, "so that e4defrag can compare two files'
> fragmentation to prevent the worse fragmentation" begs the question of
> what is "worse".  The real issue here is that it's a multidimensional
> problem.

We need to define "what is worse" for e4defrag. If fragments per megabyte of
the file is bigger than the threshold, e4defrag will call EXT4_IOC_MOVE_EXT
ioctl. The threshold may be customizable by an option.

> "fragments per megabyte" is definitely better, especially if you
> disregard the tail.  It's worth consider how it works for files
> smaller than a megabyte.  Do you round the file size up to the nearest
> megabyte?  Is it an integer score, or does it need to be floating
> point?  An integer score where the size is rounded up to the nearest
> megabyte sounds like a best plan, but I'm sure we could still find
> some interesting non-linearities that lead to surprising results.

An integer score sounds good to me.

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 23, 2011, 11:16 a.m. UTC | #13
On Tue, Jun 21, 2011 at 7:28 AM, Kazuya Mio <k-mio@sx.jp.nec.com> wrote:
> 2011/06/18 16:19, Andreas Dilger wrote:
>>
>> I was thinking about this, and am wondering if it makes sense to have an
>> absolute score for fragmentation
>> instead of a relative one?
>>
>> By absolute I mean something like fragments per MB or similar. A bad score
>> might be anything>  1. For
>> files smaller than 1 MB in size it would scale the ratio to the equivalent
>> if the file was 1MB in size
>> (e.g. a 16kB file with 4 fragments would have a score of 256, which is
>> clearly bad).  Large files can
>> have a score much less than 1, which is good.
>
> I think fragments per MB is easy to understand. I will fix the library
> function
> to "double e2p_get_fragscore(int fd)". To return fragments per MB, it will
> get the number of extents and the total length of extents except the
> following
> special cases:
> - The extent whose initialize status is different from the next extent
> - There is a hole between the extent and the next extent
> - The extent is a tail

For a sparse file, can you explain why you treat the head and tail
extents of a block group differently?

The issue is totally symetric in my mind, so either include both or
exclude both in my opinion.  The above description only excludes the
block group tail extents.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 23, 2011, 11:27 a.m. UTC | #14
On Thu, Jun 23, 2011 at 7:16 AM, Greg Freemyer <greg.freemyer@gmail.com> wrote:
> On Tue, Jun 21, 2011 at 7:28 AM, Kazuya Mio <k-mio@sx.jp.nec.com> wrote:
>> 2011/06/18 16:19, Andreas Dilger wrote:
>>>
>>> I was thinking about this, and am wondering if it makes sense to have an
>>> absolute score for fragmentation
>>> instead of a relative one?
>>>
>>> By absolute I mean something like fragments per MB or similar. A bad score
>>> might be anything>  1. For
>>> files smaller than 1 MB in size it would scale the ratio to the equivalent
>>> if the file was 1MB in size
>>> (e.g. a 16kB file with 4 fragments would have a score of 256, which is
>>> clearly bad).  Large files can
>>> have a score much less than 1, which is good.
>>
>> I think fragments per MB is easy to understand. I will fix the library
>> function
>> to "double e2p_get_fragscore(int fd)". To return fragments per MB, it will
>> get the number of extents and the total length of extents except the
>> following
>> special cases:
>> - The extent whose initialize status is different from the next extent
>> - There is a hole between the extent and the next extent
>> - The extent is a tail
>
> For a sparse file, can you explain why you treat the head and tail
> extents of a block group differently?
>
> The issue is totally symetric in my mind, so either include both or
> exclude both in my opinion.  The above description only excludes the
> block group tail extents.
>
> Greg

I forgot to say, fragments per MB is not a large enough unit in my mind.

A storage system that can transfer 100MB / sec , but takes 5 msecs to
seek and do a half rotation of the platter would spend 33% of its time
seeking if it had 1MB contiguous sections of data all over the drive.

Fragments per max extent really seems the only logical thng for
e4defrag to use in my mind.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 24, 2011, 8:28 a.m. UTC | #15
Hi Greg,
I'm sorry for the late reply.

2011/06/23 20:16, Greg Freemyer wrote:
> For a sparse file, can you explain why you treat the head and tail
> extents of a block group differently?

Could you tell me what "block group" you said means?

If "block group" means the ext4 block group, I will treat the head and tail
extents of a block group the same way.

And if "block group" means the chunk of the extents whose offset is continued,
I will treat only the tail extents as a special case.

# filefrag -v /mnt/mp1/file
Filesystem type is: ef53
File size of /mnt/mp1/file is 285212672 (69632 blocks, blocksize 4096)
  ext logical physical expected length flags
    0       0    34816           30720
    1   30720    65536            2048 unwritten
    2   65536    67584            4096 unwritten,eof
/mnt/mp1/file: 1 extent found

The case is not fragmented. The length of #1 extent is a little bit short,
but there is no point in doing defragmentation because of the hole existence.

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 26, 2011, 2:16 a.m. UTC | #16
On Fri, Jun 24, 2011 at 4:28 AM, Kazuya Mio <k-mio@sx.jp.nec.com> wrote:
> Hi Greg,
> I'm sorry for the late reply.
>
> 2011/06/23 20:16, Greg Freemyer wrote:
>>
>> For a sparse file, can you explain why you treat the head and tail
>> extents of a block group differently?
>
> Could you tell me what "block group" you said means?

Kazuya,

Sorry if I've got the terminology wrong.

I'm talking about sparse files.

Where you might have:

block_group1 - hole - block_group2 - hole - block_group3.

Block_group2 has a head and a tail extent.  In my mind, from a
performance perspective, they are symmetric.  Meaning that having a
small extent at the beginning is no better and no worse than having a
small extent at the end.

> If "block group" means the ext4 block group, I will treat the head and tail
> extents of a block group the same way.
>
> And if "block group" means the chunk of the extents whose offset is
> continued,
> I will treat only the tail extents as a special case.
>
> # filefrag -v /mnt/mp1/file
> Filesystem type is: ef53
> File size of /mnt/mp1/file is 285212672 (69632 blocks, blocksize 4096)
>  ext logical physical expected length flags
>   0       0    34816           30720
>   1 30720 65536            2048 unwritten
>   2   65536    67584            4096 unwritten,eof
> /mnt/mp1/file: 1 extent found
>
> The case is not fragmented. The length of #1 extent is a little bit short,
> but there is no point in doing defragmentation because of the hole
> existence.

Please consider this caseL

Assume for a minute  a large sparse file with lots of holes.  This is
one like a VM might create.  And that in the middle of the file is a
block_group with a hole in front of it and a hole after it.  Assume it
is 1 extent long exactly, and that one extent is a maximum extent (ie
128 MB with 4KB blocks I believe).

Now assume the VM writes a single short extent at the end of
block_group.  Your logic says full_extent+small_extent, no need to
defrag because its already optimal.

I agree with that.

Now assume instead of the block_group being extended at the tail, it
is extended at the head.

Now you have:

   small_extent+full_extent

Your logic as I understand it will score that as not as good as the
first case.  I disagree,  Both are optimally defragged and both should
get the same score.

>
> Regards,
> Kazuya Mio

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio June 28, 2011, 10:21 a.m. UTC | #17
2011/06/26 11:16, Greg Freemyer wrote:
> I'm talking about sparse files.
>
> Where you might have:
>
> block_group1 - hole - block_group2 - hole - block_group3.
>
> Block_group2 has a head and a tail extent.  In my mind, from a
> performance perspective, they are symmetric.  Meaning that having a
> small extent at the beginning is no better and no worse than having a
> small extent at the end.

Thanks for your kind description. Your point is right. I'll think a little more
about the fragmentation score.

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Freemyer June 28, 2011, 1:53 p.m. UTC | #18
On Tue, Jun 28, 2011 at 6:21 AM, Kazuya Mio <k-mio@sx.jp.nec.com> wrote:
> 2011/06/26 11:16, Greg Freemyer wrote:
>>
>> I'm talking about sparse files.
>>
>> Where you might have:
>>
>> block_group1 - hole - block_group2 - hole - block_group3.
>>
>> Block_group2 has a head and a tail extent.  In my mind, from a
>> performance perspective, they are symmetric.  Meaning that having a
>> small extent at the beginning is no better and no worse than having a
>> small extent at the end.
>
> Thanks for your kind description. Your point is right. I'll think a little
> more
> about the fragmentation score.
>
> Regards,
> Kazuya Mio

Kazuya,

While you're thinking about the issue:

As I hope I've said before, for sparse file I think e4defrag should
score and defrag one block_group at a time.  Thus if a VM backing
storage file has 100 block_groups (as I'm using the term), then it
should score each of the 100 separately and if needed defrag them one
at a time.

I can see no benefit from treating a large sparse file as monolithic
for the decision process.

fyi: Is there an agreed on term for what I'm calling a block_group.  I
believe e4defrag uses the term "extent group" in the comments, but
sparse files exist in non-extent based filesystems, so it's not a very
portable name.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio July 1, 2011, 8:34 a.m. UTC | #19
2011/06/28 22:53, Greg Freemyer wrote:
> While you're thinking about the issue:
>
> As I hope I've said before, for sparse file I think e4defrag should
> score and defrag one block_group at a time.  Thus if a VM backing
> storage file has 100 block_groups (as I'm using the term), then it
> should score each of the 100 separately and if needed defrag them one
> at a time.
>
> I can see no benefit from treating a large sparse file as monolithic
> for the decision process.

If e4defrag does defrag one block_group at a time, this block_group may be
allocated far away from the other block_groups. If so, seek time increases
even if the number of extents is less than before.

Of course, I'm aware of the advantage of your suggestion. I may also try to
consider it to defrag only a part of a file in the future, but before that
I think I should do the cleanup and bugfix.

> fyi: Is there an agreed on term for what I'm calling a block_group.  I
> believe e4defrag uses the term "extent group" in the comments, but
> sparse files exist in non-extent based filesystems, so it's not a very
> portable name.

e4defrag supports only an extent based filesystem, so I think it's no problem.
And I associate "block_group" with the physical layout of the blocks on the
disk. I guess we shouldn't use the same word with different meanings.

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kazuya Mio July 7, 2011, 10:40 a.m. UTC | #20
Hi all,

I come up with the new fragmentation score based on your comments.
The interface of new libe2p function will be as follows:

int e2p_get_fragscore(int fd, __u32 threshold, __u32 max_chunk_blks)

This function returns the fragmentation score that shows how badly fragmented
the file might be. The score is extents per @threshold, so the higher score
means the worse fragmentation.

Fragmentation score treats extents, whose file offset continues and
whose status is the same, as one chunk. If the number of extents
in chunk is equal to the ideal number of extents, the chunk is not used for
the fragmentation score because there is no fragment in the chunk.
The ideal number of extents is calculated based on @max_chunk_blks.

In case of ext4, @max_chunk_blks is 32768 blocks. If you have 128MB file
with two extents, these two extents are used for the calculation of
the fragmentation score because the ideal number of extents in this file is one.

e4defrag will judge the necessity of calling EXT4_IOC_MOVE_EXT ioctl by
fragmentation score. If the fragmentation score of defrag target file is
more than one, and the score of the file created by fallocate is zero,
e4defrag will call the ioctl. I'll decide the better value of threshold during
some tests. If you must do e4defrag to the specified file, you will be able to
force defrag by using new option of e4defrag.


Two examples (@threshold=256, @max_chunk_blks=32768):

# filefrag -v fragment
Filesystem type is: ef53
File size of fragment is 409600 (100 blocks, blocksize 4096)
  ext logical physical expected length flags
    0       0    33807              16
    1      16    32848    33822     84 eof
fragment: 2 extents found

This small file has two extents, so it is fragmented. The calculation is
as follows:

fragmentation score = 2 / (100 / 256)
                     = 5 extents / MB.

# filefrag -v not_fragment
Filesystem type is: ef53
File size of not_fragment is 125829120 (30720 blocks, blocksize 4096)
  ext logical physical expected length flags
    0       0    33823              16
    1      16    32944    33838     48
    2      64    33152    32991     64
    3     128    34304    33215  30592 eof
not_fragment: 4 extents found

This file has four extents included one large extent. So the score is zero
in this case.

fragmentation score = 4 / (30720 / 256)
                     = 0 extents / MB.

Regards,
Kazuya Mio
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/lib/e2p/Makefile.in b/lib/e2p/Makefile.in
index 9775a98..c62a81d 100644
--- a/lib/e2p/Makefile.in
+++ b/lib/e2p/Makefile.in
@@ -19,7 +19,7 @@  all::	e2p.pc
 OBJS=		feature.o fgetflags.o fsetflags.o fgetversion.o fsetversion.o \
 		getflags.o getversion.o hashstr.o iod.o ls.o mntopts.o \
 		parse_num.o pe.o pf.o ps.o setflags.o setversion.o uuid.o \
-		ostype.o percent.o
+		ostype.o percent.o fragment_score.o
 
 SRCS=		$(srcdir)/feature.c $(srcdir)/fgetflags.c \
 		$(srcdir)/fsetflags.c $(srcdir)/fgetversion.c \
@@ -28,7 +28,7 @@  SRCS=		$(srcdir)/feature.c $(srcdir)/fgetflags.c \
 		$(srcdir)/ls.c $(srcdir)/mntopts.c $(srcdir)/parse_num.c \
 		$(srcdir)/pe.c $(srcdir)/pf.c $(srcdir)/ps.c \
 		$(srcdir)/setflags.c $(srcdir)/setversion.c $(srcdir)/uuid.c \
-		$(srcdir)/ostype.c $(srcdir)/percent.c
+		$(srcdir)/ostype.c $(srcdir)/percent.c $(srcdir)/fragment_score.c
 HFILES= e2p.h
 
 LIBRARY= libe2p
@@ -162,3 +162,5 @@  ostype.o: $(srcdir)/ostype.c $(srcdir)/e2p.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
 percent.o: $(srcdir)/percent.c $(srcdir)/e2p.h \
  $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
+fragment_score.o: $(srcdir)/fragment_score.c $(srcdir)/e2p.h \
+ $(top_srcdir)/lib/ext2fs/ext2_fs.h $(top_builddir)/lib/ext2fs/ext2_types.h
diff --git a/lib/e2p/e2p.h b/lib/e2p/e2p.h
index 4a68dd9..52a8e51 100644
--- a/lib/e2p/e2p.h
+++ b/lib/e2p/e2p.h
@@ -72,3 +72,5 @@  char *e2p_os2string(int os_type);
 int e2p_string2os(char *str);
 
 unsigned int e2p_percent(int percent, unsigned int base);
+
+int get_fragment_score(int fd, size_t threshold);
diff --git a/lib/e2p/fragment_score.c b/lib/e2p/fragment_score.c
new file mode 100644
index 0000000..3ad21b9
--- /dev/null
+++ b/lib/e2p/fragment_score.c
@@ -0,0 +1,142 @@ 
+/*
+ * fragment_score.c --- Get file fragmentation score on ext4 filesystem.
+ *
+ * Copyright (C) 2011	Kazuya Mio <k-mio@sx.jp.nec.com>
+ *			NEC Software Tohoku, Ltd.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#define _LARGEFILE64_SOURCE
+
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+
+#include <ext2fs/ext2_types.h>
+#include <ext2fs/fiemap.h>
+
+#include "e2p.h"
+
+#define	EXT3_IOC_GETFLAGS		_IOR('f', 1, long)
+
+#ifdef HAVE_FSTAT64
+#define FSTAT		fstat64
+#define STRUCT_STAT	struct stat64
+#else
+#define FSTAT		fstat
+#define STRUCT_STAT	struct stat
+#endif
+
+static int is_target_extent(struct fiemap_extent *cur_fm_ext,
+			     struct fiemap_extent *next_fm_ext)
+{
+	/* Check a hole between the extent */
+	if ((cur_fm_ext->fe_logical + cur_fm_ext->fe_length)
+		< next_fm_ext->fe_logical)
+		return 0;
+	/* Check a defference of unwritten flag */
+	if ((cur_fm_ext->fe_flags & FIEMAP_EXTENT_UNWRITTEN)
+		!= (next_fm_ext->fe_flags & FIEMAP_EXTENT_UNWRITTEN))
+		return 0;
+
+	/* target extent */
+	return 1;
+}
+
+int get_fragment_score(int fd, size_t threshold)
+{
+	unsigned int	flags = 0;
+	STRUCT_STAT	fileinfo;
+	struct statfs	fsinfo;
+	char buf[4096] = "";
+	struct fiemap *fiemap = (struct fiemap *)buf;
+	struct fiemap_extent *fm_ext = &fiemap->fm_extents[0];
+	struct fiemap_extent prev_fm_ext;
+	int count = (sizeof(buf) - sizeof(*fiemap)) /
+			sizeof(struct fiemap_extent);
+	int tot_extents = 0;
+	int frag_extents = 0;
+	unsigned int i;
+	int first = 1, last = 0;
+
+	if (FSTAT(fd, &fileinfo) < 0 ||
+	    fstatfs(fd, &fsinfo) < 0)
+		return -1;
+	if (ioctl(fd, EXT3_IOC_GETFLAGS, &flags) < 0)
+		flags = 0;
+
+	/*
+	 * Return an error if the target file is not the following cases.
+	 * - regular file
+	 * - extent format file on ext4 filesystem
+	 */
+	if (!S_ISREG(fileinfo.st_mode) ||
+	    fsinfo.f_type != EXT2_SUPER_MAGIC ||
+	    !(flags & EXT4_EXTENTS_FL)) {
+		errno = EOPNOTSUPP;
+		return -1;
+	}
+
+	memset(fiemap, 0, sizeof(struct fiemap));
+	fiemap->fm_start = 0;
+	fiemap->fm_length = ~0ULL;
+	fiemap->fm_extent_count = count;
+
+	do {
+		fiemap->fm_flags = 0;
+		if (ioctl(fd, FS_IOC_FIEMAP, (unsigned long) fiemap) < 0)
+			return -1;
+
+		/* If 0 extents are returned, then more ioctls are not needed */
+		if (fiemap->fm_mapped_extents == 0)
+			break;
+
+		if (first != 0)
+			first = 0;
+		else {
+			/* Check the last extent gotten by previous FIEMAP */
+			if (is_target_extent(&prev_fm_ext, &fm_ext[0])) {
+				tot_extents++;
+				if (prev_fm_ext.fe_length < threshold)
+					frag_extents++;
+			}
+		}
+
+		for (i = 0; i < fiemap->fm_mapped_extents; i++) {
+			if (fm_ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
+				last = 1;
+				continue;
+			}
+
+			if (fiemap->fm_mapped_extents - 1 == i) {
+				memcpy(&prev_fm_ext, &fm_ext[i],
+					sizeof(struct fiemap_extent));
+				continue;
+			}
+
+			/* Check target extent */
+			if (!is_target_extent(&fm_ext[i], &fm_ext[i+1]))
+				continue;
+
+			tot_extents++;
+
+			if (fm_ext[i].fe_length < threshold)
+				frag_extents++;
+		}
+
+		fiemap->fm_start = (fm_ext[i-1].fe_logical +
+				    fm_ext[i-1].fe_length);
+	} while (last == 0);
+
+	return tot_extents == 0 ? 0 : (100 * frag_extents) / tot_extents;
+}