diff mbox series

[v2] syscalls/readahead02: limit max readahead to backing device max_readahead_kb

Message ID e27f21eef1ea317efcae24816b7a014e9e663f2d.1551802444.git.jstancek@redhat.com
State Superseded, archived
Headers show
Series [v2] syscalls/readahead02: limit max readahead to backing device max_readahead_kb | expand

Commit Message

Jan Stancek March 5, 2019, 4:17 p.m. UTC
Using system-wide "Cached" size is not accurate. The test is sporadically
failing with warning on ppc64le 4.18 and 5.0 kernels.

Problem is that the test over-estimates max readahead size, which then
leads to fewer readahead calls, and the kernel can silently trim the
length in each of them:
  ...
  readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
  readahead02.c:134: INFO: creating test file of size: 67108864
  readahead02.c:263: INFO: read_testfile(0)
  readahead02.c:274: INFO: read_testfile(1)
  readahead02.c:189: INFO: max ra estimate: 12320768
  readahead02.c:198: INFO: readahead calls made: 6
  readahead02.c:204: PASS: offset is still at 0 as expected
  readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
  readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
  readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
  readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
  readahead02.c:316: PASS: readahead saved some I/O
  readahead02.c:324: INFO: cache can hold at least: 264192 kB
  readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
  readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
  readahead02.c:338: WARN: using less cache than expected

Stop relying on used cache size, and use backing device readahead
limit instead.

Signed-off-by: Jan Stancek <jstancek@redhat.com>
---
 testcases/kernel/syscalls/readahead/readahead02.c | 38 +++++++++++++----------
 1 file changed, 21 insertions(+), 17 deletions(-)

Comments

Amir Goldstein March 5, 2019, 4:35 p.m. UTC | #1
On Tue, Mar 5, 2019 at 6:17 PM Jan Stancek <jstancek@redhat.com> wrote:
>
> Using system-wide "Cached" size is not accurate. The test is sporadically
> failing with warning on ppc64le 4.18 and 5.0 kernels.
>
> Problem is that test over-estimates max readahead size, which then
> leads to fewer readhead calls and kernel can silently trims length
> in each of them:
>   ...
>   readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
>   readahead02.c:134: INFO: creating test file of size: 67108864
>   readahead02.c:263: INFO: read_testfile(0)
>   readahead02.c:274: INFO: read_testfile(1)
>   readahead02.c:189: INFO: max ra estimate: 12320768
>   readahead02.c:198: INFO: readahead calls made: 6
>   readahead02.c:204: PASS: offset is still at 0 as expected
>   readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
>   readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
>   readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
>   readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
>   readahead02.c:316: PASS: readahead saved some I/O
>   readahead02.c:324: INFO: cache can hold at least: 264192 kB
>   readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
>   readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
>   readahead02.c:338: WARN: using less cache than expected
>
> Stop relying on used cache size, and use backing device readahead
> limit instead.
>

This is certainly better than 4K, but still feels like we are not really testing
the API properly, but I'm fine with this fix.

However... as follow up, how about extending the new
tst_dev_bytes_written() utils from Sumit to cover also bytes_read
and replace validation of readahead() from get_cached_size() diff
to tst_dev_bytes_read()?

That feels like a much more deterministic test and in-fact readahead()
validation is kind of like the reverse of sync_file_range() validation.

Thanks,
Amir.


> Signed-off-by: Jan Stancek <jstancek@redhat.com>
> ---
>  testcases/kernel/syscalls/readahead/readahead02.c | 38 +++++++++++++----------
>  1 file changed, 21 insertions(+), 17 deletions(-)
>
> diff --git a/testcases/kernel/syscalls/readahead/readahead02.c b/testcases/kernel/syscalls/readahead/readahead02.c
> index 293c839e169e..c4c90a51c7de 100644
> --- a/testcases/kernel/syscalls/readahead/readahead02.c
> +++ b/testcases/kernel/syscalls/readahead/readahead02.c
> @@ -49,7 +49,7 @@ static int ovl_mounted;
>  #define OVL_UPPER      MNTPOINT"/upper"
>  #define OVL_WORK       MNTPOINT"/work"
>  #define OVL_MNT                MNTPOINT"/ovl"
> -#define MIN_SANE_READAHEAD (4u * 1024u)
> +int min_sane_readahead  = 4096;
>
>  static const char mntpoint[] = MNTPOINT;
>
> @@ -165,13 +165,11 @@ static int read_testfile(struct tcase *tc, int do_readahead,
>         size_t i = 0;
>         long read_bytes_start;
>         unsigned char *p, tmp;
> -       unsigned long cached_start, max_ra_estimate = 0;
>         off_t offset = 0;
>
>         fd = SAFE_OPEN(fname, O_RDONLY);
>
>         if (do_readahead) {
> -               cached_start = get_cached_size();
>                 do {
>                         TEST(tc->readahead(fd, offset, fsize - offset));
>                         if (TST_RET != 0) {
> @@ -179,21 +177,8 @@ static int read_testfile(struct tcase *tc, int do_readahead,
>                                 return TST_ERR;
>                         }
>
> -                       /* estimate max readahead size based on first call */
> -                       if (!max_ra_estimate) {
> -                               *cached = get_cached_size();
> -                               if (*cached > cached_start) {
> -                                       max_ra_estimate = (1024 *
> -                                               (*cached - cached_start));
> -                                       tst_res(TINFO, "max ra estimate: %lu",
> -                                               max_ra_estimate);
> -                               }
> -                               max_ra_estimate = MAX(max_ra_estimate,
> -                                       MIN_SANE_READAHEAD);
> -                       }
> -
>                         i++;
> -                       offset += max_ra_estimate;
> +                       offset += min_sane_readahead;
>                 } while ((size_t)offset < fsize);
>                 tst_res(TINFO, "readahead calls made: %zu", i);
>                 *cached = get_cached_size();
> @@ -366,12 +351,31 @@ static void setup_overlay(void)
>
>  static void setup(void)
>  {
> +       struct stat sbuf;
> +       char tmp[PATH_MAX], sys_ra_path[PATH_MAX], *backing_dev;
> +       int read_ahead_kb;
> +
>         if (opt_fsizestr)
>                 testfile_size = SAFE_STRTOL(opt_fsizestr, 1, INT_MAX);
>
>         if (access(PROC_IO_FNAME, F_OK))
>                 tst_brk(TCONF, "Requires " PROC_IO_FNAME);
>
> +       /* Use backing device read_ahead_kb limit as min_sane_readahead */
> +       SAFE_LSTAT(tst_device->dev, &sbuf);
> +       if (S_ISLNK(sbuf.st_mode))
> +               SAFE_READLINK(tst_device->dev, tmp, PATH_MAX);
> +       else
> +               strcpy(tmp, tst_device->dev);
> +       backing_dev = basename(tmp);
> +       sprintf(sys_ra_path, "/sys/class/block/%s/bdi/read_ahead_kb", backing_dev);
> +       if (access(sys_ra_path, F_OK) == 0) {
> +               SAFE_FILE_SCANF(sys_ra_path, "%d", &read_ahead_kb);
> +               if (read_ahead_kb * 1024 > min_sane_readahead)
> +                       min_sane_readahead = read_ahead_kb * 1024;
> +       }
> +       tst_res(TINFO, "min_sane_readahead: %u", min_sane_readahead);
> +
>         has_file(DROP_CACHES_FNAME, 1);
>         has_file(MEMINFO_FNAME, 1);
>
> --
> 1.8.3.1
>
Jan Stancek March 5, 2019, 4:55 p.m. UTC | #2
----- Original Message -----
> On Tue, Mar 5, 2019 at 6:17 PM Jan Stancek <jstancek@redhat.com> wrote:
> >
> > Using system-wide "Cached" size is not accurate. The test is sporadically
> > failing with warning on ppc64le 4.18 and 5.0 kernels.
> >
> > Problem is that test over-estimates max readahead size, which then
> > leads to fewer readhead calls and kernel can silently trims length
> > in each of them:
> >   ...
> >   readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
> >   readahead02.c:134: INFO: creating test file of size: 67108864
> >   readahead02.c:263: INFO: read_testfile(0)
> >   readahead02.c:274: INFO: read_testfile(1)
> >   readahead02.c:189: INFO: max ra estimate: 12320768
> >   readahead02.c:198: INFO: readahead calls made: 6
> >   readahead02.c:204: PASS: offset is still at 0 as expected
> >   readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
> >   readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
> >   readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
> >   readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
> >   readahead02.c:316: PASS: readahead saved some I/O
> >   readahead02.c:324: INFO: cache can hold at least: 264192 kB
> >   readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
> >   readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
> >   readahead02.c:338: WARN: using less cache than expected
> >
> > Stop relying on used cache size, and use backing device readahead
> > limit instead.
> >
> 
> This is certainly better than 4K, but still feels like we are not really
> testing
> the API properly, but I'm fine with this fix.
> 
> However... as follow up, how about extending the new
> tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> and replace validation of readahead() from get_cached_size() diff
> to tst_dev_bytes_read()?

There is something similar based on /proc/self/io. We could try using
that to estimate max readahead size.

Or /sys/class/block/$dev/stat as you suggested, not sure which one is
more accurate/up to date.

> That feels like a much more deterministic test and in-fact readahead()
> validation is kind of like the reverse of sync_file_range() validation.
> 
> Thanks,
> Amir.
>
Amir Goldstein March 5, 2019, 8:08 p.m. UTC | #3
On Tue, Mar 5, 2019 at 6:55 PM Jan Stancek <jstancek@redhat.com> wrote:
>
>
> ----- Original Message -----
> > On Tue, Mar 5, 2019 at 6:17 PM Jan Stancek <jstancek@redhat.com> wrote:
> > >
> > > Using system-wide "Cached" size is not accurate. The test is sporadically
> > > failing with warning on ppc64le 4.18 and 5.0 kernels.
> > >
> > > Problem is that test over-estimates max readahead size, which then
> > > leads to fewer readhead calls and kernel can silently trims length
> > > in each of them:
> > >   ...
> > >   readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
> > >   readahead02.c:134: INFO: creating test file of size: 67108864
> > >   readahead02.c:263: INFO: read_testfile(0)
> > >   readahead02.c:274: INFO: read_testfile(1)
> > >   readahead02.c:189: INFO: max ra estimate: 12320768
> > >   readahead02.c:198: INFO: readahead calls made: 6
> > >   readahead02.c:204: PASS: offset is still at 0 as expected
> > >   readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
> > >   readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
> > >   readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
> > >   readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
> > >   readahead02.c:316: PASS: readahead saved some I/O
> > >   readahead02.c:324: INFO: cache can hold at least: 264192 kB
> > >   readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
> > >   readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
> > >   readahead02.c:338: WARN: using less cache than expected
> > >
> > > Stop relying on used cache size, and use backing device readahead
> > > limit instead.
> > >
> >
> > This is certainly better than 4K, but still feels like we are not really
> > testing
> > the API properly, but I'm fine with this fix.
> >
> > However... as follow up, how about extending the new
> > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> > and replace validation of readahead() from get_cached_size() diff
> > to tst_dev_bytes_read()?
>
> There is something similar based on /proc/self/io. We could try using
> that to estimate max readahead size.
>
> Or /sys/class/block/$dev/stat as you suggested, not sure which one is
> more accurate/up to date.
>

I believe /proc/self/io doesn't count IO performed by kernel async
readahead against the process that issued the readahead, but didn't
check. The test uses /proc/self/io to check how much IO was avoided
by readahead...

So it's the combination of both that gives the most information, i.e.:
X bytes read from device, Y bytes out of which read by process
on mmap access... (the rest must have been read by kernel readahead)

Thanks,
Amir.
Jan Stancek March 5, 2019, 8:22 p.m. UTC | #4
----- Original Message -----
> On Tue, Mar 5, 2019 at 6:55 PM Jan Stancek <jstancek@redhat.com> wrote:
> >
> >
> > ----- Original Message -----
> > > On Tue, Mar 5, 2019 at 6:17 PM Jan Stancek <jstancek@redhat.com> wrote:
> > > >
> > > > Using system-wide "Cached" size is not accurate. The test is
> > > > sporadically
> > > > failing with warning on ppc64le 4.18 and 5.0 kernels.
> > > >
> > > > Problem is that test over-estimates max readahead size, which then
> > > > leads to fewer readhead calls and kernel can silently trims length
> > > > in each of them:
> > > >   ...
> > > >   readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
> > > >   readahead02.c:134: INFO: creating test file of size: 67108864
> > > >   readahead02.c:263: INFO: read_testfile(0)
> > > >   readahead02.c:274: INFO: read_testfile(1)
> > > >   readahead02.c:189: INFO: max ra estimate: 12320768
> > > >   readahead02.c:198: INFO: readahead calls made: 6
> > > >   readahead02.c:204: PASS: offset is still at 0 as expected
> > > >   readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
> > > >   readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
> > > >   readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
> > > >   readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
> > > >   readahead02.c:316: PASS: readahead saved some I/O
> > > >   readahead02.c:324: INFO: cache can hold at least: 264192 kB
> > > >   readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
> > > >   readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
> > > >   readahead02.c:338: WARN: using less cache than expected
> > > >
> > > > Stop relying on used cache size, and use backing device readahead
> > > > limit instead.
> > > >
> > >
> > > This is certainly better than 4K, but still feels like we are not really
> > > testing
> > > the API properly, but I'm fine with this fix.
> > >
> > > However... as follow up, how about extending the new
> > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> > > and replace validation of readahead() from get_cached_size() diff
> > > to tst_dev_bytes_read()?
> >
> > There is something similar based on /proc/self/io. We could try using
> > that to estimate max readahead size.
> >
> > Or /sys/class/block/$dev/stat as you suggested, not sure which one is
> > more accurate/up to date.
> >
> 
> I believe /proc/self/io doesn't count IO performed by kernel async
> readahead against the process that issued the readahead, but didn't
> check. The test uses /proc/self/io to check how many IO where avoided
> by readahead...

We could do one readahead() on the entire file, then read
the file and see how much IO we didn't manage to avoid.
The difference between the file size and the IO we couldn't avoid
would be our max readahead size.

> 
> Do its the combination of both that gives the most information, i.e.:
> X bytes read from device, Y bytes out of which read by process
> on mmap access... (the rest must have been read by kernel readahead)
> 
> Thanks,
> Amir.
>
Amir Goldstein March 5, 2019, 8:44 p.m. UTC | #5
On Tue, Mar 5, 2019 at 10:22 PM Jan Stancek <jstancek@redhat.com> wrote:
>
>
>
> ----- Original Message -----
> > On Tue, Mar 5, 2019 at 6:55 PM Jan Stancek <jstancek@redhat.com> wrote:
> > >
> > >
> > > ----- Original Message -----
> > > > On Tue, Mar 5, 2019 at 6:17 PM Jan Stancek <jstancek@redhat.com> wrote:
> > > > >
> > > > > Using system-wide "Cached" size is not accurate. The test is
> > > > > sporadically
> > > > > failing with warning on ppc64le 4.18 and 5.0 kernels.
> > > > >
> > > > > Problem is that test over-estimates max readahead size, which then
> > > > > leads to fewer readhead calls and kernel can silently trims length
> > > > > in each of them:
> > > > >   ...
> > > > >   readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
> > > > >   readahead02.c:134: INFO: creating test file of size: 67108864
> > > > >   readahead02.c:263: INFO: read_testfile(0)
> > > > >   readahead02.c:274: INFO: read_testfile(1)
> > > > >   readahead02.c:189: INFO: max ra estimate: 12320768
> > > > >   readahead02.c:198: INFO: readahead calls made: 6
> > > > >   readahead02.c:204: PASS: offset is still at 0 as expected
> > > > >   readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
> > > > >   readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
> > > > >   readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
> > > > >   readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
> > > > >   readahead02.c:316: PASS: readahead saved some I/O
> > > > >   readahead02.c:324: INFO: cache can hold at least: 264192 kB
> > > > >   readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
> > > > >   readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
> > > > >   readahead02.c:338: WARN: using less cache than expected
> > > > >
> > > > > Stop relying on used cache size, and use backing device readahead
> > > > > limit instead.
> > > > >
> > > >
> > > > This is certainly better than 4K, but still feels like we are not really
> > > > testing
> > > > the API properly, but I'm fine with this fix.
> > > >
> > > > However... as follow up, how about extending the new
> > > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> > > > and replace validation of readahead() from get_cached_size() diff
> > > > to tst_dev_bytes_read()?
> > >
> > > There is something similar based on /proc/self/io. We could try using
> > > that to estimate max readahead size.
> > >
> > > Or /sys/class/block/$dev/stat as you suggested, not sure which one is
> > > more accurate/up to date.
> > >
> >
> > I believe /proc/self/io doesn't count IO performed by kernel async
> > readahead against the process that issued the readahead, but didn't
> > check. The test uses /proc/self/io to check how many IO where avoided
> > by readahead...
>
> We could do one readahead() on entire file, then read
> the file and see how many IO we didn't manage to avoid.
> The difference between filesize and IO we couldn't avoid,
> would be our max readahead size.
>

Sounds good.

Thanks,
Amir.
Jan Stancek March 6, 2019, 4:42 p.m. UTC | #6
On Tue, Mar 05, 2019 at 10:44:57PM +0200, Amir Goldstein wrote:
>> > > > This is certainly better than 4K, but still feels like we are not really
>> > > > testing
>> > > > the API properly, but I'm fine with this fix.
>> > > >
>> > > > However... as follow up, how about extending the new
>> > > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
>> > > > and replace validation of readahead() from get_cached_size() diff
>> > > > to tst_dev_bytes_read()?
>> > >
>> > > There is something similar based on /proc/self/io. We could try using
>> > > that to estimate max readahead size.
>> > >
>> > > Or /sys/class/block/$dev/stat as you suggested, not sure which one is
>> > > more accurate/up to date.
>> > >
>> >
>> > I believe /proc/self/io doesn't count IO performed by kernel async
>> > readahead against the process that issued the readahead, but didn't
>> > check. The test uses /proc/self/io to check how many IO where avoided
>> > by readahead...
>>
>> We could do one readahead() on entire file, then read
>> the file and see how many IO we didn't manage to avoid.
>> The difference between filesize and IO we couldn't avoid,
>> would be our max readahead size.

This also doesn't seem 100% accurate.

Any method to inspect the side effects of readahead appears to lead to more
readahead done by the kernel. E.g. sequential reads leading to more async
readahead started by kernel (which tries to stay ahead by async_size).
mmap() approach appears to fault-in with do_fault_around().

MAP_NONBLOCK is gone, mincore and pagemap doesn't help here.

I'm attaching v3, where I do reads with syscalls() in reverse order.
But occasionally, it still somehow leads to a couple of extra pages being
read to cache. So, it still over-estimates. On ppc64le, it's quite
significant, 4 extra pages in cache, each 64k, causes readahead
loop to miss ~10MB of data.

/sys/class/block/$dev/ stats appear to be increased for fs metadata
as well, which can also inflate value and we over-estimate.

I'm running out of ideas for something more accurate/stable than v2.

Regards,
Jan
From f5a0bf04cb4dd1746636702b668da7b6c9146008 Mon Sep 17 00:00:00 2001
Message-Id: <f5a0bf04cb4dd1746636702b668da7b6c9146008.1551888987.git.jstancek@redhat.com>
From: Jan Stancek <jstancek@redhat.com>
Date: Wed, 6 Mar 2019 16:52:33 +0100
Subject: [PATCH v3] syscalls/readahead02: don't use system-wide cache stats to
 estimate max readahead

Using system-wide "Cached" size is not accurate. The test is sporadically
failing with warning on ppc64le 4.18 and 5.0 kernels.

Problem is that the test over-estimates max readahead size, which then
leads to fewer readahead calls, and the kernel can silently trim the
length in each of them:
  ...
  readahead02.c:244: INFO: Test #2: POSIX_FADV_WILLNEED on file
  readahead02.c:134: INFO: creating test file of size: 67108864
  readahead02.c:263: INFO: read_testfile(0)
  readahead02.c:274: INFO: read_testfile(1)
  readahead02.c:189: INFO: max ra estimate: 12320768
  readahead02.c:198: INFO: readahead calls made: 6
  readahead02.c:204: PASS: offset is still at 0 as expected
  readahead02.c:308: INFO: read_testfile(0) took: 492486 usec
  readahead02.c:309: INFO: read_testfile(1) took: 430627 usec
  readahead02.c:311: INFO: read_testfile(0) read: 67108864 bytes
  readahead02.c:313: INFO: read_testfile(1) read: 59244544 bytes
  readahead02.c:316: PASS: readahead saved some I/O
  readahead02.c:324: INFO: cache can hold at least: 264192 kB
  readahead02.c:325: INFO: read_testfile(0) used cache: 124992 kB
  readahead02.c:326: INFO: read_testfile(1) used cache: 12032 kB
  readahead02.c:338: WARN: using less cache than expected

This patch makes following changes:
- max readahead size estimate is no longer using system-wide cache
- it is replaced with function, that makes 1 readahead on entire file,
  then tries to read it and checks /proc/self/io stats.
  The difference in read_bytes stat is the amount of IO we didn't
  manage to avoid. max readahead size is then file size minus
  IO we didn't manage to avoid.
- File reading is no longer done sequentially, because kernel has
  optimizations that lead to async readahead on offsets following
  the read.
- File reading is done directly with syscalls (without mmap), to
  try avoid kernel optimizations like do_fault_around().

Combined, these changes make the max readahead estimate more accurate,
but it's not byte-perfect every time. It still occasionally over-estimates
the maximum readahead by a small amount.

Signed-off-by: Jan Stancek <jstancek@redhat.com>
---
 testcases/kernel/syscalls/readahead/readahead02.c | 83 ++++++++++++++---------
 1 file changed, 50 insertions(+), 33 deletions(-)

diff --git a/testcases/kernel/syscalls/readahead/readahead02.c b/testcases/kernel/syscalls/readahead/readahead02.c
index 293c839e169e..89d1ca3bcd64 100644
--- a/testcases/kernel/syscalls/readahead/readahead02.c
+++ b/testcases/kernel/syscalls/readahead/readahead02.c
@@ -49,7 +49,7 @@ static int ovl_mounted;
 #define OVL_UPPER	MNTPOINT"/upper"
 #define OVL_WORK	MNTPOINT"/work"
 #define OVL_MNT		MNTPOINT"/ovl"
-#define MIN_SANE_READAHEAD (4u * 1024u)
+#define MIN_SANE_READAHEAD 4096
 
 static const char mntpoint[] = MNTPOINT;
 
@@ -145,6 +145,49 @@ static void create_testfile(int use_overlay)
 	free(tmp);
 }
 
+static void read_file_backwards(int fd, size_t fsize)
+{
+	size_t i;
+	unsigned char p;
+
+	/*
+	 * read from end to beginning, to avoid kernel optimizations
+	 * for sequential read, where it asynchronously reads ahead
+	 */
+	for (i = 0; i < fsize; i+= pagesize) {
+		SAFE_LSEEK(fd, fsize - i - 1, SEEK_SET);
+		SAFE_READ(1, fd, &p, 1);
+	}
+	SAFE_LSEEK(fd, 0, SEEK_SET);
+}
+
+/*
+ * Call readahead() on entire file and try to read it. Check how much
+ * has read IO stat increased. Difference between file size and increase
+ * in IO is our guess for maximum allowed readahead size.
+ */
+static int guess_max_ra(struct tcase *tc, int fd, size_t fsize)
+{
+	int max_ra = 0;
+	long read_bytes_start, read_bytes;
+
+	tc->readahead(fd, 0, fsize);
+
+	read_bytes_start = get_bytes_read();
+	read_file_backwards(fd, fsize);
+	read_bytes = get_bytes_read() - read_bytes_start;
+
+	max_ra = fsize - read_bytes;
+	if (max_ra < MIN_SANE_READAHEAD) {
+		tst_res(TWARN, "Failed to estimate max ra size: %d", max_ra);
+		max_ra = MIN_SANE_READAHEAD;
+	}
+	tst_res(TINFO, "max readahead size estimate: %d", max_ra);
+
+	drop_caches();
+	return max_ra;
+}
+
 /* read_testfile - mmap testfile and read every page.
  * This functions measures how many I/O and time it takes to fully
  * read contents of test file.
@@ -164,14 +207,12 @@ static int read_testfile(struct tcase *tc, int do_readahead,
 	int fd;
 	size_t i = 0;
 	long read_bytes_start;
-	unsigned char *p, tmp;
-	unsigned long cached_start, max_ra_estimate = 0;
 	off_t offset = 0;
+	int max_ra = 0;
 
 	fd = SAFE_OPEN(fname, O_RDONLY);
-
 	if (do_readahead) {
-		cached_start = get_cached_size();
+		max_ra = guess_max_ra(tc, fd, fsize);
 		do {
 			TEST(tc->readahead(fd, offset, fsize - offset));
 			if (TST_RET != 0) {
@@ -179,21 +220,8 @@ static int read_testfile(struct tcase *tc, int do_readahead,
 				return TST_ERR;
 			}
 
-			/* estimate max readahead size based on first call */
-			if (!max_ra_estimate) {
-				*cached = get_cached_size();
-				if (*cached > cached_start) {
-					max_ra_estimate = (1024 *
-						(*cached - cached_start));
-					tst_res(TINFO, "max ra estimate: %lu",
-						max_ra_estimate);
-				}
-				max_ra_estimate = MAX(max_ra_estimate,
-					MIN_SANE_READAHEAD);
-			}
-
 			i++;
-			offset += max_ra_estimate;
+			offset += max_ra;
 		} while ((size_t)offset < fsize);
 		tst_res(TINFO, "readahead calls made: %zu", i);
 		*cached = get_cached_size();
@@ -207,25 +235,14 @@ static int read_testfile(struct tcase *tc, int do_readahead,
 	}
 
 	tst_timer_start(CLOCK_MONOTONIC);
-	read_bytes_start = get_bytes_read();
 
-	p = SAFE_MMAP(NULL, fsize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
-
-	/* for old kernels, where MAP_POPULATE doesn't work, touch each page */
-	tmp = 0;
-	for (i = 0; i < fsize; i += pagesize)
-		tmp = tmp ^ p[i];
-	/* prevent gcc from optimizing out loop above */
-	if (tmp != 0)
-		tst_brk(TBROK, "This line should not be reached");
+	read_bytes_start = get_bytes_read();
+	read_file_backwards(fd, fsize);
+	*read_bytes = get_bytes_read() - read_bytes_start;
 
 	if (!do_readahead)
 		*cached = get_cached_size();
 
-	SAFE_MUNMAP(p, fsize);
-
-	*read_bytes = get_bytes_read() - read_bytes_start;
-
 	tst_timer_stop();
 	*usec = tst_timer_elapsed_us();
Amir Goldstein March 7, 2019, 6:41 a.m. UTC | #7
On Wed, Mar 6, 2019 at 6:43 PM Jan Stancek <jstancek@redhat.com> wrote:
>
> On Tue, Mar 05, 2019 at 10:44:57PM +0200, Amir Goldstein wrote:
> >> > > > This is certainly better than 4K, but still feels like we are not really
> >> > > > testing
> >> > > > the API properly, but I'm fine with this fix.
> >> > > >
> >> > > > However... as follow up, how about extending the new
> >> > > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> >> > > > and replace validation of readahead() from get_cached_size() diff
> >> > > > to tst_dev_bytes_read()?
> >> > >
> >> > > There is something similar based on /proc/self/io. We could try using
> >> > > that to estimate max readahead size.
> >> > >
> >> > > Or /sys/class/block/$dev/stat as you suggested, not sure which one is
> >> > > more accurate/up to date.
> >> > >
> >> >
> >> > I believe /proc/self/io doesn't count IO performed by kernel async
> >> > readahead against the process that issued the readahead, but didn't
> >> > check. The test uses /proc/self/io to check how many IO where avoided
> >> > by readahead...
> >>
> >> We could do one readahead() on entire file, then read
> >> the file and see how many IO we didn't manage to avoid.
> >> The difference between filesize and IO we couldn't avoid,
> >> would be our max readahead size.
>
> This also doesn't seem 100% accurate.
>
> Any method to inspect side-effect of readahead, appears to lead to more
> readahead done by kernel. E.g. sequential reads leading to more async
> readahead started by kernel (which tries to stay ahead by async_size).
> mmap() approach appears to fault-in with do_fault_around().
>
> MAP_NONBLOCK is gone, mincore and pagemap doesn't help here.
>
> I'm attaching v3, where I do reads with sycalls() in reverse order.
> But occasionally, it still somehow leads to couple extra pages being
> read to cache. So, it still over-estimates. On ppc64le, it's quite
> significant, 4 extra pages in cache, each 64k, causes readahead
> loop to miss ~10MB of data.
>
> /sys/class/block/$dev/ stats appear to be increased for fs metadata
> as well, which can also inflate value and we over-estimate.
>
> I'm running out of ideas for something more accurate/stable than v2.
>

I'm trying to understand if maybe you are running off the rails with
the estimation issue.
What the test aims to verify is that readahead prevents waiting
on IO in the future.
The max readahead size estimation is a very small part
of the test and not that significant IMO.
Why is the test not trying to readahead more than 64MB?
It's arbitrary. So my first proposal to over-estimation was
to cap upper estimation with 1MB, i.e.:

offset += MIN(max_ra_estimate, MAX_SANE_READAHEAD_ESTIMATION);

With this change we won't be testing if readahead of 64MB
in one go works anymore - it looks like getting that to work reliably
is more challenging and not sure its worth the trouble.

What you ended up implementing in v2 is not what I proposed
(you just disabled estimation altogether)
I am fine with the dynamic setting on min_sane_readahead
in v2, but it is independent of capping the upper limit for
estimation.

So what do you say about v2 + estimation (with or without reading
file backwards) and upper estimation limit?
Does it solve the sporadic failures problem?

Thanks,
Amir.
Jan Stancek March 7, 2019, 8:18 a.m. UTC | #8
----- Original Message -----
> On Wed, Mar 6, 2019 at 6:43 PM Jan Stancek <jstancek@redhat.com> wrote:
> >
> > On Tue, Mar 05, 2019 at 10:44:57PM +0200, Amir Goldstein wrote:
> > >> > > > This is certainly better than 4K, but still feels like we are not
> > >> > > > really
> > >> > > > testing
> > >> > > > the API properly, but I'm fine with this fix.
> > >> > > >
> > >> > > > However... as follow up, how about extending the new
> > >> > > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> > >> > > > and replace validation of readahead() from get_cached_size() diff
> > >> > > > to tst_dev_bytes_read()?
> > >> > >
> > >> > > There is something similar based on /proc/self/io. We could try
> > >> > > using
> > >> > > that to estimate max readahead size.
> > >> > >
> > >> > > Or /sys/class/block/$dev/stat as you suggested, not sure which one
> > >> > > is
> > >> > > more accurate/up to date.
> > >> > >
> > >> >
> > >> > I believe /proc/self/io doesn't count IO performed by kernel async
> > >> > readahead against the process that issued the readahead, but didn't
> > >> > check. The test uses /proc/self/io to check how many IO where avoided
> > >> > by readahead...
> > >>
> > >> We could do one readahead() on entire file, then read
> > >> the file and see how many IO we didn't manage to avoid.
> > >> The difference between filesize and IO we couldn't avoid,
> > >> would be our max readahead size.
> >
> > This also doesn't seem 100% accurate.
> >
> > Any method to inspect side-effect of readahead, appears to lead to more
> > readahead done by kernel. E.g. sequential reads leading to more async
> > readahead started by kernel (which tries to stay ahead by async_size).
> > mmap() approach appears to fault-in with do_fault_around().
> >
> > MAP_NONBLOCK is gone, mincore and pagemap doesn't help here.
> >
> > I'm attaching v3, where I do reads with sycalls() in reverse order.
> > But occasionally, it still somehow leads to couple extra pages being
> > read to cache. So, it still over-estimates. On ppc64le, it's quite
> > significant, 4 extra pages in cache, each 64k, causes readahead
> > loop to miss ~10MB of data.
> >
> > /sys/class/block/$dev/ stats appear to be increased for fs metadata
> > as well, which can also inflate value and we over-estimate.
> >
> > I'm running out of ideas for something more accurate/stable than v2.
> >
> 
> I'm trying to understand if maybe you are running off the rails with
> the estimation issue.
> What the test aims to verify is that readahead prevents waiting
> on IO in the future.
> The max readahead size estimation is a very small part
> of the test and not that significant IMO.
> Why is the test not trying to readahead more than 64MB?
> It's arbitrary. So my first proposal to over-estimation was
> to cap upper estimation with 1MB, i.e.:

Problem is that max allowed readahead can be smaller than 1MB,
so then we still over-estimate.

> 
> offset += MIN(max_ra_estimate, MAX_SANE_READAHEAD_ESTIMATION);
> 
> With this change we won't be testing if readahead of 64MB
> in one go works anymore - it looks like getting that to work reliably
> is more challenging and not sure its worth the trouble.
> 
> What you ended up implementing in v2 is not what I proposed
> (you just disabled estimation altogether)

Yes, v2/bdi limit is highest number I'm aware of, that should
work across all kernels without guessing.

> I am fine with the dynamic setting on min_sane_readahead
> in v2, but it is independent of capping the upper limit for
> estimation.
> 
> So what do you say about v2 + estimation (with or without reading
> file backwards) and upper estimation limit?

How could we tell if 1MB upper limit is smaller than max readahead limit?

Other option could be v3, but in steps equal to "max_ra / 2":

 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.


> Does it solve the sporadic failures problem?
> 
> Thanks,
> Amir.
>
Amir Goldstein March 7, 2019, 8:48 a.m. UTC | #9
On Thu, Mar 7, 2019 at 10:18 AM Jan Stancek <jstancek@redhat.com> wrote:
>
>
>
> ----- Original Message -----
> > On Wed, Mar 6, 2019 at 6:43 PM Jan Stancek <jstancek@redhat.com> wrote:
> > >
> > > On Tue, Mar 05, 2019 at 10:44:57PM +0200, Amir Goldstein wrote:
> > > >> > > > This is certainly better than 4K, but still feels like we are not
> > > >> > > > really
> > > >> > > > testing
> > > >> > > > the API properly, but I'm fine with this fix.
> > > >> > > >
> > > >> > > > However... as follow up, how about extending the new
> > > >> > > > tst_dev_bytes_written() utils from Sumit to cover also bytes_read
> > > >> > > > and replace validation of readahead() from get_cached_size() diff
> > > >> > > > to tst_dev_bytes_read()?
> > > >> > >
> > > >> > > There is something similar based on /proc/self/io. We could try
> > > >> > > using
> > > >> > > that to estimate max readahead size.
> > > >> > >
> > > >> > > Or /sys/class/block/$dev/stat as you suggested, not sure which one
> > > >> > > is
> > > >> > > more accurate/up to date.
> > > >> > >
> > > >> >
> > > >> > I believe /proc/self/io doesn't count IO performed by kernel async
> > > >> > readahead against the process that issued the readahead, but didn't
> > > >> > check. The test uses /proc/self/io to check how many IO where avoided
> > > >> > by readahead...
> > > >>
> > > >> We could do one readahead() on entire file, then read
> > > >> the file and see how many IO we didn't manage to avoid.
> > > >> The difference between filesize and IO we couldn't avoid,
> > > >> would be our max readahead size.
> > >
> > > This also doesn't seem 100% accurate.
> > >
> > > Any method to inspect side-effect of readahead, appears to lead to more
> > > readahead done by kernel. E.g. sequential reads leading to more async
> > > readahead started by kernel (which tries to stay ahead by async_size).
> > > mmap() approach appears to fault-in with do_fault_around().
> > >
> > > MAP_NONBLOCK is gone, mincore and pagemap doesn't help here.
> > >
> > > I'm attaching v3, where I do reads with sycalls() in reverse order.
> > > But occasionally, it still somehow leads to couple extra pages being
> > > read to cache. So, it still over-estimates. On ppc64le, it's quite
> > > significant, 4 extra pages in cache, each 64k, causes readahead
> > > loop to miss ~10MB of data.
> > >
> > > /sys/class/block/$dev/ stats appear to be increased for fs metadata
> > > as well, which can also inflate value and we over-estimate.
> > >
> > > I'm running out of ideas for something more accurate/stable than v2.
> > >
> >
> > I'm trying to understand if maybe you are running off the rails with
> > the estimation issue.
> > What the test aims to verify is that readahead prevents waiting
> > on IO in the future.
> > The max readahead size estimation is a very small part
> > of the test and not that significant IMO.
> > Why is the test not trying to readahead more than 64MB?
> > It's arbitrary. So my first proposal to over-estimation was
> > to cap upper estimation with 1MB, i.e.:
>
> Problem is that max allowed readahead can be smaller than 1MB,
> so then we still over-estimate.
>
> >
> > offset += MIN(max_ra_estimate, MAX_SANE_READAHEAD_ESTIMATION);
> >
> > With this change we won't be testing if readahead of 64MB
> > in one go works anymore - it looks like getting that to work reliably
> > is more challenging and not sure its worth the trouble.
> >
> > What you ended up implementing in v2 is not what I proposed
> > (you just disabled estimation altogether)
>
> Yes, v2/bdi limit is highest number I'm aware of, that should
> work across all kernels without guessing.
>
> > I am fine with the dynamic setting on min_sane_readahead
> > in v2, but it is independent of capping the upper limit for
> > estimation.
> >
> > So what do you say about v2 + estimation (with or without reading
> > file backwards) and upper estimation limit?
>
> How could we tell if 1MB upper limit is smaller than max readahead limit?
>

Try to set bdi limit of test device on setup() to testfile_size before reading
back the value?
If that fails try testfile_size / 2 etc.
Use the value that bdi limit ended up with.

> Other option could be v3, but in steps equal to "max_ra / 2":
>
>  *                        |<----- async_size ---------|
>  *     |------------------- size -------------------->|
>  *     |==================#===========================|
>  *     ^start             ^page marked with PG_readahead
>  *
>  * To overlap application thinking time and disk I/O time, we do
>  * `readahead pipelining': Do not wait until the application consumed all
>  * readahead pages and stalled on the missing page at readahead_index;
>  * Instead, submit an asynchronous readahead I/O as soon as there are
>  * only async_size pages left in the readahead window. Normally async_size
>  * will be equal to size, for maximum pipelining.
>
>

OK, if it works, but v2 + raising bdi limit sounds simpler,
so if that works I think that it is better to minimize speculative code
in the test,

Thanks,
Amir.
Jan Stancek March 7, 2019, 9:15 a.m. UTC | #10
----- Original Message -----
> On Thu, Mar 7, 2019 at 10:18 AM Jan Stancek <jstancek@redhat.com> wrote:
> >
> >
> >
> > ----- Original Message -----
> > > On Wed, Mar 6, 2019 at 6:43 PM Jan Stancek <jstancek@redhat.com> wrote:
> > > >
> > > > On Tue, Mar 05, 2019 at 10:44:57PM +0200, Amir Goldstein wrote:
> > > > >> > > > This is certainly better than 4K, but still feels like we are
> > > > >> > > > not
> > > > >> > > > really
> > > > >> > > > testing
> > > > >> > > > the API properly, but I'm fine with this fix.
> > > > >> > > >
> > > > >> > > > However... as follow up, how about extending the new
> > > > >> > > > tst_dev_bytes_written() utils from Sumit to cover also
> > > > >> > > > bytes_read
> > > > >> > > > and replace validation of readahead() from get_cached_size()
> > > > >> > > > diff
> > > > >> > > > to tst_dev_bytes_read()?
> > > > >> > >
> > > > >> > > There is something similar based on /proc/self/io. We could try
> > > > >> > > using
> > > > >> > > that to estimate max readahead size.
> > > > >> > >
> > > > >> > > Or /sys/class/block/$dev/stat as you suggested, not sure which
> > > > >> > > one
> > > > >> > > is
> > > > >> > > more accurate/up to date.
> > > > >> > >
> > > > >> >
> > > > >> > I believe /proc/self/io doesn't count IO performed by kernel async
> > > > >> > readahead against the process that issued the readahead, but
> > > > >> > didn't
> > > > >> > check. The test uses /proc/self/io to check how many IO where
> > > > >> > avoided
> > > > >> > by readahead...
> > > > >>
> > > > >> We could do one readahead() on entire file, then read
> > > > >> the file and see how many IO we didn't manage to avoid.
> > > > >> The difference between filesize and IO we couldn't avoid,
> > > > >> would be our max readahead size.
> > > >
> > > > This also doesn't seem 100% accurate.
> > > >
> > > > Any method to inspect side-effect of readahead, appears to lead to more
> > > > readahead done by kernel. E.g. sequential reads leading to more async
> > > > readahead started by kernel (which tries to stay ahead by async_size).
> > > > mmap() approach appears to fault-in with do_fault_around().
> > > >
> > > > MAP_NONBLOCK is gone, mincore and pagemap doesn't help here.
> > > >
> > > > I'm attaching v3, where I do reads with sycalls() in reverse order.
> > > > But occasionally, it still somehow leads to couple extra pages being
> > > > read to cache. So, it still over-estimates. On ppc64le, it's quite
> > > > significant, 4 extra pages in cache, each 64k, causes readahead
> > > > loop to miss ~10MB of data.
> > > >
> > > > /sys/class/block/$dev/ stats appear to be increased for fs metadata
> > > > as well, which can also inflate value and we over-estimate.
> > > >
> > > > I'm running out of ideas for something more accurate/stable than v2.
> > > >
> > >
> > > I'm trying to understand if maybe you are running off the rails with
> > > the estimation issue.
> > > What the test aims to verify is that readahead prevents waiting
> > > on IO in the future.
> > > The max readahead size estimation is a very small part
> > > of the test and not that significant IMO.
> > > Why is the test not trying to readahead more than 64MB?
> > > It's arbitrary. So my first proposal to over-estimation was
> > > to cap upper estimation with 1MB, i.e.:
> >
> > Problem is that max allowed readahead can be smaller than 1MB,
> > so then we still over-estimate.
> >
> > >
> > > offset += MIN(max_ra_estimate, MAX_SANE_READAHEAD_ESTIMATION);
> > >
> > > With this change we won't be testing if readahead of 64MB
> > > in one go works anymore - it looks like getting that to work reliably
> > > is more challenging and not sure its worth the trouble.
> > >
> > > What you ended up implementing in v2 is not what I proposed
> > > (you just disabled estimation altogether)
> >
> > Yes, v2/bdi limit is highest number I'm aware of, that should
> > work across all kernels without guessing.
> >
> > > I am fine with the dynamic setting on min_sane_readahead
> > > in v2, but it is independent of capping the upper limit for
> > > estimation.
> > >
> > > So what do you say about v2 + estimation (with or without reading
> > > file backwards) and upper estimation limit?
> >
> > How could we tell if 1MB upper limit is smaller than max readahead limit?
> >
> 
> Try to set bdi limit of test device on setup() to testfile_size before
> reading
> back the value?
> If that fails try testfile_size / 2 etc.

Maybe, but we would need to start lower (2M). Kernels prior to
commit 600e19afc5f8a6c18ea49cee9511c5797db02391 will just ignore it.

I can give it a try.

> Use the value that bdi limit ended up with.
> 
> > Other option could be v3, but in steps equal to "max_ra / 2":
> >
> >  *                        |<----- async_size ---------|
> >  *     |------------------- size -------------------->|
> >  *     |==================#===========================|
> >  *     ^start             ^page marked with PG_readahead
> >  *
> >  * To overlap application thinking time and disk I/O time, we do
> >  * `readahead pipelining': Do not wait until the application consumed all
> >  * readahead pages and stalled on the missing page at readahead_index;
> >  * Instead, submit an asynchronous readahead I/O as soon as there are
> >  * only async_size pages left in the readahead window. Normally async_size
> >  * will be equal to size, for maximum pipelining.
> >
> >
> 
> OK, if it works, but v2 + raising bdi limit sounds simpler,
> so if that works
> I think that it is better to minimize speculative code
> in the test,

readahead has been tweaked a lot throughout the years,
so it all feels like speculation :-/

> 
> Thanks,
> Amir.
>
Amir Goldstein March 7, 2019, 9:53 a.m. UTC | #11
> > Try to set bdi limit of test device on setup() to testfile_size before
> > reading
> > back the value?
> > If that fails try testfile_size / 2 etc.
>
> Maybe, but we would need to start lower (2M). Kernels prior to
> commit 600e19afc5f8a6c18ea49cee9511c5797db02391 will just ignore it.
>

Ah. Thanks for the reference.
But kernels prior to this commit SHOULD fail the test if they
don't comply with explicit readahead() request that is below 2M
and bdi limit. That is the case that the commit sets out to fix. No?

Thanks,
Amir.
Jan Stancek March 7, 2019, 11:25 a.m. UTC | #12
----- Original Message -----
> > > Try to set bdi limit of test device on setup() to testfile_size before
> > > reading
> > > back the value?
> > > If that fails try testfile_size / 2 etc.
> >
> > Maybe, but we would need to start lower (2M). Kernels prior to
> > commit 600e19afc5f8a6c18ea49cee9511c5797db02391 will just ignore it.
> >
> 
> Ah. Thanks for the reference.
> But kernels prior to this commit SHOULD fail the test if they
> don't comply with explicit readahead() request that is below 2M

Yes, agreed.

> and bdi limit.

What I meant was that we can try bumping the bdi limit, but our
max readahead length has to be capped to MIN(bdi limit, 2M).

Does that make sense?

> That is the case that the commit sets to fix. No?
> 
> Thanks,
> Amir.
>
Amir Goldstein March 7, 2019, 11:49 a.m. UTC | #13
On Thu, Mar 7, 2019 at 1:25 PM Jan Stancek <jstancek@redhat.com> wrote:
>
>
> ----- Original Message -----
> > > > Try to set bdi limit of test device on setup() to testfile_size before
> > > > reading
> > > > back the value?
> > > > If that fails try testfile_size / 2 etc.
> > >
> > > Maybe, but we would need to start lower (2M). Kernels prior to
> > > commit 600e19afc5f8a6c18ea49cee9511c5797db02391 will just ignore it.
> > >
> >
> > Ah. Thanks for the reference.
> > But kernels prior to this commit SHOULD fail the test if they
> > don't comply with explicit readahead() request that is below 2M
>
> Yes, agreed.
>
> > and bdi limit.
>
> What I meant was, that we can try bumping bdi limit, but our
> max readahead length has to be capped to MIN(bdi limit, 2M).
>
> Does that make sense?
>

It does to me.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/testcases/kernel/syscalls/readahead/readahead02.c b/testcases/kernel/syscalls/readahead/readahead02.c
index 293c839e169e..c4c90a51c7de 100644
--- a/testcases/kernel/syscalls/readahead/readahead02.c
+++ b/testcases/kernel/syscalls/readahead/readahead02.c
@@ -49,7 +49,7 @@  static int ovl_mounted;
 #define OVL_UPPER	MNTPOINT"/upper"
 #define OVL_WORK	MNTPOINT"/work"
 #define OVL_MNT		MNTPOINT"/ovl"
-#define MIN_SANE_READAHEAD (4u * 1024u)
+int min_sane_readahead  = 4096;
 
 static const char mntpoint[] = MNTPOINT;
 
@@ -165,13 +165,11 @@  static int read_testfile(struct tcase *tc, int do_readahead,
 	size_t i = 0;
 	long read_bytes_start;
 	unsigned char *p, tmp;
-	unsigned long cached_start, max_ra_estimate = 0;
 	off_t offset = 0;
 
 	fd = SAFE_OPEN(fname, O_RDONLY);
 
 	if (do_readahead) {
-		cached_start = get_cached_size();
 		do {
 			TEST(tc->readahead(fd, offset, fsize - offset));
 			if (TST_RET != 0) {
@@ -179,21 +177,8 @@  static int read_testfile(struct tcase *tc, int do_readahead,
 				return TST_ERR;
 			}
 
-			/* estimate max readahead size based on first call */
-			if (!max_ra_estimate) {
-				*cached = get_cached_size();
-				if (*cached > cached_start) {
-					max_ra_estimate = (1024 *
-						(*cached - cached_start));
-					tst_res(TINFO, "max ra estimate: %lu",
-						max_ra_estimate);
-				}
-				max_ra_estimate = MAX(max_ra_estimate,
-					MIN_SANE_READAHEAD);
-			}
-
 			i++;
-			offset += max_ra_estimate;
+			offset += min_sane_readahead;
 		} while ((size_t)offset < fsize);
 		tst_res(TINFO, "readahead calls made: %zu", i);
 		*cached = get_cached_size();
@@ -366,12 +351,31 @@  static void setup_overlay(void)
 
 static void setup(void)
 {
+	struct stat sbuf;
+	char tmp[PATH_MAX], sys_ra_path[PATH_MAX], *backing_dev;
+	int read_ahead_kb;
+
 	if (opt_fsizestr)
 		testfile_size = SAFE_STRTOL(opt_fsizestr, 1, INT_MAX);
 
 	if (access(PROC_IO_FNAME, F_OK))
 		tst_brk(TCONF, "Requires " PROC_IO_FNAME);
 
+	/* Use backing device read_ahead_kb limit as min_sane_readahead */
+	SAFE_LSTAT(tst_device->dev, &sbuf);
+	if (S_ISLNK(sbuf.st_mode))
+		SAFE_READLINK(tst_device->dev, tmp, PATH_MAX);
+	else
+		strcpy(tmp, tst_device->dev);
+	backing_dev = basename(tmp);
+	sprintf(sys_ra_path, "/sys/class/block/%s/bdi/read_ahead_kb", backing_dev);
+	if (access(sys_ra_path, F_OK) == 0) {
+		SAFE_FILE_SCANF(sys_ra_path, "%d", &read_ahead_kb);
+		if (read_ahead_kb * 1024 > min_sane_readahead)
+			min_sane_readahead = read_ahead_kb * 1024;
+	}
+	tst_res(TINFO, "min_sane_readahead: %u", min_sane_readahead);
+
 	has_file(DROP_CACHES_FNAME, 1);
 	has_file(MEMINFO_FNAME, 1);