diff mbox

[1/3,v3] fs: add SEEK_HOLE and SEEK_DATA flags

Message ID 1304626619-1588-1-git-send-email-josef@redhat.com
State Not Applicable, archived
Headers show

Commit Message

Josef Bacik May 5, 2011, 8:16 p.m. UTC
This just gets us ready to support the SEEK_HOLE and SEEK_DATA flags.  Turns out
using fiemap in things like cp cause more problems than it solves, so lets try
and give userspace an interface that doesn't suck.  So we have

-SEEK_HOLE: move the file position to the start of the next hole greater than or
equal to the start of the suplied offset.  This is how solaris defines it, so
thats how it will work for us.  The only trick is preallocated space.  Some
filesystems will be able to safely differentiate between prealloced space and
soon to be converted prealloced space, so preallocated space could easily be
considered a hole.  At the same time some file systems will not be able to make
this differentiation and so for safety will not treat preallocated space as a
hole.

-SEEK_DATA: this is obviously a little more self-explanatory.  Again the only
ambiguity comes in with preallocated extents.  If you have an fs that can't
reliably tell that the preallocated extent is in the process of turning into a
real extent, it is correct for SEEK_DATA to park you at a preallocated extent.

In the generic case we will just assume the entire file is data and there is a
virtual hole at i_size, so SEEK_DATA will return -ENXIO unless you provide an
offset of 0 and the file size is larger than 0, and SEEK_HOLE will put you at
i_size unless pos is larger or equal to i_size.

Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
v2->v3:
-Fixed i_mutex unlock screw up
-Just use inode->i_size
-Make the comments clear that we mean the next hole
 fs/read_write.c    |   18 ++++++++++++++++++
 include/linux/fs.h |    4 +++-
 2 files changed, 21 insertions(+), 1 deletions(-)

Comments

Josef Bacik May 5, 2011, 8:16 p.m. UTC | #1
This is my very rough tester for testing seek_hole/seek_data.  Please look over
it and make sure we all agree that the semantics are correct.  My btrfs patch
passes with this and ext3 passes as well.  I still have to added fallocate() to
it, but for now this seems to cover most of the corner cases.  Thanks,

Josef

#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SEEK_HOLE	3
#define SEEK_DATA	4

#define ERROR(str)	\
	fprintf(stderr, "%s: pos=%lu, errno=%d\n", str, pos, errno)

static int reset_file(int fd)
{
	int ret;

	ret = ftruncate(fd, 0);
	if (ret < 0) {
		fprintf(stderr, "Truncate failed: %d\n", errno);
		return 1;
	}

	return 0;
}

int main(int argc, char **argv)
{
	char buf[4096 * 4];
	ssize_t bytes;
	off_t pos;
	int prealloc_is_hole = 0;
	int whole_file_is_data = 0;
	int ret;
	int i;
	int fd;

	fd = open("testfile", O_RDWR|O_CREAT|O_TRUNC, 0644);
	if (fd < 0) {
		fprintf(stderr, "Failed to open testfile: %d\n", errno);
		return 1;
	}

	/* Empty file */
	printf("Testing an empty file\n");
	pos = lseek(fd, 0, SEEK_DATA);
	if (pos != -1) {
		if (errno == EINVAL) {
			fprintf(stderr, "Kernel does not support seek "
				"hole/data\n");
			close(fd);
			return 1;
		}
		if (errno != ENXIO)
			ERROR("Seek data did not return a proper error");
		close(fd);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_HOLE);
	if (pos != -1 && errno != ENXIO) {
		ERROR("Seek hole did not return a proper error");
		close(fd);
		return 1;
	}

	memset(&buf, 'a', 4096 * 4);
	/*
	 * All data file
	 */
	printf("Testing a normal data filled file\n");
	for (i = 0; i < 4; i++) {
		bytes = write(fd, &buf, 4096);
		if (bytes < 4096) {
			fprintf(stderr, "Failed to write to testfile: %d\n",
				errno);
			close(fd);
			return 1;
		}
	}

	pos = lseek(fd, 0, SEEK_HOLE);
	if (pos != (4096 * 4) || pos == -1) {
		ERROR("Seek hole failed to dump us out at the end of the file");
		close(fd);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_DATA);
	if (pos != 0) {
		ERROR("Seek data failed to dump us out at the beginning of the"
		      " file");
		close(fd);
		return 1;
	}

	/*
	 * File with a hole at the front and data at the end
	 */
	printf("Testing file with hole at the start and data in the rest\n");
	if (reset_file(fd)) {
		close(fd);
		return 1;
	}

	bytes = pwrite(fd, &buf, 4096 * 3, 4096);
	if (bytes < (4096 * 3)) {
		fprintf(stderr, "Failed to write to testfile: %d\n");
		close(fd);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_HOLE);
	if (pos != 0 && pos != (4096 * 4)) {
		ERROR("Seek hole failed to return 0");
		close(fd);
		return 1;
	} else if (pos == (4096 * 4)) {
		whole_file_is_data = 1;
		printf("Current file system views treats the entire file as "
		       "data\n");
	}

	pos = lseek(fd, 0, SEEK_DATA);
	if (pos != 4096 && (pos != 0 && whole_file_is_data)) {
		if (whole_file_is_data)
			ERROR("Seek data failed to return 0");
		else
			ERROR("Seek data failed to return 4096");
		close(fd);
		return 1;
	}

	if (whole_file_is_data) {
		pos = lseek(fd, 1, SEEK_DATA);
		if (pos != -1 && errno != ENXIO) {
			ERROR("Seek data failed to retun an error");
			close(fd);
			return 1;
		}
	}
	/*
	 * File with a hole at the end and data at the beginning
	 */
	printf("Testing file with hole at the end and data at the beginning\n");
	if (reset_file(fd)) {
		close(fd);
		return 1;
	}

	ret = ftruncate(fd, 4096 * 4);
	if (ret < 0) {
		fprintf(stderr, "Truncate failed: %d\n", errno);
		close(fd);
		return 1;
	}

	pwrite(fd, &buf, 4096 * 3, 0);
	if (bytes < (4096 * 3)) {
		fprintf(stderr, "Failed to write to testfile: %d\n", errno);
		close(fd);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_HOLE);
	if (pos != (4096 * 3) && (pos != (4096 * 4) && whole_file_is_data)) {
		ERROR("Seeking hole didn't work right");
		close(fd);
		return 1;
	}

	if (whole_file_is_data) {
		pos = lseek(fd, pos, SEEK_HOLE);
		if (pos != -1 && errno != ENXIO) {
			ERROR("Seeking hole didn't return error");
			close(fd);
			return 1;
		}
		printf("No more tests to run since we treat the whole file as "
		       "data\n");
		goto out;
	}

	pos = lseek(fd, pos, SEEK_HOLE);
	if (pos != (4096 * 3)) {
		ERROR("Seek hole didn't return same position");
		close(fd);
		return 1;
	}

	pos = lseek(fd, pos+1, SEEK_HOLE);
	if (pos != (4096 * 4)) {
		ERROR("Seek hole didn't return the end of the file");
		close(fd);
		return 1;
	}

	pos = lseek(fd, pos, SEEK_DATA);
	if (pos != -1 && errno != ENXIO) {
		ERROR("Seek data didn't return ENXIO");
		close(fd);
		return 1;
	}

	/*
	 * Hole - Data - Hole - Data file
	 */
	printf("Testing file [Hole][Data][Hole][Data]\n");
	if (reset_file(fd)) {
		close(fd);
		return 1;
	}

	ret = ftruncate(fd, 4096 * 4);
	if (ret < 0) {
		fprintf(stderr, "ftruncate failed: %d\n", errno);
		close(fd);
		return 1;
	}

	bytes = pwrite(fd, &buf, 4096, 4096);
	if (bytes < 4096) {
		fprintf(stderr, "Failed to write: %d\n", errno);
		close(fd);
		return 1;
	}

	bytes = pwrite(fd, &buf, 4096, 4096 * 3);
	if (bytes < 4096) {
		fprintf(stderr, "Failed to write: %d\n", errno);
		close(fd);
		return 1;
	}

	pos = lseek(fd, 0, SEEK_DATA);
	if (pos != 4096) {
		ERROR("Seek data did not return 4096");
		close(fd);
		return 1;
	}

	pos = lseek(fd, pos, SEEK_HOLE);
	if (pos != (4096 * 2)) {
		ERROR("Seek hole did not return 4096*2");
		close(fd);
		return 1;
	}

	pos = lseek(fd, pos, SEEK_DATA);
	if (pos != (4096 * 3)) {
		ERROR("Seek data did not return 4096*3");
		close(fd);
		return 1;
	}
out:
	close(fd);
	return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sunil Mushran May 13, 2011, 11:46 p.m. UTC | #2
On 05/05/2011 01:16 PM, Josef Bacik wrote:
> This is my very rough tester for testing seek_hole/seek_data.  Please look over
> it and make sure we all agree that the semantics are correct.  My btrfs patch
> passes with this and ext3 passes as well.  I still have to added fallocate() to
> it, but for now this seems to cover most of the corner cases.  Thanks,

I am assuming that our aim is to be fully compatible with zfs.

I tried running the test on it and it failed. One reason was
that the default allocation size on zfs is 128K. The test assumes
4K. The other was our understanding of the various corner cases.
And lastly, the values for SEEK_DATA and SEEK_HOLE are 3 and 4
respectively. Not vice-versa.

So I enhanced the test a bit and have it running on zfs. If someone
else can, please do verify my results.

BTW, This test also does not touch fallocate.

http://oss.oracle.com/~smushran/seek_data/seek_test.c

On zfs:
# ./seek_test
Allocation size: 131072
01. Test basic support                                  SUCC
02. Test an empty file                                  SUCC
03. Test a full file                                    SUCC
04. Test file hole at beg, data at end                  SUCC
05. Test file data at beg, hole at end                  SUCC
06. Test file hole data hole data                       SUCC


On ext4:
# ./seek_test
Allocation size: 4096
01. Test basic support                                	SUCC
02. Test an empty file                                	SUCC
   ERROR in Test 3.4: POS expected 1, got -1
   ERROR in Test 3.6: POS expected 4195, got -1
03. Test a full file                                  	FAIL
   ERROR in Test 4.1: POS expected 0, got 8196
   ERROR in Test 4.2: POS expected 1, got 8196
   ERROR in Test 4.3: POS expected 8192, got 0
   ERROR in Test 4.4: POS expected 8192, got -1
   ERROR in Test 4.5: POS expected 8191, got 8196
   ERROR in Test 4.6: POS expected 8192, got -1
   ERROR in Test 4.8: POS expected 8192, got -1
   ERROR in Test 4.10: POS expected 8193, got -1
   ERROR in Test 4.12: POS expected 8195, got -1
04. Test file hole at beg, data at end                	FAIL
   ERROR in Test 5.1: POS expected 4096, got 16384
   ERROR in Test 5.2: POS expected 4096, got 16384
   ERROR in Test 5.4: POS expected 1, got -1
   ERROR in Test 5.5: POS expected 4096, got 16384
   ERROR in Test 5.6: POS expected 4095, got -1
   ERROR in Test 5.7: POS expected 4096, got 16384
   ERROR in Test 5.9: POS expected 4097, got 16384
   ERROR in Test 5.11: POS expected 16383, got 16384
05. Test file data at beg, hole at end                	FAIL
   ERROR in Test 6.1: POS expected 0, got 16384
   ERROR in Test 6.2: POS expected 1, got 16384
   ERROR in Test 6.3: POS expected 4096, got 0
   ERROR in Test 6.4: POS expected 4096, got -1
   ERROR in Test 6.5: POS expected 4095, got 16384
   ERROR in Test 6.6: POS expected 4096, got -1
   ERROR in Test 6.7: POS expected 8192, got 16384
   ERROR in Test 6.8: POS expected 4096, got -1
   ERROR in Test 6.9: POS expected 8192, got 16384
   ERROR in Test 6.10: POS expected 4097, got -1
   ERROR in Test 6.11: POS expected 8192, got 16384
   ERROR in Test 6.12: POS expected 8191, got -1
   ERROR in Test 6.13: POS expected 8192, got 16384
   ERROR in Test 6.14: POS expected 12288, got -1
   ERROR in Test 6.15: POS expected 8193, got 16384
   ERROR in Test 6.16: POS expected 12288, got -1
   ERROR in Test 6.17: POS expected 12287, got 16384
   ERROR in Test 6.18: POS expected 12288, got -1
   ERROR in Test 6.20: POS expected 12288, got -1
   ERROR in Test 6.22: POS expected 12289, got -1
   ERROR in Test 6.24: POS expected 16383, got -1
06. Test file hole data hole data                     	FAIL


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index 5520f8a..af9cc51 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -64,6 +64,24 @@  generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 			return file->f_pos;
 		offset += file->f_pos;
 		break;
+	case SEEK_DATA:
+		/*
+		 * In the generic case the entire file is data, so data only
+		 * starts at position 0 provided the file has an i_size,
+		 * otherwise it's an empty file and will always be ENXIO.
+		 */
+		if (offset != 0 || inode->i_size == 0)
+			return -ENXIO;
+		break;
+	case SEEK_HOLE:
+		/*
+		 * There is a virtual hole at the end of the file, so as long as
+		 * offset isn't i_size or larger, return i_size.
+		 */
+		if (offset >= inode->i_size)
+			return -ENXIO;
+		offset = inode->i_size;
+		break;
 	}
 
 	if (offset < 0 && !unsigned_offsets(file))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dbd860a..185b278 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -31,7 +31,9 @@ 
 #define SEEK_SET	0	/* seek relative to beginning of file */
 #define SEEK_CUR	1	/* seek relative to current file position */
 #define SEEK_END	2	/* seek relative to end of file */
-#define SEEK_MAX	SEEK_END
+#define SEEK_HOLE	3	/* seek to the next hole */
+#define SEEK_DATA	4	/* seek to the next data */
+#define SEEK_MAX	SEEK_DATA
 
 struct fstrim_range {
 	__u64 start;