diff mbox series

libfs: Fix DIO mode aligment

Message ID 20201023112659.1559-1-artem.blagodarenko@gmail.com
State New
Headers show
Series libfs: Fix DIO mode aligment | expand

Commit Message

Благодаренко Артём Oct. 23, 2020, 11:26 a.m. UTC
From: Alexey Lyashkov <alexey.lyashkov@hpe.com>

Bounce buffer must be extended to hold a whole disk block size.
Read/write operations with unaligned offsets is prohobined
in the DIO mode, so read/write blocks should be adjusted to
reflect it.

Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
HPE-bug-id: LUS-9241
---
 lib/ext2fs/io_manager.c |   5 +-
 lib/ext2fs/unix_io.c    | 296 +++++++++++++++++++++++++---------------
 2 files changed, 190 insertions(+), 111 deletions(-)

Comments

Благодаренко Артём Nov. 17, 2020, 3:30 p.m. UTC | #1
Hello,

Any thoughts about this change? Thanks.

Best regards,
Artem Blagodarenko.

> On 23 Oct 2020, at 14:26, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> From: Alexey Lyashkov <alexey.lyashkov@hpe.com>
> 
> Bounce buffer must be extended to hold a whole disk block size.
> Read/write operations with unaligned offsets is prohobined
> in the DIO mode, so read/write blocks should be adjusted to
> reflect it.
> 
> Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
> Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
> HPE-bug-id: LUS-9241
> ---
> lib/ext2fs/io_manager.c |   5 +-
> lib/ext2fs/unix_io.c    | 296 +++++++++++++++++++++++++---------------
> 2 files changed, 190 insertions(+), 111 deletions(-)
> 
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index c395d615..84399a12 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -20,6 +20,9 @@
> #include "ext2_fs.h"
> #include "ext2fs.h"
> 
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +
> +
> errcode_t io_channel_set_options(io_channel channel, const char *opts)
> {
> 	errcode_t retval = 0;
> @@ -128,7 +131,7 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> 	size_t	size;
> 
> 	if (count == 0)
> -		size = io->block_size;
> +		size = max(io->block_size, io->align);
> 	else if (count > 0)
> 		size = io->block_size * count;
> 	else
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 628e60c3..53647a22 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -154,46 +154,30 @@ static char *safe_getenv(const char *arg)
> /*
>  * Here are the raw I/O functions
>  */
> -static errcode_t raw_read_blk(io_channel channel,
> +static errcode_t raw_aligned_read_blk(io_channel channel,
> 			      struct unix_private_data *data,
> -			      unsigned long long block,
> -			      int count, void *bufv)
> +			      ext2_loff_t location,
> +			      ssize_t size, void *bufv,
> +			      int *asize)
> {
> -	errcode_t	retval;
> -	ssize_t		size;
> -	ext2_loff_t	location;
> 	int		actual = 0;
> +	errcode_t	retval;
> 	unsigned char	*buf = bufv;
> -	ssize_t		really_read = 0;
> -
> -	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
> -	data->io_stats.bytes_read += size;
> -	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> -	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -			goto error_out;
> -		}
> -		goto bounce_read;
> -	}
> +#ifdef ALIGN_DEBUG
> +	printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
> +		location, (unsigned long) size);
> +#endif
> 
> #ifdef HAVE_PREAD64
> 	/* Try an aligned pread */
> -	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> -	     IS_ALIGNED(size, channel->align))) {
> -		actual = pread64(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -		actual = 0;
> -	}
> +	actual = pread64(data->dev, buf, size, location);
> +	if (actual == size)
> +		return 0;
> +	actual = 0;
> #elif HAVE_PREAD
> 	/* Try an aligned pread */
> -	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> -	    ((channel->align == 0) ||
> -	     (IS_ALIGNED(buf, channel->align) &&
> -	      IS_ALIGNED(size, channel->align)))) {
> +	if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
> 		actual = pread(data->dev, buf, size, location);
> 		if (actual == size)
> 			return 0;
> @@ -205,47 +189,100 @@ static errcode_t raw_read_blk(io_channel channel,
> 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 		goto error_out;
> 	}
> +
> +	actual = read(data->dev, buf, size);
> +	if (actual != size) {
> +		if (actual < 0) {
> +			retval = errno;
> +			actual = 0;
> +		} else {
> +			retval = EXT2_ET_SHORT_READ;
> +		}
> +	}
> +
> +error_out:
> +	*asize = actual;
> +	return retval;
> +}
> +
> +
> +/*
> + * Here are the raw I/O functions
> + */
> +static errcode_t raw_read_blk(io_channel channel,
> +			      struct unix_private_data *data,
> +			      unsigned long long block,
> +			      int count, void *bufv)
> +{
> +	errcode_t	retval;
> +	ssize_t		size;
> +	ext2_loff_t	location;
> +	int		actual = 0;
> +	unsigned char	*buf = bufv;
> +	unsigned int	align = channel->align;
> +	ssize_t		really_read = 0;
> +	blk64_t		blk;
> +	loff_t		offset;
> +
> +	size = (count < 0) ? -count : count * channel->block_size;
> +	data->io_stats.bytes_read += size;
> +	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> +
> +	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> +		align = channel->block_size;
> +		goto bounce_read;
> +	}
> +
> 	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> +	    (IS_ALIGNED(location, channel->align) &&
> +	     IS_ALIGNED(buf, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> -		actual = read(data->dev, buf, size);
> -		if (actual != size) {
> -		short_read:
> -			if (actual < 0) {
> -				retval = errno;
> -				actual = 0;
> -			} else
> -				retval = EXT2_ET_SHORT_READ;
> +		retval = raw_aligned_read_blk(channel, data, location,
> +						size, bufv, &actual);
> +		if (retval != 0)
> 			goto error_out;
> -		}
> -		return 0;
> +
> +		return retval;
> 	}
> 
> +bounce_read:
> #ifdef ALIGN_DEBUG
> -	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
> -	       (unsigned long) size);
> +	printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
> +		location, (unsigned long) size);
> #endif
> -
> 	/*
> 	 * The buffer or size which we're trying to read isn't aligned
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> +	 * read / write must be aligned to the block device sector size
> 	 */
> -bounce_read:
> +
> +	blk = location / align;
> +	offset = location % align;
> +
> +	if (lseek(data->dev, blk * align, SEEK_SET) < 0)
> +		return errno;
> +
> 	while (size > 0) {
> -		actual = read(data->dev, data->bounce, channel->block_size);
> -		if (actual != channel->block_size) {
> +		actual = read(data->dev, data->bounce, align);
> +		if (actual != align) {
> 			actual = really_read;
> 			buf -= really_read;
> 			size += really_read;
> -			goto short_read;
> +			retval = EXT2_ET_SHORT_READ;
> +			goto error_out;
> 		}
> 		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(buf, data->bounce, actual);
> +		if ((actual + offset) > align)
> +			actual = align - offset;
> +		if (actual > size)
> +			actual = size;
> +
> +		memcpy(buf, data->bounce + offset, actual);
> 		really_read += actual;
> 		size -= actual;
> 		buf += actual;
> +		offset = 0;
> +		blk++;
> 	}
> 	return 0;
> 
> @@ -258,6 +295,58 @@ error_out:
> 	return retval;
> }
> 
> +static errcode_t raw_aligned_write_blk(io_channel channel,
> +			       struct unix_private_data *data,
> +			       ext2_loff_t  location,
> +			       ssize_t size, const void *bufv,
> +			       int *asize)
> +
> +{
> +	int		actual = 0;
> +	errcode_t	retval;
> +	const unsigned char *buf = (void *)bufv;
> +
> +#ifdef ALIGN_DEBUG
> +	printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
> +	       location, (unsigned long) size);
> +#endif
> +
> +#ifdef HAVE_PWRITE64
> +	/* Try an aligned pwrite */
> +	actual = pwrite64(data->dev, buf, size, location);
> +	if (actual == size)
> +		return 0;
> +#elif HAVE_PWRITE
> +	/* Try an aligned pwrite */
> +	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
> +		actual = pwrite(data->dev, buf, size, location);
> +		if (actual == size)
> +			return 0;
> +	}
> +#endif /* HAVE_PWRITE */
> +
> +	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +		goto error_out;
> +	}
> +
> +	actual = write(data->dev, buf, size);
> +	if (actual < 0) {
> +		retval = errno;
> +		goto error_out;
> +	}
> +	if (actual != size) {
> +		retval = EXT2_ET_SHORT_WRITE;
> +		goto error_out;
> +	}
> +	retval = 0;
> +error_out:
> +	*asize = actual;
> +	return retval;
> +
> +}
> +
> +
> static errcode_t raw_write_blk(io_channel channel,
> 			       struct unix_private_data *data,
> 			       unsigned long long block,
> @@ -268,6 +357,9 @@ static errcode_t raw_write_blk(io_channel channel,
> 	int		actual = 0;
> 	errcode_t	retval;
> 	const unsigned char *buf = bufv;
> +	unsigned int	align = channel->align;
> +	blk64_t		blk;
> +	loff_t		offset;
> 
> 	if (count == 1)
> 		size = channel->block_size;
> @@ -282,95 +374,79 @@ static errcode_t raw_write_blk(io_channel channel,
> 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -			goto error_out;
> -		}
> +		align = channel->block_size;
> 		goto bounce_write;
> 	}
> 
> -#ifdef HAVE_PWRITE64
> -	/* Try an aligned pwrite */
> 	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> +	    (IS_ALIGNED(location, channel->align) &&
> +	     IS_ALIGNED(buf, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> -		actual = pwrite64(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -	}
> -#elif HAVE_PWRITE
> -	/* Try an aligned pwrite */
> -	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> -	    ((channel->align == 0) ||
> -	     (IS_ALIGNED(buf, channel->align) &&
> -	      IS_ALIGNED(size, channel->align)))) {
> -		actual = pwrite(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -	}
> -#endif /* HAVE_PWRITE */
> +		retval = raw_aligned_write_blk(channel, data, location,
> +						size, bufv, &actual);
> +		if (retval != 0)
> +			goto error_out;
> 
> -	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -		goto error_out;
> +		return retval;
> 	}
> 
> -	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> -	     IS_ALIGNED(size, channel->align))) {
> -		actual = write(data->dev, buf, size);
> -		if (actual < 0) {
> -			retval = errno;
> -			goto error_out;
> -		}
> -		if (actual != size) {
> -		short_write:
> -			retval = EXT2_ET_SHORT_WRITE;
> -			goto error_out;
> -		}
> -		return 0;
> -	}
> 
> +bounce_write:
> #ifdef ALIGN_DEBUG
> 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
> 	       (unsigned long) size);
> #endif
> +
> +	/* logical offset may don't aligned with block device block size */
> +	blk = location / align;
> +	offset = location % align;
> +
> +	if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> +		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +		goto error_out;
> +	}
> +
> 	/*
> 	 * The buffer or size which we're trying to write isn't aligned
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> 	 */
> -bounce_write:
> 	while (size > 0) {
> -		if (size < channel->block_size) {
> -			actual = read(data->dev, data->bounce,
> -				      channel->block_size);
> -			if (actual != channel->block_size) {
> -				if (actual < 0) {
> -					retval = errno;
> -					goto error_out;
> -				}
> -				memset((char *) data->bounce + actual, 0,
> -				       channel->block_size - actual);
> +		int actual_w;
> +
> +		memset((char *) data->bounce, 0, align);
> +		if (offset || (size < align)) {
> +			actual = read(data->dev, data->bounce, align);
> +			if (actual < 0) {
> +				retval = errno;
> +				goto error_out;
> 			}
> 		}
> 		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(data->bounce, buf, actual);
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		if ((actual + offset) > align)
> +			actual = align - offset;
> +		if (actual > size)
> +			actual = size;
> +		memcpy(((char *)data->bounce) + offset, buf, actual);
> +
> +		if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 			goto error_out;
> 		}
> -		actual = write(data->dev, data->bounce, channel->block_size);
> -		if (actual < 0) {
> +
> +		actual_w = write(data->dev, data->bounce, align);
> +		if (actual_w < 0) {
> 			retval = errno;
> 			goto error_out;
> 		}
> -		if (actual != channel->block_size)
> -			goto short_write;
> +		if (actual_w != align) {
> +			retval = EXT2_ET_SHORT_WRITE;
> +			goto error_out;
> +		}
> 		size -= actual;
> 		buf += actual;
> 		location += actual;
> +		blk ++;
> +		offset = 0;
> 	}
> 	return 0;
> 
> -- 
> 2.18.4
>
Theodore Y. Ts'o Nov. 17, 2020, 7:19 p.m. UTC | #2
On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
> Hello,
> 
> Any thoughts about this change? Thanks.

I'm trying to think of situations where this could actually trigger in
real life.  The only one I can think of is if a file system with a 1k
block file system is located on an an Advanced FormatDrive with a 4k
sector size.

What was the use case where this was actually an issue?

     	     	      	    	     - Ted
Lyashkov, Alexey Nov. 19, 2020, 12:26 p.m. UTC | #3
Tso,

This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.
e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
(many other places exist, but it simplest).
>>
        if (superblock) {
                if (!block_size) {
                        retval = EXT2_ET_INVALID_ARGUMENT;
                        goto cleanup;
                }
                io_channel_set_blksize(fs->io, block_size);
                group_block = superblock;
                fs->orig_super = 0;
        } else {
                io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
                superblock = 1;
                group_block = 0;
                retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
                if (retval)
                        goto cleanup;
        }
        retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
                                     fs->super);
>>
It caused errors like
# e2image -Q /dev/md65 /tmp/node05_image_out
e2image 1.45.6.cr1 (14-Aug-2020)
e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
Couldn’t find valid filesystem superblock.

It looks like I don't first person to found a bug, as someone was add 

Alex

On 17/11/2020, 22:19, "Theodore Y. Ts'o" <tytso@mit.edu> wrote:

    On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
    > Hello,
    > 
    > Any thoughts about this change? Thanks.

    I'm trying to think of situations where this could actually trigger in
    real life.  The only one I can think of is if a file system with a 1k
    block file system is located on an an Advanced FormatDrive with a 4k
    sector size.

    What was the use case where this was actually an issue?

         	     	      	    	     - Ted
diff mbox series

Patch

diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index c395d615..84399a12 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -20,6 +20,9 @@ 
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+
 errcode_t io_channel_set_options(io_channel channel, const char *opts)
 {
 	errcode_t retval = 0;
@@ -128,7 +131,7 @@  errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
 	size_t	size;
 
 	if (count == 0)
-		size = io->block_size;
+		size = max(io->block_size, io->align);
 	else if (count > 0)
 		size = io->block_size * count;
 	else
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 628e60c3..53647a22 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -154,46 +154,30 @@  static char *safe_getenv(const char *arg)
 /*
  * Here are the raw I/O functions
  */
-static errcode_t raw_read_blk(io_channel channel,
+static errcode_t raw_aligned_read_blk(io_channel channel,
 			      struct unix_private_data *data,
-			      unsigned long long block,
-			      int count, void *bufv)
+			      ext2_loff_t location,
+			      ssize_t size, void *bufv,
+			      int *asize)
 {
-	errcode_t	retval;
-	ssize_t		size;
-	ext2_loff_t	location;
 	int		actual = 0;
+	errcode_t	retval;
 	unsigned char	*buf = bufv;
-	ssize_t		really_read = 0;
-
-	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
-	data->io_stats.bytes_read += size;
-	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
-	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-			goto error_out;
-		}
-		goto bounce_read;
-	}
+#ifdef ALIGN_DEBUG
+	printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
+		location, (unsigned long) size);
+#endif
 
 #ifdef HAVE_PREAD64
 	/* Try an aligned pread */
-	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
-	     IS_ALIGNED(size, channel->align))) {
-		actual = pread64(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-		actual = 0;
-	}
+	actual = pread64(data->dev, buf, size, location);
+	if (actual == size)
+		return 0;
+	actual = 0;
 #elif HAVE_PREAD
 	/* Try an aligned pread */
-	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
-	    ((channel->align == 0) ||
-	     (IS_ALIGNED(buf, channel->align) &&
-	      IS_ALIGNED(size, channel->align)))) {
+	if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
 		actual = pread(data->dev, buf, size, location);
 		if (actual == size)
 			return 0;
@@ -205,47 +189,100 @@  static errcode_t raw_read_blk(io_channel channel,
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
+
+	actual = read(data->dev, buf, size);
+	if (actual != size) {
+		if (actual < 0) {
+			retval = errno;
+			actual = 0;
+		} else {
+			retval = EXT2_ET_SHORT_READ;
+		}
+	}
+
+error_out:
+	*asize = actual;
+	return retval;
+}
+
+
+/*
+ * Here are the raw I/O functions
+ */
+static errcode_t raw_read_blk(io_channel channel,
+			      struct unix_private_data *data,
+			      unsigned long long block,
+			      int count, void *bufv)
+{
+	errcode_t	retval;
+	ssize_t		size;
+	ext2_loff_t	location;
+	int		actual = 0;
+	unsigned char	*buf = bufv;
+	unsigned int	align = channel->align;
+	ssize_t		really_read = 0;
+	blk64_t		blk;
+	loff_t		offset;
+
+	size = (count < 0) ? -count : count * channel->block_size;
+	data->io_stats.bytes_read += size;
+	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
+		align = channel->block_size;
+		goto bounce_read;
+	}
+
 	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
+	    (IS_ALIGNED(location, channel->align) &&
+	     IS_ALIGNED(buf, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
-		actual = read(data->dev, buf, size);
-		if (actual != size) {
-		short_read:
-			if (actual < 0) {
-				retval = errno;
-				actual = 0;
-			} else
-				retval = EXT2_ET_SHORT_READ;
+		retval = raw_aligned_read_blk(channel, data, location,
+						size, bufv, &actual);
+		if (retval != 0)
 			goto error_out;
-		}
-		return 0;
+
+		return retval;
 	}
 
+bounce_read:
 #ifdef ALIGN_DEBUG
-	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
-	       (unsigned long) size);
+	printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
+		location, (unsigned long) size);
 #endif
-
 	/*
 	 * The buffer or size which we're trying to read isn't aligned
 	 * to the O_DIRECT rules, so we need to do this the hard way...
+	 * read / write must be aligned to the block device sector size
 	 */
-bounce_read:
+
+	blk = location / align;
+	offset = location % align;
+
+	if (lseek(data->dev, blk * align, SEEK_SET) < 0)
+		return errno;
+
 	while (size > 0) {
-		actual = read(data->dev, data->bounce, channel->block_size);
-		if (actual != channel->block_size) {
+		actual = read(data->dev, data->bounce, align);
+		if (actual != align) {
 			actual = really_read;
 			buf -= really_read;
 			size += really_read;
-			goto short_read;
+			retval = EXT2_ET_SHORT_READ;
+			goto error_out;
 		}
 		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(buf, data->bounce, actual);
+		if ((actual + offset) > align)
+			actual = align - offset;
+		if (actual > size)
+			actual = size;
+
+		memcpy(buf, data->bounce + offset, actual);
 		really_read += actual;
 		size -= actual;
 		buf += actual;
+		offset = 0;
+		blk++;
 	}
 	return 0;
 
@@ -258,6 +295,58 @@  error_out:
 	return retval;
 }
 
+static errcode_t raw_aligned_write_blk(io_channel channel,
+			       struct unix_private_data *data,
+			       ext2_loff_t  location,
+			       ssize_t size, const void *bufv,
+			       int *asize)
+
+{
+	int		actual = 0;
+	errcode_t	retval;
+	const unsigned char *buf = (void *)bufv;
+
+#ifdef ALIGN_DEBUG
+	printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
+	       location, (unsigned long) size);
+#endif
+
+#ifdef HAVE_PWRITE64
+	/* Try an aligned pwrite */
+	actual = pwrite64(data->dev, buf, size, location);
+	if (actual == size)
+		return 0;
+#elif HAVE_PWRITE
+	/* Try an aligned pwrite */
+	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
+		actual = pwrite(data->dev, buf, size, location);
+		if (actual == size)
+			return 0;
+	}
+#endif /* HAVE_PWRITE */
+
+	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+		goto error_out;
+	}
+
+	actual = write(data->dev, buf, size);
+	if (actual < 0) {
+		retval = errno;
+		goto error_out;
+	}
+	if (actual != size) {
+		retval = EXT2_ET_SHORT_WRITE;
+		goto error_out;
+	}
+	retval = 0;
+error_out:
+	*asize = actual;
+	return retval;
+
+}
+
+
 static errcode_t raw_write_blk(io_channel channel,
 			       struct unix_private_data *data,
 			       unsigned long long block,
@@ -268,6 +357,9 @@  static errcode_t raw_write_blk(io_channel channel,
 	int		actual = 0;
 	errcode_t	retval;
 	const unsigned char *buf = bufv;
+	unsigned int	align = channel->align;
+	blk64_t		blk;
+	loff_t		offset;
 
 	if (count == 1)
 		size = channel->block_size;
@@ -282,95 +374,79 @@  static errcode_t raw_write_blk(io_channel channel,
 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-			goto error_out;
-		}
+		align = channel->block_size;
 		goto bounce_write;
 	}
 
-#ifdef HAVE_PWRITE64
-	/* Try an aligned pwrite */
 	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
+	    (IS_ALIGNED(location, channel->align) &&
+	     IS_ALIGNED(buf, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
-		actual = pwrite64(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-	}
-#elif HAVE_PWRITE
-	/* Try an aligned pwrite */
-	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
-	    ((channel->align == 0) ||
-	     (IS_ALIGNED(buf, channel->align) &&
-	      IS_ALIGNED(size, channel->align)))) {
-		actual = pwrite(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-	}
-#endif /* HAVE_PWRITE */
+		retval = raw_aligned_write_blk(channel, data, location,
+						size, bufv, &actual);
+		if (retval != 0)
+			goto error_out;
 
-	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-		goto error_out;
+		return retval;
 	}
 
-	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
-	     IS_ALIGNED(size, channel->align))) {
-		actual = write(data->dev, buf, size);
-		if (actual < 0) {
-			retval = errno;
-			goto error_out;
-		}
-		if (actual != size) {
-		short_write:
-			retval = EXT2_ET_SHORT_WRITE;
-			goto error_out;
-		}
-		return 0;
-	}
 
+bounce_write:
 #ifdef ALIGN_DEBUG
 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
 	       (unsigned long) size);
 #endif
+
+	/* logical offset may don't aligned with block device block size */
+	blk = location / align;
+	offset = location % align;
+
+	if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
+		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+		goto error_out;
+	}
+
 	/*
 	 * The buffer or size which we're trying to write isn't aligned
 	 * to the O_DIRECT rules, so we need to do this the hard way...
 	 */
-bounce_write:
 	while (size > 0) {
-		if (size < channel->block_size) {
-			actual = read(data->dev, data->bounce,
-				      channel->block_size);
-			if (actual != channel->block_size) {
-				if (actual < 0) {
-					retval = errno;
-					goto error_out;
-				}
-				memset((char *) data->bounce + actual, 0,
-				       channel->block_size - actual);
+		int actual_w;
+
+		memset((char *) data->bounce, 0, align);
+		if (offset || (size < align)) {
+			actual = read(data->dev, data->bounce, align);
+			if (actual < 0) {
+				retval = errno;
+				goto error_out;
 			}
 		}
 		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(data->bounce, buf, actual);
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		if ((actual + offset) > align)
+			actual = align - offset;
+		if (actual > size)
+			actual = size;
+		memcpy(((char *)data->bounce) + offset, buf, actual);
+
+		if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 			goto error_out;
 		}
-		actual = write(data->dev, data->bounce, channel->block_size);
-		if (actual < 0) {
+
+		actual_w = write(data->dev, data->bounce, align);
+		if (actual_w < 0) {
 			retval = errno;
 			goto error_out;
 		}
-		if (actual != channel->block_size)
-			goto short_write;
+		if (actual_w != align) {
+			retval = EXT2_ET_SHORT_WRITE;
+			goto error_out;
+		}
 		size -= actual;
 		buf += actual;
 		location += actual;
+		blk ++;
+		offset = 0;
 	}
 	return 0;