[v2,1/1] ide: check DMA transfer size in ide_dma_cb() to prevent qemu DoS from quests
diff mbox series

Message ID 20191114172531.10644-1-alex.popov@linux.com
State New
Headers show
Series
  • [v2,1/1] ide: check DMA transfer size in ide_dma_cb() to prevent qemu DoS from quests
Related show

Commit Message

Alexander Popov Nov. 14, 2019, 5:25 p.m. UTC
The commit a718978ed58a from July 2015 introduced the assertion which
implies that the size of successful DMA transfers handled in ide_dma_cb()
should be multiple of 512 (the size of a sector). But guest systems can
initiate DMA transfers that don't fit this requirement.

PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
command and crash qemu:

#include <stdio.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>

#define CMD_SIZE 2048

struct scsi_ioctl_cmd_6 {
	unsigned int inlen;
	unsigned int outlen;
	unsigned char cmd[6];
	unsigned char data[];
};

int main(void)
{
	intptr_t fd = 0;
	struct scsi_ioctl_cmd_6 *cmd = NULL;

	cmd = malloc(CMD_SIZE);
	if (!cmd) {
		perror("[-] malloc");
		return 1;
	}

	memset(cmd, 0, CMD_SIZE);
	cmd->inlen = 1337;
	cmd->cmd[0] = READ_6;

	fd = open("/dev/sg0", O_RDONLY);
	if (fd == -1) {
		perror("[-] opening sg");
		return 1;
	}

	printf("[+] sg0 is opened\n");

	printf("[.] qemu should break here:\n");
	fflush(stdout);
	ioctl(fd, SCSI_IOCTL_SEND_COMMAND, cmd);
	printf("[-] qemu didn't break\n");

	free(cmd);

	return 1;
}

For fixing that let's check the number of bytes prepared for the transfer
by the prepare_buf() handler. If it is not a multiple of 512 then end
the DMA transfer with an error.

That also fixes the I/O stall in guests after a DMA transfer request
for less than the size of a sector.

Signed-off-by: Alexander Popov <alex.popov@linux.com>
---
 hw/ide/core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

Comments

Darren Kenny Nov. 15, 2019, 11:09 a.m. UTC | #1
On Thu, Nov 14, 2019 at 08:25:31PM +0300, Alexander Popov wrote:
>The commit a718978ed58a from July 2015 introduced the assertion which
>implies that the size of successful DMA transfers handled in ide_dma_cb()
>should be multiple of 512 (the size of a sector). But guest systems can
>initiate DMA transfers that don't fit this requirement.
>
>PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
>command and crash qemu:
>
>#include <stdio.h>
>#include <sys/ioctl.h>
>#include <stdint.h>
>#include <sys/types.h>
>#include <sys/stat.h>
>#include <fcntl.h>
>#include <string.h>
>#include <stdlib.h>
>#include <scsi/scsi.h>
>#include <scsi/scsi_ioctl.h>
>
>#define CMD_SIZE 2048
>
>struct scsi_ioctl_cmd_6 {
>	unsigned int inlen;
>	unsigned int outlen;
>	unsigned char cmd[6];
>	unsigned char data[];
>};
>
>int main(void)
>{
>	intptr_t fd = 0;
>	struct scsi_ioctl_cmd_6 *cmd = NULL;
>
>	cmd = malloc(CMD_SIZE);
>	if (!cmd) {
>		perror("[-] malloc");
>		return 1;
>	}
>
>	memset(cmd, 0, CMD_SIZE);
>	cmd->inlen = 1337;
>	cmd->cmd[0] = READ_6;
>
>	fd = open("/dev/sg0", O_RDONLY);
>	if (fd == -1) {
>		perror("[-] opening sg");
>		return 1;
>	}
>
>	printf("[+] sg0 is opened\n");
>
>	printf("[.] qemu should break here:\n");
>	fflush(stdout);
>	ioctl(fd, SCSI_IOCTL_SEND_COMMAND, cmd);
>	printf("[-] qemu didn't break\n");
>
>	free(cmd);
>
>	return 1;
>}
>
>For fixing that let's check the number of bytes prepared for the transfer
>by the prepare_buf() handler. If it is not a multiple of 512 then end
>the DMA transfer with an error.
>
>That also fixes the I/O stall in guests after a DMA transfer request
>for less than the size of a sector.
>
>Signed-off-by: Alexander Popov <alex.popov@linux.com>

Reviewed-by: Darren Kenny <darren.kenny@oracle.com>

Thanks,

Darren.

>---
> hw/ide/core.c | 11 +++++------
> 1 file changed, 5 insertions(+), 6 deletions(-)
>
>diff --git a/hw/ide/core.c b/hw/ide/core.c
>index 754ff4dc34..85aac614f0 100644
>--- a/hw/ide/core.c
>+++ b/hw/ide/core.c
>@@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret)
>     int64_t sector_num;
>     uint64_t offset;
>     bool stay_active = false;
>+    int32_t prepared = 0;
>
>     if (ret == -EINVAL) {
>         ide_dma_error(s);
>@@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret)
>     n = s->nsector;
>     s->io_buffer_index = 0;
>     s->io_buffer_size = n * 512;
>-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
>-        /* The PRDs were too short. Reset the Active bit, but don't raise an
>-         * interrupt. */
>-        s->status = READY_STAT | SEEK_STAT;
>-        dma_buf_commit(s, 0);
>-        goto eot;
>+    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
>+    if (prepared % 512) {
>+        ide_dma_error(s);
>+        return;
>     }
>
>     trace_ide_dma_cb(s, sector_num, n, IDE_DMA_CMD_str(s->dma_cmd));
>-- 
>2.21.0
>
>
Kevin Wolf Nov. 21, 2019, 3:03 p.m. UTC | #2
Am 14.11.2019 um 18:25 hat Alexander Popov geschrieben:
> The commit a718978ed58a from July 2015 introduced the assertion which
> implies that the size of successful DMA transfers handled in ide_dma_cb()
> should be multiple of 512 (the size of a sector). But guest systems can
> initiate DMA transfers that don't fit this requirement.
> 
> PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
> command and crash qemu:
> 
> #include <stdio.h>
> #include <sys/ioctl.h>
> #include <stdint.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <string.h>
> #include <stdlib.h>
> #include <scsi/scsi.h>
> #include <scsi/scsi_ioctl.h>
> 
> #define CMD_SIZE 2048
> 
> struct scsi_ioctl_cmd_6 {
> 	unsigned int inlen;
> 	unsigned int outlen;
> 	unsigned char cmd[6];
> 	unsigned char data[];
> };
> 
> int main(void)
> {
> 	intptr_t fd = 0;
> 	struct scsi_ioctl_cmd_6 *cmd = NULL;
> 
> 	cmd = malloc(CMD_SIZE);
> 	if (!cmd) {
> 		perror("[-] malloc");
> 		return 1;
> 	}
> 
> 	memset(cmd, 0, CMD_SIZE);
> 	cmd->inlen = 1337;
> 	cmd->cmd[0] = READ_6;
> 
> 	fd = open("/dev/sg0", O_RDONLY);
> 	if (fd == -1) {
> 		perror("[-] opening sg");
> 		return 1;
> 	}
> 
> 	printf("[+] sg0 is opened\n");
> 
> 	printf("[.] qemu should break here:\n");
> 	fflush(stdout);
> 	ioctl(fd, SCSI_IOCTL_SEND_COMMAND, cmd);
> 	printf("[-] qemu didn't break\n");
> 
> 	free(cmd);
> 
> 	return 1;
> }

It would be nicer to turn the reproducer into a test case for
tests/ide-test.c.

> For fixing that let's check the number of bytes prepared for the transfer
> by the prepare_buf() handler. If it is not a multiple of 512 then end
> the DMA transfer with an error.
> 
> That also fixes the I/O stall in guests after a DMA transfer request
> for less than the size of a sector.
> 
> Signed-off-by: Alexander Popov <alex.popov@linux.com>

This patch makes ide-test fail:

  TEST    check-qtest-x86_64: tests/ide-test
**
ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
ERROR - Bail out! ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)

> diff --git a/hw/ide/core.c b/hw/ide/core.c
> index 754ff4dc34..85aac614f0 100644
> --- a/hw/ide/core.c
> +++ b/hw/ide/core.c
> @@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret)
>      int64_t sector_num;
>      uint64_t offset;
>      bool stay_active = false;
> +    int32_t prepared = 0;
>  
>      if (ret == -EINVAL) {
>          ide_dma_error(s);
> @@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret)
>      n = s->nsector;
>      s->io_buffer_index = 0;
>      s->io_buffer_size = n * 512;
> -    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
> -        /* The PRDs were too short. Reset the Active bit, but don't raise an
> -         * interrupt. */
> -        s->status = READY_STAT | SEEK_STAT;
> -        dma_buf_commit(s, 0);
> -        goto eot;
> +    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
> +    if (prepared % 512) {
> +        ide_dma_error(s);

Which I assume is because you changed the error mode here compared to
the old version.

I'm not sure offhand what the correct behaviour is for non-aligned
values > 512. I think we actually have two cases here: Either a short or
a long PRD. The commit message should explain this with spec references
and a test case should be added for both cases.

Kevin
Alexander Popov Nov. 26, 2019, 9:24 p.m. UTC | #3
Hello Kevin,

Thanks for your review,

On 21.11.2019 18:03, Kevin Wolf wrote:
> Am 14.11.2019 um 18:25 hat Alexander Popov geschrieben:
>> The commit a718978ed58a from July 2015 introduced the assertion which
>> implies that the size of successful DMA transfers handled in ide_dma_cb()
>> should be multiple of 512 (the size of a sector). But guest systems can
>> initiate DMA transfers that don't fit this requirement.
>>
>> PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
>> command and crash qemu:
...
> 
> It would be nicer to turn the reproducer into a test case for
> tests/ide-test.c.

Yes, I can do that.

>> For fixing that let's check the number of bytes prepared for the transfer
>> by the prepare_buf() handler. If it is not a multiple of 512 then end
>> the DMA transfer with an error.
>>
>> That also fixes the I/O stall in guests after a DMA transfer request
>> for less than the size of a sector.
>>
>> Signed-off-by: Alexander Popov <alex.popov@linux.com>
> 
> This patch makes ide-test fail:
> 
>   TEST    check-qtest-x86_64: tests/ide-test
> **
> ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
> ERROR - Bail out! ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)

Thanks for the notice.
Yes, I can reproduce it too with `make check-qtest-i386`.

>> diff --git a/hw/ide/core.c b/hw/ide/core.c
>> index 754ff4dc34..85aac614f0 100644
>> --- a/hw/ide/core.c
>> +++ b/hw/ide/core.c
>> @@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret)
>>      int64_t sector_num;
>>      uint64_t offset;
>>      bool stay_active = false;
>> +    int32_t prepared = 0;
>>  
>>      if (ret == -EINVAL) {
>>          ide_dma_error(s);
>> @@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret)
>>      n = s->nsector;
>>      s->io_buffer_index = 0;
>>      s->io_buffer_size = n * 512;
>> -    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
>> -        /* The PRDs were too short. Reset the Active bit, but don't raise an
>> -         * interrupt. */
>> -        s->status = READY_STAT | SEEK_STAT;
>> -        dma_buf_commit(s, 0);
>> -        goto eot;
>> +    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
>> +    if (prepared % 512) {
>> +        ide_dma_error(s);
> 
> Which I assume is because you changed the error mode here compared to
> the old version.

Yes, you are right.

> I'm not sure offhand what the correct behaviour is for non-aligned
> values > 512. I think we actually have two cases here: Either a short or
> a long PRD. The commit message should explain this with spec references
> and a test case should be added for both cases.

I've found the "Programming Interface for Bus Master IDE Controller" (revision
1.0 5/16/94). The chapter 3.1 (Status Bit Interpretation) provides some answers.
It says that:
  1. If PRD's specified a smaller size than the IDE transfer size, then the
Interrupt and Active bits in the Controller status register are not set.
  2. If the size of the physical memory regions was larger than the IDE device
transfer size, the Interrupt and Active bits in the Controller status register
are both set to 1.

So my changing of the error mode in short PRD's case was wrong, and the
test_bmdma_short_prdt() is correct.

Now let's think about the proper fix of the qemu crash.

Currently I don't really understand how ide_dma_cb() emulates the logic
described in Status Bit Interpretation chapter. I don't see any comparison
between the DMA transfer size and PRD's size.

We only have this check against the size of a sector (512 bytes), which doesn't
catch all short PRD's cases (PRD in my PoC is 1337 bytes).

Kevin, do you have any clues?

Best regards,
Alexander
Kevin Wolf Nov. 26, 2019, 10:09 p.m. UTC | #4
Am 26.11.2019 um 22:24 hat Alexander Popov geschrieben:
> Hello Kevin,
> 
> Thanks for your review,
> 
> On 21.11.2019 18:03, Kevin Wolf wrote:
> > Am 14.11.2019 um 18:25 hat Alexander Popov geschrieben:
> >> The commit a718978ed58a from July 2015 introduced the assertion which
> >> implies that the size of successful DMA transfers handled in ide_dma_cb()
> >> should be multiple of 512 (the size of a sector). But guest systems can
> >> initiate DMA transfers that don't fit this requirement.
> >>
> >> PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
> >> command and crash qemu:
> ...
> > 
> > It would be nicer to turn the reproducer into a test case for
> > tests/ide-test.c.
> 
> Yes, I can do that.
> 
> >> For fixing that let's check the number of bytes prepared for the transfer
> >> by the prepare_buf() handler. If it is not a multiple of 512 then end
> >> the DMA transfer with an error.
> >>
> >> That also fixes the I/O stall in guests after a DMA transfer request
> >> for less than the size of a sector.
> >>
> >> Signed-off-by: Alexander Popov <alex.popov@linux.com>
> > 
> > This patch makes ide-test fail:
> > 
> >   TEST    check-qtest-x86_64: tests/ide-test
> > **
> > ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
> > ERROR - Bail out! ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
> 
> Thanks for the notice.
> Yes, I can reproduce it too with `make check-qtest-i386`.
> 
> >> diff --git a/hw/ide/core.c b/hw/ide/core.c
> >> index 754ff4dc34..85aac614f0 100644
> >> --- a/hw/ide/core.c
> >> +++ b/hw/ide/core.c
> >> @@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret)
> >>      int64_t sector_num;
> >>      uint64_t offset;
> >>      bool stay_active = false;
> >> +    int32_t prepared = 0;
> >>  
> >>      if (ret == -EINVAL) {
> >>          ide_dma_error(s);
> >> @@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret)
> >>      n = s->nsector;
> >>      s->io_buffer_index = 0;
> >>      s->io_buffer_size = n * 512;
> >> -    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
> >> -        /* The PRDs were too short. Reset the Active bit, but don't raise an
> >> -         * interrupt. */
> >> -        s->status = READY_STAT | SEEK_STAT;
> >> -        dma_buf_commit(s, 0);
> >> -        goto eot;
> >> +    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
> >> +    if (prepared % 512) {
> >> +        ide_dma_error(s);
> > 
> > Which I assume is because you changed the error mode here compared to
> > the old version.
> 
> Yes, you are right.
> 
> > I'm not sure offhand what the correct behaviour is for non-aligned
> > values > 512. I think we actually have two cases here: Either a short or
> > a long PRD. The commit message should explain this with spec references
> > and a test case should be added for both cases.
> 
> I've found the "Programming Interface for Bus Master IDE Controller" (revision
> 1.0 5/16/94). The chapter 3.1 (Status Bit Interpretation) provides some answers.

Yes, I think that's the same that I've used before. I assume it's the
relevant spec.

> It says that:
>   1. If PRD's specified a smaller size than the IDE transfer size, then the
> Interrupt and Active bits in the Controller status register are not set.

>   2. If the size of the physical memory regions was larger than the IDE device
> transfer size, the Interrupt and Active bits in the Controller status register
> are both set to 1.
> 
> So my changing of the error mode in short PRD's case was wrong, and the
> test_bmdma_short_prdt() is correct.

Yes, I think 1. is implemented correctly for PRDs that are too small and
smaller than a sector.

I think the assumption may have been that if the PRDT contains at least
one more full sector, we'll do that one sector before coming back to the
same place and hitting the code path for a too short PRDT.

However, the code neglects to actually use the return value of
.prepare_buf() to limit the number of sectors accessed. So if we ask for
a scatter/gather list for 5 sectors and we get 3 sectors, we still
assume we can write to all 5. This is obviously wrong.

> Now let's think about the proper fix of the qemu crash.
> 
> Currently I don't really understand how ide_dma_cb() emulates the logic
> described in Status Bit Interpretation chapter. I don't see any comparison
> between the DMA transfer size and PRD's size.
> 
> We only have this check against the size of a sector (512 bytes), which doesn't
> catch all short PRD's cases (PRD in my PoC is 1337 bytes).

I think for making the above assumption work, we'd have to check the
return value, which gets us something like:

    ret = s->bus->dma->ops->prepare_buf()
    if (ret < 512) {
        ... short PRDT code ...
    } else if (ret < n * 512) {
        n = ret / 512;
    }

Instead of doing the extra iteration and executing I/O for the first
part of the request, maybe this would work, too:

    ret = s->bus->dma->ops->prepare_buf()
    if (ret < n * 512) {
        ... short PRDT code ...
    }

We need to check in the spec whether we're supposed to actually do
partial I/O for short PRDTs. I couldn't find a clear answer with a quick
look, but I'm leaning towards doing the partial I/O (i.e. implementing
the first pseudo-code piece above).


As for handling long PRDTs, we have code that looks like it's meant to
handle the case:

    n = s->io_buffer_size >> 9;
    if (n > s->nsector) {
        /* The PRDs were longer than needed for this request. Shorten them so
         * we don't get a negative remainder. The Active bit must remain set
         * after the request completes. */
        n = s->nsector;
        stay_active = true;
    }

bmdma_prepare_buf() does potentially set s->io_buffer_size to a value
larger than the passed limit, so maybe this is already correct. We have
a basic test for it in test_bmdma_long_prdt(), but I can't rule out that
there are more complicated cases where it fails.

I'm pretty sure we must handle the long PRDT case only after doing I/O
(like we currently do) because the operation is supposed to have
completed by the time we signal that the PRDT was long, so the guest can
trust that a read has actually read something and a write has reached
the disk. The spec says "This is a valid completion case".


Does this make sense to you?

Kevin
Alexander Popov Nov. 30, 2019, 10:04 a.m. UTC | #5
On 27.11.2019 01:09, Kevin Wolf wrote:
> Am 26.11.2019 um 22:24 hat Alexander Popov geschrieben:
>> Hello Kevin,
>>
>> Thanks for your review,
>>
>> On 21.11.2019 18:03, Kevin Wolf wrote:
>>> Am 14.11.2019 um 18:25 hat Alexander Popov geschrieben:
>>>> The commit a718978ed58a from July 2015 introduced the assertion which
>>>> implies that the size of successful DMA transfers handled in ide_dma_cb()
>>>> should be multiple of 512 (the size of a sector). But guest systems can
>>>> initiate DMA transfers that don't fit this requirement.
>>>>
>>>> PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA
>>>> command and crash qemu:
>> ...
>>>
>>> It would be nicer to turn the reproducer into a test case for
>>> tests/ide-test.c.
>>
>> Yes, I can do that.
>>
>>>> For fixing that let's check the number of bytes prepared for the transfer
>>>> by the prepare_buf() handler. If it is not a multiple of 512 then end
>>>> the DMA transfer with an error.
>>>>
>>>> That also fixes the I/O stall in guests after a DMA transfer request
>>>> for less than the size of a sector.
>>>>
>>>> Signed-off-by: Alexander Popov <alex.popov@linux.com>
>>>
>>> This patch makes ide-test fail:
>>>
>>>   TEST    check-qtest-x86_64: tests/ide-test
>>> **
>>> ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
>>> ERROR - Bail out! ERROR:tests/ide-test.c:469:test_bmdma_short_prdt: assertion failed (status == 0): (0x00000004 == 0x00000000)
>>
>> Thanks for the notice.
>> Yes, I can reproduce it too with `make check-qtest-i386`.
>>
>>>> diff --git a/hw/ide/core.c b/hw/ide/core.c
>>>> index 754ff4dc34..85aac614f0 100644
>>>> --- a/hw/ide/core.c
>>>> +++ b/hw/ide/core.c
>>>> @@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret)
>>>>      int64_t sector_num;
>>>>      uint64_t offset;
>>>>      bool stay_active = false;
>>>> +    int32_t prepared = 0;
>>>>  
>>>>      if (ret == -EINVAL) {
>>>>          ide_dma_error(s);
>>>> @@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret)
>>>>      n = s->nsector;
>>>>      s->io_buffer_index = 0;
>>>>      s->io_buffer_size = n * 512;
>>>> -    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
>>>> -        /* The PRDs were too short. Reset the Active bit, but don't raise an
>>>> -         * interrupt. */
>>>> -        s->status = READY_STAT | SEEK_STAT;
>>>> -        dma_buf_commit(s, 0);
>>>> -        goto eot;
>>>> +    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
>>>> +    if (prepared % 512) {
>>>> +        ide_dma_error(s);
>>>
>>> Which I assume is because you changed the error mode here compared to
>>> the old version.
>>
>> Yes, you are right.
>>
>>> I'm not sure offhand what the correct behaviour is for non-aligned
>>> values > 512. I think we actually have two cases here: Either a short or
>>> a long PRD. The commit message should explain this with spec references
>>> and a test case should be added for both cases.
>>
>> I've found the "Programming Interface for Bus Master IDE Controller" (revision
>> 1.0 5/16/94). The chapter 3.1 (Status Bit Interpretation) provides some answers.
> 
> Yes, I think that's the same that I've used before. I assume it's the
> relevant spec.
> 
>> It says that:
>>   1. If PRD's specified a smaller size than the IDE transfer size, then the
>> Interrupt and Active bits in the Controller status register are not set.
> 
>>   2. If the size of the physical memory regions was larger than the IDE device
>> transfer size, the Interrupt and Active bits in the Controller status register
>> are both set to 1.
>>
>> So my changing of the error mode in short PRD's case was wrong, and the
>> test_bmdma_short_prdt() is correct.
> 
> Yes, I think 1. is implemented correctly for PRDs that are too small and
> smaller than a sector.
> 
> I think the assumption may have been that if the PRDT contains at least
> one more full sector, we'll do that one sector before coming back to the
> same place and hitting the code path for a too short PRDT.
> 
> However, the code neglects to actually use the return value of
> .prepare_buf() to limit the number of sectors accessed. So if we ask for
> a scatter/gather list for 5 sectors and we get 3 sectors, we still
> assume we can write to all 5. This is obviously wrong.
> 
>> Now let's think about the proper fix of the qemu crash.
>>
>> Currently I don't really understand how ide_dma_cb() emulates the logic
>> described in Status Bit Interpretation chapter. I don't see any comparison
>> between the DMA transfer size and PRD's size.
>>
>> We only have this check against the size of a sector (512 bytes), which doesn't
>> catch all short PRD's cases (PRD in my PoC is 1337 bytes).
> 
> I think for making the above assumption work, we'd have to check the
> return value, which gets us something like:
> 
>     ret = s->bus->dma->ops->prepare_buf()
>     if (ret < 512) {
>         ... short PRDT code ...
>     } else if (ret < n * 512) {
>         n = ret / 512;
>     }
> 
> Instead of doing the extra iteration and executing I/O for the first
> part of the request, maybe this would work, too:
> 
>     ret = s->bus->dma->ops->prepare_buf()
>     if (ret < n * 512) {
>         ... short PRDT code ...
>     }
> 
> We need to check in the spec whether we're supposed to actually do
> partial I/O for short PRDTs. I couldn't find a clear answer with a quick
> look, but I'm leaning towards doing the partial I/O (i.e. implementing
> the first pseudo-code piece above).
> 
> 
> As for handling long PRDTs, we have code that looks like it's meant to
> handle the case:
> 
>     n = s->io_buffer_size >> 9;
>     if (n > s->nsector) {
>         /* The PRDs were longer than needed for this request. Shorten them so
>          * we don't get a negative remainder. The Active bit must remain set
>          * after the request completes. */
>         n = s->nsector;
>         stay_active = true;
>     }
> 
> bmdma_prepare_buf() does potentially set s->io_buffer_size to a value
> larger than the passed limit, so maybe this is already correct. We have
> a basic test for it in test_bmdma_long_prdt(), but I can't rule out that
> there are more complicated cases where it fails.
> 
> I'm pretty sure we must handle the long PRDT case only after doing I/O
> (like we currently do) because the operation is supposed to have
> completed by the time we signal that the PRDT was long, so the guest can
> trust that a read has actually read something and a write has reached
> the disk. The spec says "This is a valid completion case".
> 
> 
> Does this make sense to you?

Thanks a lot, Kevin!

First of all I'll improve the unit-tests to cover all cases.

Then I'll try both approaches you described and return with the results.

I'll also try to find more info about partial I/O behavior in other datasheets.

Best regards,
Alexander

Patch
diff mbox series

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 754ff4dc34..85aac614f0 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -849,6 +849,7 @@  static void ide_dma_cb(void *opaque, int ret)
     int64_t sector_num;
     uint64_t offset;
     bool stay_active = false;
+    int32_t prepared = 0;
 
     if (ret == -EINVAL) {
         ide_dma_error(s);
@@ -892,12 +893,10 @@  static void ide_dma_cb(void *opaque, int ret)
     n = s->nsector;
     s->io_buffer_index = 0;
     s->io_buffer_size = n * 512;
-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
-        /* The PRDs were too short. Reset the Active bit, but don't raise an
-         * interrupt. */
-        s->status = READY_STAT | SEEK_STAT;
-        dma_buf_commit(s, 0);
-        goto eot;
+    prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size);
+    if (prepared % 512) {
+        ide_dma_error(s);
+        return;
     }
 
     trace_ide_dma_cb(s, sector_num, n, IDE_DMA_CMD_str(s->dma_cmd));