Patchwork Fix buffer run out in eepro100.

login
register
mail settings
Submitter Bo Yang
Date Aug. 28, 2012, 6:23 a.m.
Message ID <1346135017-5975-1-git-send-email-boyang@suse.com>
Download mbox | patch
Permalink /patch/180335/
State New
Headers show

Comments

Bo Yang - Aug. 28, 2012, 6:23 a.m.
The guest may enter into state of no receive descriptors,
and if there is no interrupt, the descriptor filling function
has no chance to run again,which causes network stall. According
to liunux driver's implementation, the descriptor with EL bit set
must not be touched by hardware, usually, the buffer size of this
descriptor is set to 0.

Signed-off-by: Bo Yang <boyang@suse.com>
---
 hw/eepro100.c |   21 +++++++++++++++------
 1 files changed, 15 insertions(+), 6 deletions(-)
Stefan Hajnoczi - Aug. 28, 2012, 10:59 a.m.
On Tue, Aug 28, 2012 at 7:23 AM, Bo Yang <boyang@suse.com> wrote:
> According
> to liunux driver's implementation, the descriptor with EL bit set
> must not be touched by hardware, usually, the buffer size of this
> descriptor is set to 0.

Please describe the bug you are seeing and how to reproduce it.  It's
not clear to me that your patch implements the behavior specified in
the datasheet.  I have included relevant quotes from the datasheet:

http://www.intel.com/content/dam/doc/manual/8255x-10-100-mbps-ethernet-controller-software-dev-manual.pdf

"Table 55. CU Activities Performed at the End of Execution" shows that
the EL bit takes effect at end of command execution.  I think this
means the descriptor *will* be processed by the hardware.

The following documents the behavior when the descriptor's buffer size is 0:

"6.4.3.3.1 Configuring the Next RFD
[...]
3. Initiates a receive DMA if the size of the data field in the RFD is
greater than zero. The receive
DMA is initiated with the address of the first byte of the destination
address to the byte count
specified by the RFD.
4. Forces a receive DMA completion if the size of the data field in
the RFD is zero."

> diff --git a/hw/eepro100.c b/hw/eepro100.c
> index 50d117e..e0efd96 100644
> --- a/hw/eepro100.c
> +++ b/hw/eepro100.c
> @@ -1619,8 +1619,13 @@ static const MemoryRegionOps eepro100_ops = {
>  static int nic_can_receive(NetClientState *nc)
>  {
>      EEPRO100State *s = DO_UPCAST(NICState, nc, nc)->opaque;
> +    ru_state_t state;
>      TRACE(RXTX, logout("%p\n", s));
> -    return get_ru_state(s) == ru_ready;
> +    state = get_ru_state(s);
> +    if (state == ru_no_resources) {
> +       eepro100_rnr_interrupt(s);
> +    }
> +    return state == ru_ready;
>  #if 0
>      return !eepro100_buffer_full(s);
>  #endif

This is a boolean function, it shouldn't have side-effects.

Why did you add the eepro100_rnr_interrupt() call here when it's also
called below from nic_receive()?

> @@ -1732,6 +1737,15 @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
>                   &rx, sizeof(eepro100_rx_t));
>      uint16_t rfd_command = le16_to_cpu(rx.command);
>      uint16_t rfd_size = le16_to_cpu(rx.size);
> +    /* don't touch the rx descriptor with EL set. */
> +    if (rfd_command & COMMAND_EL) {
> +        /* EL bit is set, so this was the last frame. */
> +        logout("receive: Running out of frames\n");
> +        set_ru_state(s, ru_no_resources);
> +        s->statistics.rx_resource_errors++;
> +       eepro100_rnr_interrupt(s);
> +       return -1;
> +    }
>      if (size > rfd_size) {
>          logout("Receive buffer (%" PRId16 " bytes) too small for data "
> @@ -1767,11 +1781,6 @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
>      s->statistics.rx_good_frames++;
>      eepro100_fr_interrupt(s);
>      s->ru_offset = le32_to_cpu(rx.link);
> -    if (rfd_command & COMMAND_EL) {
> -        /* EL bit is set, so this was the last frame. */
> -        logout("receive: Running out of frames\n");
> -        set_ru_state(s, ru_suspended);
> -    }

"6.4.3.3.3 Completion of Reception" describes how this should work.  I
think the rnr interrupt should be raised here and we should enter the
no resources state.

Stefan
Bo Yang - Aug. 29, 2012, 12:55 a.m.
On 08/28/2012 06:59 PM, Stefan Hajnoczi wrote:
> On Tue, Aug 28, 2012 at 7:23 AM, Bo Yang <boyang@suse.com> wrote:
>> According
>> to liunux driver's implementation, the descriptor with EL bit set
>> must not be touched by hardware, usually, the buffer size of this
>> descriptor is set to 0.
> 
> Please describe the bug you are seeing and how to reproduce it.  

Unfortunately, I cannot reproduce it. It is reported by QA, I used the
QA's test environment to debug here.

It's
> not clear to me that your patch implements the behavior specified in
> the datasheet.  I have included relevant quotes from the datasheet:
> 
> http://www.intel.com/content/dam/doc/manual/8255x-10-100-mbps-ethernet-controller-software-dev-manual.pdf
> 
> "Table 55. CU Activities Performed at the End of Execution" shows that
> the EL bit takes effect at end of command execution.  I think this
> means the descriptor *will* be processed by the hardware.
> 
> The following documents the behavior when the descriptor's buffer size is 0:
> 
> "6.4.3.3.1 Configuring the Next RFD
> [...]
> 3. Initiates a receive DMA if the size of the data field in the RFD is
> greater than zero. The receive
> DMA is initiated with the address of the first byte of the destination
> address to the byte count
> specified by the RFD.
> 4. Forces a receive DMA completion if the size of the data field in
> the RFD is zero."
> 
>> diff --git a/hw/eepro100.c b/hw/eepro100.c
>> index 50d117e..e0efd96 100644
>> --- a/hw/eepro100.c
>> +++ b/hw/eepro100.c
>> @@ -1619,8 +1619,13 @@ static const MemoryRegionOps eepro100_ops = {
>>  static int nic_can_receive(NetClientState *nc)
>>  {
>>      EEPRO100State *s = DO_UPCAST(NICState, nc, nc)->opaque;
>> +    ru_state_t state;
>>      TRACE(RXTX, logout("%p\n", s));
>> -    return get_ru_state(s) == ru_ready;
>> +    state = get_ru_state(s);
>> +    if (state == ru_no_resources) {
>> +       eepro100_rnr_interrupt(s);
>> +    }
>> +    return state == ru_ready;
>>  #if 0
>>      return !eepro100_buffer_full(s);
>>  #endif
> 
> This is a boolean function, it shouldn't have side-effects.
> 
> Why did you add the eepro100_rnr_interrupt() call here when it's also
> called below from nic_receive()?

This is needed. Let's consider tap with e100.
When rx descriptor is EL with size 0. e100 enter ru_no_resources state
and rnr interrupt is raised. Then, driver in guest handles the
interrupt, however, there can be out of memory condition in guest, so
this time the rx descriptor filling fails, so no rx descriptor is ready.

Have a look at tap, tap_can_send is ioh->fd_read_poll, tap_send is
ioh->fd_read, tap_can_send calls nic_can_receive(), tap_send() call
nic_receive(), if tap_can_send returns 0, the tap fd will not be added
to read set, and tap_send() will never be called, so nic_receive is
never called, no more rnr interrupt will be raised. then the network
just stall. So we need interrupt to notify the guest to prepare rx
descriptor again.

> 
>> @@ -1732,6 +1737,15 @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
>>                   &rx, sizeof(eepro100_rx_t));
>>      uint16_t rfd_command = le16_to_cpu(rx.command);
>>      uint16_t rfd_size = le16_to_cpu(rx.size);
>> +    /* don't touch the rx descriptor with EL set. */
>> +    if (rfd_command & COMMAND_EL) {
>> +        /* EL bit is set, so this was the last frame. */
>> +        logout("receive: Running out of frames\n");
>> +        set_ru_state(s, ru_no_resources);
>> +        s->statistics.rx_resource_errors++;
>> +       eepro100_rnr_interrupt(s);
>> +       return -1;
>> +    }
>>      if (size > rfd_size) {
>>          logout("Receive buffer (%" PRId16 " bytes) too small for data "
>> @@ -1767,11 +1781,6 @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
>>      s->statistics.rx_good_frames++;
>>      eepro100_fr_interrupt(s);
>>      s->ru_offset = le32_to_cpu(rx.link);
>> -    if (rfd_command & COMMAND_EL) {
>> -        /* EL bit is set, so this was the last frame. */
>> -        logout("receive: Running out of frames\n");
>> -        set_ru_state(s, ru_suspended);
>> -    }
> 
> "6.4.3.3.3 Completion of Reception" describes how this should work.  I
> think the rnr interrupt should be raised here and we should enter the
> no resources state.

I think you're right here. To comply to the specification of e100. We
should handle the rx descriptor with EL, and then enter ru_no_resources
state, raises rnr interrupt.

therefore,

set_ru_state(s, ru_no_resources);
eepro100_rnr_interrupt(s);

after handle rx descriptor with EL bit.

> 
> Stefan
>
Stefan Hajnoczi - Aug. 29, 2012, 6:45 a.m.
On Wed, Aug 29, 2012 at 1:55 AM, Bo Yang <boyang@suse.com> wrote:
> On 08/28/2012 06:59 PM, Stefan Hajnoczi wrote:
>> On Tue, Aug 28, 2012 at 7:23 AM, Bo Yang <boyang@suse.com> wrote:
>>> According
>>> to liunux driver's implementation, the descriptor with EL bit set
>>> must not be touched by hardware, usually, the buffer size of this
>>> descriptor is set to 0.
>>
>> Please describe the bug you are seeing and how to reproduce it.
>
> Unfortunately, I cannot reproduce it. It is reported by QA, I used the
> QA's test environment to debug here.

Please share the steps to reproduce in the commit message so we have
an idea of what this patch fixes (even if we can't reproduce it).

> It's
>> not clear to me that your patch implements the behavior specified in
>> the datasheet.  I have included relevant quotes from the datasheet:
>>
>> http://www.intel.com/content/dam/doc/manual/8255x-10-100-mbps-ethernet-controller-software-dev-manual.pdf
>>
>> "Table 55. CU Activities Performed at the End of Execution" shows that
>> the EL bit takes effect at end of command execution.  I think this
>> means the descriptor *will* be processed by the hardware.
>>
>> The following documents the behavior when the descriptor's buffer size is 0:
>>
>> "6.4.3.3.1 Configuring the Next RFD
>> [...]
>> 3. Initiates a receive DMA if the size of the data field in the RFD is
>> greater than zero. The receive
>> DMA is initiated with the address of the first byte of the destination
>> address to the byte count
>> specified by the RFD.
>> 4. Forces a receive DMA completion if the size of the data field in
>> the RFD is zero."
>>
>>> diff --git a/hw/eepro100.c b/hw/eepro100.c
>>> index 50d117e..e0efd96 100644
>>> --- a/hw/eepro100.c
>>> +++ b/hw/eepro100.c
>>> @@ -1619,8 +1619,13 @@ static const MemoryRegionOps eepro100_ops = {
>>>  static int nic_can_receive(NetClientState *nc)
>>>  {
>>>      EEPRO100State *s = DO_UPCAST(NICState, nc, nc)->opaque;
>>> +    ru_state_t state;
>>>      TRACE(RXTX, logout("%p\n", s));
>>> -    return get_ru_state(s) == ru_ready;
>>> +    state = get_ru_state(s);
>>> +    if (state == ru_no_resources) {
>>> +       eepro100_rnr_interrupt(s);
>>> +    }
>>> +    return state == ru_ready;
>>>  #if 0
>>>      return !eepro100_buffer_full(s);
>>>  #endif
>>
>> This is a boolean function, it shouldn't have side-effects.
>>
>> Why did you add the eepro100_rnr_interrupt() call here when it's also
>> called below from nic_receive()?
>
> This is needed. Let's consider tap with e100.
> When rx descriptor is EL with size 0. e100 enter ru_no_resources state
> and rnr interrupt is raised. Then, driver in guest handles the
> interrupt, however, there can be out of memory condition in guest, so
> this time the rx descriptor filling fails, so no rx descriptor is ready.
>
> Have a look at tap, tap_can_send is ioh->fd_read_poll, tap_send is
> ioh->fd_read, tap_can_send calls nic_can_receive(), tap_send() call
> nic_receive(), if tap_can_send returns 0, the tap fd will not be added
> to read set, and tap_send() will never be called, so nic_receive is
> never called, no more rnr interrupt will be raised. then the network
> just stall. So we need interrupt to notify the guest to prepare rx
> descriptor again.

Here is how virtio-net handles this:
hw/virtio-net.c:virtio_net_handle_rx() calls
qemu_flush_queued_packets() followed by qemu_notify_event() to resume
rx.

static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIONet *n = to_virtio_net(vdev);

    qemu_flush_queued_packets(&n->nic->nc);

    /* We now have RX buffers, signal to the IO thread to break out of the
     * select to re-poll the tap file descriptor */
    qemu_notify_event();
}

This function gets invoked when new rx buffers have been given to the
virtio-net NIC by the guest.

eepro100 should do something similar, maybe on eepro100_ru_command() RX_START?

BTW, guest driver out-of-memory is not the hardware's problem.  The
driver needs to handle it.  For example, the Linux e100 driver has a
watchdog timer which will raise an interrupt to refill the rx ring if
allocation failed.

Stefan
Bo Yang - Aug. 29, 2012, 7:17 a.m.
On 08/29/2012 02:45 PM, Stefan Hajnoczi wrote:
> On Wed, Aug 29, 2012 at 1:55 AM, Bo Yang <boyang@suse.com> wrote:
>> On 08/28/2012 06:59 PM, Stefan Hajnoczi wrote:
>>> On Tue, Aug 28, 2012 at 7:23 AM, Bo Yang <boyang@suse.com> wrote:
>>>> According
>>>> to liunux driver's implementation, the descriptor with EL bit set
>>>> must not be touched by hardware, usually, the buffer size of this
>>>> descriptor is set to 0.
>>>
>>> Please describe the bug you are seeing and how to reproduce it.
>>
>> Unfortunately, I cannot reproduce it. It is reported by QA, I used the
>> QA's test environment to debug here.
> 
> Please share the steps to reproduce in the commit message so we have
> an idea of what this patch fixes (even if we can't reproduce it).

I see. According to the QA's report, the network stalls when doing pxe
install. After dhcp and loading the initial kernel/initrd and do some
configurations, then install process tries to copy files of OS to hard
disk, network fails in the process of copying files.

> 
>> It's
>>> not clear to me that your patch implements the behavior specified in
>>> the datasheet.  I have included relevant quotes from the datasheet:
>>>
>>> http://www.intel.com/content/dam/doc/manual/8255x-10-100-mbps-ethernet-controller-software-dev-manual.pdf
>>>
>>> "Table 55. CU Activities Performed at the End of Execution" shows that
>>> the EL bit takes effect at end of command execution.  I think this
>>> means the descriptor *will* be processed by the hardware.
>>>
>>> The following documents the behavior when the descriptor's buffer size is 0:
>>>
>>> "6.4.3.3.1 Configuring the Next RFD
>>> [...]
>>> 3. Initiates a receive DMA if the size of the data field in the RFD is
>>> greater than zero. The receive
>>> DMA is initiated with the address of the first byte of the destination
>>> address to the byte count
>>> specified by the RFD.
>>> 4. Forces a receive DMA completion if the size of the data field in
>>> the RFD is zero."
>>>
>>>> diff --git a/hw/eepro100.c b/hw/eepro100.c
>>>> index 50d117e..e0efd96 100644
>>>> --- a/hw/eepro100.c
>>>> +++ b/hw/eepro100.c
>>>> @@ -1619,8 +1619,13 @@ static const MemoryRegionOps eepro100_ops = {
>>>>  static int nic_can_receive(NetClientState *nc)
>>>>  {
>>>>      EEPRO100State *s = DO_UPCAST(NICState, nc, nc)->opaque;
>>>> +    ru_state_t state;
>>>>      TRACE(RXTX, logout("%p\n", s));
>>>> -    return get_ru_state(s) == ru_ready;
>>>> +    state = get_ru_state(s);
>>>> +    if (state == ru_no_resources) {
>>>> +       eepro100_rnr_interrupt(s);
>>>> +    }
>>>> +    return state == ru_ready;
>>>>  #if 0
>>>>      return !eepro100_buffer_full(s);
>>>>  #endif
>>>
>>> This is a boolean function, it shouldn't have side-effects.
>>>
>>> Why did you add the eepro100_rnr_interrupt() call here when it's also
>>> called below from nic_receive()?
>>
>> This is needed. Let's consider tap with e100.
>> When rx descriptor is EL with size 0. e100 enter ru_no_resources state
>> and rnr interrupt is raised. Then, driver in guest handles the
>> interrupt, however, there can be out of memory condition in guest, so
>> this time the rx descriptor filling fails, so no rx descriptor is ready.
>>
>> Have a look at tap, tap_can_send is ioh->fd_read_poll, tap_send is
>> ioh->fd_read, tap_can_send calls nic_can_receive(), tap_send() call
>> nic_receive(), if tap_can_send returns 0, the tap fd will not be added
>> to read set, and tap_send() will never be called, so nic_receive is
>> never called, no more rnr interrupt will be raised. then the network
>> just stall. So we need interrupt to notify the guest to prepare rx
>> descriptor again.
> 
> Here is how virtio-net handles this:
> hw/virtio-net.c:virtio_net_handle_rx() calls
> qemu_flush_queued_packets() followed by qemu_notify_event() to resume
> rx.
> 
> static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
> {
>     VirtIONet *n = to_virtio_net(vdev);
> 
>     qemu_flush_queued_packets(&n->nic->nc);
> 
>     /* We now have RX buffers, signal to the IO thread to break out of the
>      * select to re-poll the tap file descriptor */
>     qemu_notify_event();
> }
> 
> This function gets invoked when new rx buffers have been given to the
> virtio-net NIC by the guest.
> 
> eepro100 should do something similar, maybe on eepro100_ru_command() RX_START?

I think this makes the card more responsive here. Even we don't do this,
the tap fd can be polled later again, although with some delays. It is
no harm  to do this, and the ru command RX_START is the right place to
do it.

> 
> BTW, guest driver out-of-memory is not the hardware's problem.  The
> driver needs to handle it.  For example, the Linux e100 driver has a
> watchdog timer which will raise an interrupt to refill the rx ring if
> allocation failed.

You're absolutely right here. Can we do our best to work around broken
drivers? does it deserve to provide such a workaround?

> 
> Stefan
>
Stefan Hajnoczi - Aug. 29, 2012, 7:46 a.m.
On Wed, Aug 29, 2012 at 8:17 AM, Bo Yang <boyang@suse.com> wrote:
> On 08/29/2012 02:45 PM, Stefan Hajnoczi wrote:
>> BTW, guest driver out-of-memory is not the hardware's problem.  The
>> driver needs to handle it.  For example, the Linux e100 driver has a
>> watchdog timer which will raise an interrupt to refill the rx ring if
>> allocation failed.
>
> You're absolutely right here. Can we do our best to work around broken
> drivers? does it deserve to provide such a workaround?

Adding behavior that is not in the spec can introduce new bugs.

Without a reproducible bug that shows we need this non-spec behavior,
I think we shouldn't add it.

Stefan

Patch

diff --git a/hw/eepro100.c b/hw/eepro100.c
index 50d117e..e0efd96 100644
--- a/hw/eepro100.c
+++ b/hw/eepro100.c
@@ -1619,8 +1619,13 @@  static const MemoryRegionOps eepro100_ops = {
 static int nic_can_receive(NetClientState *nc)
 {
     EEPRO100State *s = DO_UPCAST(NICState, nc, nc)->opaque;
+    ru_state_t state;
     TRACE(RXTX, logout("%p\n", s));
-    return get_ru_state(s) == ru_ready;
+    state = get_ru_state(s);
+    if (state == ru_no_resources) {
+	eepro100_rnr_interrupt(s);
+    }
+    return state == ru_ready;
 #if 0
     return !eepro100_buffer_full(s);
 #endif
@@ -1732,6 +1737,15 @@  static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
                  &rx, sizeof(eepro100_rx_t));
     uint16_t rfd_command = le16_to_cpu(rx.command);
     uint16_t rfd_size = le16_to_cpu(rx.size);
+    /* don't touch the rx descriptor with EL set. */
+    if (rfd_command & COMMAND_EL) {
+        /* EL bit is set, so this was the last frame. */
+        logout("receive: Running out of frames\n");
+        set_ru_state(s, ru_no_resources);
+        s->statistics.rx_resource_errors++;
+	eepro100_rnr_interrupt(s);
+	return -1;
+    }
 
     if (size > rfd_size) {
         logout("Receive buffer (%" PRId16 " bytes) too small for data "
@@ -1767,11 +1781,6 @@  static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
     s->statistics.rx_good_frames++;
     eepro100_fr_interrupt(s);
     s->ru_offset = le32_to_cpu(rx.link);
-    if (rfd_command & COMMAND_EL) {
-        /* EL bit is set, so this was the last frame. */
-        logout("receive: Running out of frames\n");
-        set_ru_state(s, ru_suspended);
-    }
     if (rfd_command & COMMAND_S) {
         /* S bit is set. */
         set_ru_state(s, ru_suspended);