diff mbox

[comment,tweaked] msix: allow byte and word reading from mmio

Message ID 20101116131452.GA23963@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin Nov. 16, 2010, 1:14 p.m. UTC
Although explicitly disallowed by the PCI spec, some guests read a
single byte or word from mmio.  Likely a guest OS bug, but I have an OS
which reads single bytes and it works fine on real hardware.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

OK so it could like something like the below. However, my question is:
do we need to put this in or can the guest simply be fixed?

 hw/msix.c |   31 +++++++++++++++++++++++++++----
 1 files changed, 27 insertions(+), 4 deletions(-)

Comments

Bernhard Kohl Nov. 16, 2010, 4:43 p.m. UTC | #1
Am 16.11.2010 14:14, schrieb mst@redhat.com:
> Although explicitly disallowed by the PCI spec, some guests read a
> single byte or word from mmio.  Likely a guest OS bug, but I have an OS
> which reads single bytes and it works fine on real hardware.
>
> Signed-off-by: Bernhard Kohl<bernhard.kohl@nsn.com>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>
> OK so it could like something like the below.

Yes, this looks good for me.

>   However, my question is:
> do we need to put this in or can the guest simply be fixed?
>    

I tried to locate the code where the readw occurs,
but not successful. It only occurs during booting our OS,
and the virtio-net driver seems to be OK. With 4 virtio
NICs we have the following readw accesses, thats all!
3 accesses per NIC and the first NIC appears twice.

MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000008
MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000018
MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000028
MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000008
MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000018
MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000028
MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000008
MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000018
MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000028
MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028

Is it possible to add a stack back tace printing to the
readw function?

>   hw/msix.c |   31 +++++++++++++++++++++++++++----
>   1 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/hw/msix.c b/hw/msix.c
> index f66d255..38dff59 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -102,10 +102,28 @@ static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
>       return pci_get_long(page + offset);
>   }
>
> -static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
> + /* Note:
> +  * PCI spec requires that all MSI-X table accesses are either DWORD or QWORD,
> +  * size aligned.  Some guests seem to violate this rule for read accesses,
> +  * performing single byte reads.  Since it's easy to support this, let's do so.
> +  * Also support 16 bit size aligned reads, just in case.
> +  */
> +static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
>   {
> -    fprintf(stderr, "MSI-X: only dword read is allowed!\n");
> -    return 0;
> +    PCIDevice *dev = opaque;
> +    unsigned int offset = addr&  (MSIX_PAGE_SIZE - 1)&  ~0x1;
> +    void *page = dev->msix_table_page;
> +
> +    return pci_get_word(page + offset);
> +}
> +
> +static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    PCIDevice *dev = opaque;
> +    unsigned int offset = addr&  (MSIX_PAGE_SIZE - 1);
> +    void *page = dev->msix_table_page;
> +
> +    return pci_get_byte(page + offset);
>   }
>
>   static uint8_t msix_pending_mask(int vector)
> @@ -192,6 +210,11 @@ static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
>       msix_handle_mask_update(dev, vector);
>   }
>
> +/* PCI spec:
> + * For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> + * aligned full DWORD or aligned full QWORD transactions; otherwise, the result
> + * is undefined.
> + */
>   static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
>                                         uint32_t val)
>   {
> @@ -203,7 +226,7 @@ static CPUWriteMemoryFunc * const msix_mmio_write[] = {
>   };
>
>   static CPUReadMemoryFunc * const msix_mmio_read[] = {
> -    msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
> +    msix_mmio_readb, msix_mmio_readw, msix_mmio_readl
>   };
>
>   /* Should be called from device's map method. */
>
Michael S. Tsirkin Nov. 17, 2010, 2:12 p.m. UTC | #2
On Tue, Nov 16, 2010 at 05:43:06PM +0100, Bernhard Kohl wrote:
> Am 16.11.2010 14:14, schrieb mst@redhat.com:
> >Although explicitly disallowed by the PCI spec, some guests read a
> >single byte or word from mmio.  Likely a guest OS bug, but I have an OS
> >which reads single bytes and it works fine on real hardware.
> >
> >Signed-off-by: Bernhard Kohl<bernhard.kohl@nsn.com>
> >Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> >---
> >
> >OK so it could like something like the below.
> 
> Yes, this looks good for me.

Wait, the below shows single-word reads. I thought you said bytes?

> >  However, my question is:
> >do we need to put this in or can the guest simply be fixed?
> 
> I tried to locate the code where the readw occurs,
> but not successful. It only occurs during booting our OS,
> and the virtio-net driver seems to be OK. With 4 virtio
> NICs we have the following readw accesses, thats all!
> 3 accesses per NIC and the first NIC appears twice.
> 
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028

Hmm, message data is being read for some reason.

> Is it possible to add a stack back tace printing to the
> readw function?

There's the qemu -S option, it will let you debug the guest.
Still, question is, do we need work-around in qemu,
because a broken guest is in production and can not be fixed,
or can guest just be fixed?

> >  hw/msix.c |   31 +++++++++++++++++++++++++++----
> >  1 files changed, 27 insertions(+), 4 deletions(-)
> >
> >diff --git a/hw/msix.c b/hw/msix.c
> >index f66d255..38dff59 100644
> >--- a/hw/msix.c
> >+++ b/hw/msix.c
> >@@ -102,10 +102,28 @@ static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
> >      return pci_get_long(page + offset);
> >  }
> >
> >-static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
> >+ /* Note:
> >+  * PCI spec requires that all MSI-X table accesses are either DWORD or QWORD,
> >+  * size aligned.  Some guests seem to violate this rule for read accesses,
> >+  * performing single byte reads.  Since it's easy to support this, let's do so.
> >+  * Also support 16 bit size aligned reads, just in case.
> >+  */
> >+static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
> >  {
> >-    fprintf(stderr, "MSI-X: only dword read is allowed!\n");
> >-    return 0;
> >+    PCIDevice *dev = opaque;
> >+    unsigned int offset = addr&  (MSIX_PAGE_SIZE - 1)&  ~0x1;
> >+    void *page = dev->msix_table_page;
> >+
> >+    return pci_get_word(page + offset);
> >+}
> >+
> >+static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
> >+{
> >+    PCIDevice *dev = opaque;
> >+    unsigned int offset = addr&  (MSIX_PAGE_SIZE - 1);
> >+    void *page = dev->msix_table_page;
> >+
> >+    return pci_get_byte(page + offset);
> >  }
> >
> >  static uint8_t msix_pending_mask(int vector)
> >@@ -192,6 +210,11 @@ static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
> >      msix_handle_mask_update(dev, vector);
> >  }
> >
> >+/* PCI spec:
> >+ * For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> >+ * aligned full DWORD or aligned full QWORD transactions; otherwise, the result
> >+ * is undefined.
> >+ */
> >  static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
> >                                        uint32_t val)
> >  {
> >@@ -203,7 +226,7 @@ static CPUWriteMemoryFunc * const msix_mmio_write[] = {
> >  };
> >
> >  static CPUReadMemoryFunc * const msix_mmio_read[] = {
> >-    msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
> >+    msix_mmio_readb, msix_mmio_readw, msix_mmio_readl
> >  };
> >
> >  /* Should be called from device's map method. */
Michael S. Tsirkin Nov. 17, 2010, 2:28 p.m. UTC | #3
On Tue, Nov 16, 2010 at 05:43:06PM +0100, Bernhard Kohl wrote:
> Am 16.11.2010 14:14, schrieb mst@redhat.com:
> >Although explicitly disallowed by the PCI spec, some guests read a
> >single byte or word from mmio.  Likely a guest OS bug, but I have an OS
> >which reads single bytes and it works fine on real hardware.
> >
> >Signed-off-by: Bernhard Kohl<bernhard.kohl@nsn.com>
> >Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> >---
> >
> >OK so it could like something like the below.
> 
> Yes, this looks good for me.
> 
> >  However, my question is:
> >do we need to put this in or can the guest simply be fixed?
> 
> I tried to locate the code where the readw occurs,
> but not successful. It only occurs during booting our OS,
> and the virtio-net driver seems to be OK. With 4 virtio
> NICs we have the following readw accesses, thats all!
> 3 accesses per NIC and the first NIC appears twice.
> 
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000028
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
> 
> Is it possible to add a stack back tace printing to the
> readw function?


See some words of wisdom from Marcelo (this appliues to qemu-kvm.git):

<mst> marcelot, there is a guest that misbehaves
<mst> marcelot, so we get an fprintf in qemu
<mst> marcelot, but we don't know which place in guest caused the
problem, no backtrace, etc
<marcelot> mst, print guest RIP there? 
<mst> marcelot, 1. how to?
<marcelot> mst, cpu_synchronize_state(cpu_single_env) + print (env->eip) 
<marcelot> s/env/cpu_single_env/
<mst> marcelot, 2. can we also make it so that when io completes, guest
stops with backtrace in gdb?
<marcelot> mst, see kvm_show_code in qemu-kvm-x86.c
<marcelot> mst, either add a breakpoint there, or call abort() and gdb
core later
<mst> marcelot, abort will exit qemu though. gdb core will give me qemu
backtrace, not guest one, right?
<mst> marcelot, I would like kvm to pretend it hit a breakpoint
<mst> marcelot, or a segfault ... something
<marcelot> mst, well, vmstop() then. you can see guest RIP from there.
<marcelot> mst, yes gdb core will give qemu backtrace. inject a #GP to
see guest backtrace (see kvm_put_vcpu_events in qemu-kvm-x86.c
<marcelot> )
Bernhard Kohl Nov. 17, 2010, 4:12 p.m. UTC | #4
Am 17.11.2010 15:12, schrieb ext Michael S. Tsirkin:
> On Tue, Nov 16, 2010 at 05:43:06PM +0100, Bernhard Kohl wrote:
>    
>> Am 16.11.2010 14:14, schrieb mst@redhat.com:
>>      
>>> Although explicitly disallowed by the PCI spec, some guests read a
>>> single byte or word from mmio.  Likely a guest OS bug, but I have an OS
>>> which reads single bytes and it works fine on real hardware.
>>>
>>> Signed-off-by: Bernhard Kohl<bernhard.kohl@nsn.com>
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>> ---
>>>
>>> OK so it could like something like the below.
>>>        
>> Yes, this looks good for me.
>>      
> Wait, the below shows single-word reads. I thought you said bytes?
>    

The original patch was several month old. When posting it
I added some more description text. Sorry, I mismatched
words with bytes. In fact there are only these word accesses
as seen below.

>    
>>>   However, my question is:
>>> do we need to put this in or can the guest simply be fixed?
>>>        
>> I tried to locate the code where the readw occurs,
>> but not successful. It only occurs during booting our OS,
>> and the virtio-net driver seems to be OK. With 4 virtio
>> NICs we have the following readw accesses, thats all!
>> 3 accesses per NIC and the first NIC appears twice.
>>
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
>> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000008
>> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000018
>> MSI-X: msix_mmio_readw dev=0x9772c40 addr=0000000000000028
>> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000008
>> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000018
>> MSI-X: msix_mmio_readw dev=0x977dc38 addr=0000000000000028
>> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000008
>> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000018
>> MSI-X: msix_mmio_readw dev=0x9788d90 addr=0000000000000028
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000008
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000018
>> MSI-X: msix_mmio_readw dev=0x9767c58 addr=0000000000000028
>>      
> Hmm, message data is being read for some reason.
>    

Yes there is a helper library in our OS which does the
following (the MOVZX does 16-bit mem read):

*int_level_ptr = (byte) msi_x_table_ptr->message_data;

                                           ; STATEMENT # 2593
00001FC0  640FB74008      MOVZX   EAX,FS:[EAX]+8H
                   @29:
00001FC5  0FB44D10        LFS     ECX,[EBP].int_level_ptr
00001FC9  648801          MOV     FS:[ECX],AL

>    
>> Is it possible to add a stack back tace printing to the
>> readw function?
>>      
> There's the qemu -S option, it will let you debug the guest.
> Still, question is, do we need work-around in qemu,
> because a broken guest is in production and can not be fixed,
> or can guest just be fixed?
>    

I will fix the guest and inform you after testing.
Then we can skip this patch.

>    
>>>   hw/msix.c |   31 +++++++++++++++++++++++++++----
>>>   1 files changed, 27 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/hw/msix.c b/hw/msix.c
>>> index f66d255..38dff59 100644
>>> --- a/hw/msix.c
>>> +++ b/hw/msix.c
>>> @@ -102,10 +102,28 @@ static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
>>>       return pci_get_long(page + offset);
>>>   }
>>>
>>> -static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
>>> + /* Note:
>>> +  * PCI spec requires that all MSI-X table accesses are either DWORD or QWORD,
>>> +  * size aligned.  Some guests seem to violate this rule for read accesses,
>>> +  * performing single byte reads.  Since it's easy to support this, let's do so.
>>> +  * Also support 16 bit size aligned reads, just in case.
>>> +  */
>>> +static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
>>>   {
>>> -    fprintf(stderr, "MSI-X: only dword read is allowed!\n");
>>> -    return 0;
>>> +    PCIDevice *dev = opaque;
>>> +    unsigned int offset = addr&   (MSIX_PAGE_SIZE - 1)&   ~0x1;
>>> +    void *page = dev->msix_table_page;
>>> +
>>> +    return pci_get_word(page + offset);
>>> +}
>>> +
>>> +static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
>>> +{
>>> +    PCIDevice *dev = opaque;
>>> +    unsigned int offset = addr&   (MSIX_PAGE_SIZE - 1);
>>> +    void *page = dev->msix_table_page;
>>> +
>>> +    return pci_get_byte(page + offset);
>>>   }
>>>
>>>   static uint8_t msix_pending_mask(int vector)
>>> @@ -192,6 +210,11 @@ static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
>>>       msix_handle_mask_update(dev, vector);
>>>   }
>>>
>>> +/* PCI spec:
>>> + * For all accesses to MSI-X Table and MSI-X PBA fields, software must use
>>> + * aligned full DWORD or aligned full QWORD transactions; otherwise, the result
>>> + * is undefined.
>>> + */
>>>   static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
>>>                                         uint32_t val)
>>>   {
>>> @@ -203,7 +226,7 @@ static CPUWriteMemoryFunc * const msix_mmio_write[] = {
>>>   };
>>>
>>>   static CPUReadMemoryFunc * const msix_mmio_read[] = {
>>> -    msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
>>> +    msix_mmio_readb, msix_mmio_readw, msix_mmio_readl
>>>   };
>>>
>>>   /* Should be called from device's map method. */
>>>        
>
Michael S. Tsirkin Nov. 17, 2010, 4:30 p.m. UTC | #5
On Wed, Nov 17, 2010 at 05:12:12PM +0100, Bernhard Kohl wrote:
> I will fix the guest and inform you after testing.
> Then we can skip this patch.

OK, that's best,
Bernhard Kohl Nov. 24, 2010, 2:35 p.m. UTC | #6
Am 17.11.2010 17:30, schrieb ext Michael S. Tsirkin:
> On Wed, Nov 17, 2010 at 05:12:12PM +0100, Bernhard Kohl wrote:
>    
>> I will fix the guest and inform you after testing.
>> Then we can skip this patch.
>>      
> OK, that's best,
>
>
>    
Yes, the guest is fixed now. Only dword accesses to msix!

Thanks for your support
Bernhard
diff mbox

Patch

diff --git a/hw/msix.c b/hw/msix.c
index f66d255..38dff59 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -102,10 +102,28 @@  static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
     return pci_get_long(page + offset);
 }
 
-static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
+ /* Note:
+  * PCI spec requires that all MSI-X table accesses are either DWORD or QWORD,
+  * size aligned.  Some guests seem to violate this rule for read accesses,
+  * performing single byte reads.  Since it's easy to support this, let's do so.
+  * Also support 16 bit size aligned reads, just in case.
+  */                                                                                                            
+static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
 {
-    fprintf(stderr, "MSI-X: only dword read is allowed!\n");
-    return 0;
+    PCIDevice *dev = opaque;
+    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x1;
+    void *page = dev->msix_table_page;
+
+    return pci_get_word(page + offset);
+}
+
+static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    PCIDevice *dev = opaque;
+    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1);
+    void *page = dev->msix_table_page;
+
+    return pci_get_byte(page + offset);
 }
 
 static uint8_t msix_pending_mask(int vector)
@@ -192,6 +210,11 @@  static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
     msix_handle_mask_update(dev, vector);
 }
 
+/* PCI spec:
+ * For all accesses to MSI-X Table and MSI-X PBA fields, software must use
+ * aligned full DWORD or aligned full QWORD transactions; otherwise, the result
+ * is undefined.
+ */
 static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
                                       uint32_t val)
 {
@@ -203,7 +226,7 @@  static CPUWriteMemoryFunc * const msix_mmio_write[] = {
 };
 
 static CPUReadMemoryFunc * const msix_mmio_read[] = {
-    msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
+    msix_mmio_readb, msix_mmio_readw, msix_mmio_readl
 };
 
 /* Should be called from device's map method. */