diff mbox

[V0] x86, mce: Basic support to add LMCE support to QEMU

Message ID 20151209230504.GB15886@otc-brkl-03.jf.intel.com
State New
Headers show

Commit Message

Ashok Raj Dec. 9, 2015, 11:05 p.m. UTC
On Wed, Dec 09, 2015 at 10:07:48PM +0100, Paolo Bonzini wrote:
> 
> 
> On 09/12/2015 20:57, Ashok Raj wrote:
> > +    /*
> > +     * We need to read back the value of MSREXT_MCG_CTL that was set by the
> > +     * guest kernel back into Qemu
> > +     */
> > +    cs->kvm_vcpu_dirty = false;
> > +    cpu_synchronize_state(cs);

This wasn't in my original patch, but was found to be required.

Will have Gong check this and report back.
> 
> This should not be necessary.  I've only skimmed the patches but, apart
> from this, the patches look good.  Eduardo knows more than me about
> machine types and backwards compatibility to older kernels, however, and
> I'm deferring to him on this aspect.
> 
> How was this tested?  (In general, how do you test MCE? :))

We tested on real hardware that supports error injection via EINJ.

One additional patch is required for the testing, to translate
from GPA to HPA. Perhaps we could include it as well, to make testing easy
and to avoid maintaining it out of tree?

Here are the logs from Gong's testing. He has a pretty elaborate setup to
test this.  :-)

Look at the MCGCAP and MCGSTATUS in host and guest for the values
introduced by this change set.

===================================================================================================

dmesg on guest system:
…
[   35.294009] mce: [Hardware Error]: Machine check events logged
[   35.294009] mce: Uncorrected hardware memory error in user-access at 7451b000
[   35.334006] MCE 0x7451b: Killing victim:1822 due to hardware memory corruption
[   35.334515] MCE 0x7451b: dirty mlocked LRU page still referenced by 1 users
[   35.334930] MCE 0x7451b: recovery action for dirty mlocked LRU page: Failed
[   35.335372] mce: Memory error not recovered
…

------------------------------------------------------------------------------------------------------------------------

dmesg on host system:
…
[57629.858659] kvm: zapping shadow pages for mmio generation wraparound
[57629.859592] kvm: zapping shadow pages for mmio generation wraparound
[57637.023199] kvm [46095]: vcpu0 disabled perfctr wrmsr: 0xc2 data 0xffff
[57637.116429] kvm [46095]: vcpu0 unhandled rdmsr: 0x570
[57637.122112] kvm [46095]: vcpu1 unhandled rdmsr: 0x570
[57672.381651] mce: [Hardware Error]: Machine check events logged
[57672.388178] mce: Uncorrected hardware memory error in user-access at 1da71b000
[57672.396057] mce: [Hardware Error]: Machine check events logged
[57672.403345] MCE 0x1da71b: Killing qemu-system-x86:46095 due to hardware memory corruption
[57672.412499] MCE 0x1da71b: recovery action for dirty LRU page: Recovered

===================================================================================================
Mcelog on host system:

[root@BKD06SDP host]# mcelog
Hardware event. This is not a software error.
MCE 0
CPU 68 BANK 1 TSC 835ad3e00dfe
MISC 86 ADDR 1da71b000
TIME 1449669775 Wed Dec  9 09:02:55 2015
MCG status:RIPV EIPV MCIP
MCi status:
Uncorrected error
Error enabled
MCi_MISC register valid
MCi_ADDR register valid
SRAR
MCA: Data CACHE Level-0 Data-Read Error
STATUS bd80000000100134 MCGSTATUS 7
MCGCAP 7000c16 APICID f0 SOCKETID 3
CPUID Vendor Intel Family 6 Model 63
Hardware event. This is not a software error.
MCE 1
CPU 0 BANK 7
MISC 146588a86 ADDR 1da71b000
TIME 1449669775 Wed Dec  9 09:02:55 2015
MCG status:
MCi status:
Uncorrected error
MCi_MISC register valid
MCi_ADDR register valid
MCA: MEMORY CONTROLLER RD_CHANNEL2_ERR
Transaction: Memory read error
STATUS ac00000000010092 MCGSTATUS 0
MCGCAP 7000c16 APICID 0 SOCKETID 0
CPUID Vendor Intel Family 6 Model 63
[root@BKD06SDP host]#

----------------------------------------------------------------------------

GUEST system mcelog:

[root@localhost ~]# cat /var/log/mcelog
mcelog: mcelog server already running
mcelog: mcelog server already running
Hardware event. This is not a software error.
MCE 0
CPU 0 BANK 9 TSC 18ce71469a
RIP 33:401535
MISC 8c ADDR 7451b000
TIME 1449669775 Wed Dec  9 09:02:55 2015
MCG status:EIPV MCIP LMCE
MCi status:
Uncorrected error
Error enabled
MCi_MISC register valid
MCi_ADDR register valid
SRAR
MCA: Data CACHE Level-0 Data-Read Error
STATUS bd80000000000134 MCGSTATUS e
MCGCAP 900010a APICID 0 SOCKETID 0
CPUID Vendor Intel Family 6 Model 6
Signed-off-by: Feng Liu <feng.liu@intel.com>

From: root <root@lmcesrv.bj.intel.com>


---
 hmp-commands.hx       |   14 ++++++++++++++
 include/exec/memory.h |    2 ++
 kvm-all.c             |   24 ++++++++++++++++++++++++
 memory.c              |   13 +++++++++++++
 monitor.c             |   16 ++++++++++++++++
 5 files changed, 69 insertions(+)
 mode change 100644 => 100755 include/exec/memory.h
 mode change 100644 => 100755 kvm-all.c
 mode change 100644 => 100755 memory.c
 mode change 100644 => 100755 monitor.c
diff mbox

Patch

diff --git a/hmp-commands.hx b/hmp-commands.hx
index bb52e4d..673c00e 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -444,6 +444,20 @@  Start gdbserver session (default @var{port}=1234)
 ETEXI
 
     {
+        .name         = "x-gpa2hva",
+        .args_type    = "addr:l",
+        .params       = "addr",
+        .help         = "translate guest physical 'addr' to host virtual address, only for debugging",
+        .mhandler.cmd = do_gpa2hva,
+    },
+
+STEXI
+@item x-gpa2hva @var{addr}
+@findex x-gpa2hva
+Translate guest physical @var{addr} to host virtual address, only for debugging.
+ETEXI
+
+    {
         .name       = "x",
         .args_type  = "fmt:/,addr:l",
         .params     = "/fmt addr",
diff --git a/include/exec/memory.h b/include/exec/memory.h
old mode 100644
new mode 100755
index 0f07159..57d7bf8
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -222,6 +222,7 @@  struct MemoryListener {
                                hwaddr addr, hwaddr len);
     void (*coalesced_mmio_del)(MemoryListener *listener, MemoryRegionSection *section,
                                hwaddr addr, hwaddr len);
+    int  (*translate_gpa2hva)(MemoryListener *listener, uint64_t paddr, uint64_t *vaddr);
     /* Lower = earlier (during add), later (during del) */
     unsigned priority;
     AddressSpace *address_space_filter;
@@ -1123,6 +1124,7 @@  void memory_global_dirty_log_start(void);
 void memory_global_dirty_log_stop(void);
 
 void mtree_info(fprintf_function mon_printf, void *f);
+int  memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr);
 
 /**
  * memory_region_dispatch_read: perform a read directly to the specified
diff --git a/kvm-all.c b/kvm-all.c
old mode 100644
new mode 100755
index c648b81..cb029be
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -197,6 +197,29 @@  static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
     return found;
 }
 
+
+/* Map guest physical paddr to a host virtual address via KVM slots; 0 if found, else 1. */
+static int kvm_translate_gpa2hva(MemoryListener *listener, uint64_t paddr, uint64_t *vaddr)
+{
+    KVMState *s = kvm_state;
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+    int i;
+
+    for (i = 0; i < s->nr_slots; i++) {
+        KVMSlot *mem = &kml->slots[i];
+
+        /* Compare the offset, not the end address, so start_addr + memory_size cannot wrap. */
+        if (paddr >= mem->start_addr && paddr - mem->start_addr < mem->memory_size) {
+            *vaddr = (uint64_t)(uintptr_t)mem->ram + (paddr - mem->start_addr);
+            return 0;
+        }
+    }
+
+    fprintf(stderr, "fail to find target physical addr(0x%" PRIx64
+            ") in KVM memory range\n", paddr);
+    return 1;
+}
+
 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                        hwaddr *phys_addr)
 {
@@ -902,6 +925,7 @@  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
     kml->listener.log_sync = kvm_log_sync;
+    kml->listener.translate_gpa2hva = kvm_translate_gpa2hva;
     kml->listener.priority = 10;
 
     memory_listener_register(&kml->listener, as);
diff --git a/memory.c b/memory.c
old mode 100644
new mode 100755
index e193658..979dcf8
--- a/memory.c
+++ b/memory.c
@@ -2294,6 +2294,19 @@  static const TypeInfo memory_region_info = {
     .instance_finalize  = memory_region_finalize,
 };
 
+/* Try each memory listener's gpa->hva hook in turn; 0 on success, else non-zero. */
+int memory_translate_gpa2hva(hwaddr paddr, uint64_t *vaddr)
+{
+    MemoryListener *cur;
+    int rc = 1; /* stays 1 when no listener provides a hook */
+
+    for (cur = QTAILQ_FIRST(&memory_listeners); cur && rc; cur = QTAILQ_NEXT(cur, link)) {
+        /* A listener without a hook leaves the previous result untouched. */
+        rc = cur->translate_gpa2hva ? cur->translate_gpa2hva(cur, paddr, vaddr) : rc;
+    }
+    return rc;
+}
+
 static void memory_register_types(void)
 {
     type_register_static(&memory_region_info);
diff --git a/monitor.c b/monitor.c
old mode 100644
new mode 100755
index 9a35d72..408e1fa
--- a/monitor.c
+++ b/monitor.c
@@ -76,6 +76,7 @@ 
 #include "qapi-event.h"
 #include "qmp-introspect.h"
 #include "sysemu/block-backend.h"
+#include "exec/memory.h"
 
 /* for hmp_info_irq/pic */
 #if defined(TARGET_SPARC)
@@ -1681,6 +1682,21 @@  static void hmp_acl_remove(Monitor *mon, const QDict *qdict)
     }
 }
 
+/* HMP x-gpa2hva: print the host virtual address backing guest physical 'addr'. */
+static void do_gpa2hva(Monitor *mon, const QDict *qdict)
+{
+    uint64_t paddr = qdict_get_int(qdict, "addr");
+    uint64_t vaddr = 0;
+
+    if (memory_translate_gpa2hva(paddr, &vaddr)) {
+        /* PRIx64 matches uint64_t on every host; %lx does not where long is 32-bit. */
+        monitor_printf(mon, "fail to translate gpa(0x%" PRIx64 ") to hva\n", paddr);
+        return;
+    }
+
+    monitor_printf(mon, "0x%" PRIx64 "\n", vaddr);
+}
+
 void qmp_getfd(const char *fdname, Error **errp)
 {
     mon_fd_t *monfd;