diff mbox

[14/40] xenner: kernel: Instruction emulator

Message ID 1288623713-28062-15-git-send-email-agraf@suse.de
State New
Headers show

Commit Message

Alexander Graf Nov. 1, 2010, 3:01 p.m. UTC
In some cases we need to emulate guest instructions. This patch adds
code to take care of this.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 pc-bios/xenner/xenner-instr.c |  405 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 405 insertions(+), 0 deletions(-)
 create mode 100644 pc-bios/xenner/xenner-instr.c

Comments

malc Nov. 1, 2010, 3:41 p.m. UTC | #1
On Mon, 1 Nov 2010, Alexander Graf wrote:

> In some cases we need to emulate guest instructions. This patch adds
> code to take care of this.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>
> ---
>  pc-bios/xenner/xenner-instr.c |  405 +++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 405 insertions(+), 0 deletions(-)
>  create mode 100644 pc-bios/xenner/xenner-instr.c
> 
> diff --git a/pc-bios/xenner/xenner-instr.c b/pc-bios/xenner/xenner-instr.c
> new file mode 100644
> index 0000000..11be2ce
> --- /dev/null
> +++ b/pc-bios/xenner/xenner-instr.c
> @@ -0,0 +1,405 @@
> +/*
> + *  Copyright (C) Red Hat 2007
> + *  Copyright (C) Novell Inc. 2010
> + *
> + *  Author(s): Gerd Hoffmann <kraxel@redhat.com>
> + *             Alexander Graf <agraf@suse.de>
> + *
> + *  Xenner instruction emulator
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; under version 2 of the License.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License along
> + *  with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "xenner.h"
> +#include "msr-index.h"
> +#include "cpufeature.h"
> +
> +void real_cpuid(struct kvm_cpuid_entry *entry)
> +{
> +    asm volatile("cpuid"
> +                 : "=a" (entry->eax),
> +                   "=b" (entry->ebx),
> +                   "=c" (entry->ecx),
> +                   "=d" (entry->edx)
> +                 : "a" (entry->function));
> +}
> +
> +static unsigned long clear_cpuid_bit(unsigned long bit, unsigned long x)
> +{
> +    unsigned long r = x;

This assignment serves no purpose.

> +
> +    bit %= 64;
> +    r = x & ~(1 << bit);
> +
> +    return r;
> +}
> +
> +static void filter_cpuid(struct kvm_cpuid_entry *entry)
> +{
> +    switch (entry->function) {
> +    case 0x00000001:
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_SEP, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_DS, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_DS, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_ACC, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_PBE, entry->edx);
> +
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_DTES64, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_MWAIT, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_DSCPL, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_VMXE, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_SMXE, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_EST, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_TM2, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_XTPR, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_PDCM, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_DCA, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_XSAVE, entry->ecx);
> +        /* fall through */
> +    case 0x80000001:
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_VME, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_PSE, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_PGE, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_MCE, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_MCA, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_MTRR, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_PSE36, entry->edx);
> +
> +#ifdef CONFIG_32BIT
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_LM, entry->edx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_LAHF_LM, entry->ecx);
> +#endif
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_PAGE1GB, entry->edx);
> +        entry->edx = clear_cpuid_bit(X86_FEATURE_RDTSCP, entry->edx);
> +
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_SVME, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_OSVW, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_IBS, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_SKINIT, entry->ecx);
> +        entry->ecx = clear_cpuid_bit(X86_FEATURE_WDT, entry->ecx);
> +        break;
> +
> +    case 0x00000005: /* MONITOR/MWAIT */
> +    case 0x0000000a: /* Architectural Performance Monitor Features */
> +    case 0x8000000a: /* SVM revision and features */
> +    case 0x8000001b: /* Instruction Based Sampling */
> +        entry->eax = 0;
> +        entry->ebx = 0;
> +        entry->ecx = 0;
> +        entry->edx = 0;
> +        break;
> +    }
> +}
> +
> +static void emulate_cpuid(struct regs *regs)
> +{
> +    struct kvm_cpuid_entry entry;
> +
> +    entry.function = regs->rax;
> +    real_cpuid(&entry);
> +    filter_cpuid(&entry);
> +    regs->rax = entry.eax;
> +    regs->rbx = entry.ebx;
> +    regs->rcx = entry.ecx;
> +    regs->rdx = entry.edx;
> +    printk(2, "cpuid 0x%08x: eax 0x%08x ebx 0x%08x ecx 0x%08x edx 0x%08x\n",
> +           entry.function, entry.eax, entry.ebx, entry.ecx, entry.edx);
> +}
> +
> +static void emulate_rdmsr(struct regs *regs)
> +{
> +    uint32_t ax,dx;
> +    switch (regs->rcx) {
> +    case MSR_EFER:
> +    case MSR_FS_BASE:
> +    case MSR_GS_BASE:
> +    case MSR_KERNEL_GS_BASE:
> +        /* white listed */
> +        rdmsr(regs->rcx, &ax, &dx);
> +        regs->rax = ax;
> +        regs->rdx = dx;
> +        break;
> +    default:
> +        printk(1, "%s: ignore: rcx 0x%" PRIxREG "\n", __FUNCTION__, regs->rcx);
> +        regs->rax = 0;
> +        regs->rdx = 0;
> +        break;
> +    }
> +}
> +
> +static void emulate_wrmsr(struct regs *regs)
> +{
> +    static const uint64_t known = (EFER_NX|EFER_LMA|EFER_LME|EFER_SCE);
> +    static const uint64_t fixed = (EFER_LMA|EFER_LME|EFER_SCE);
> +    uint32_t ax,dx;
> +
> +    switch (regs->rcx) {
> +    case MSR_EFER:
> +        if (regs->rax & ~known) {
> +            printk(1, "%s: efer: unknown bit set\n", __FUNCTION__);
> +            goto out;
> +        }
> +
> +        rdmsr(regs->rcx, &ax, &dx);
> +        if ((regs->rax & fixed) != (ax & fixed)) {
> +            printk(1, "%s: efer: modify fixed bit\n", __FUNCTION__);
> +            goto out;
> +        }
> +
> +        printk(1, "%s: efer:%s%s%s%s\n", __FUNCTION__,
> +               regs->rax & EFER_SCE ? " sce" : "",
> +               regs->rax & EFER_LME ? " lme" : "",
> +               regs->rax & EFER_LMA ? " lma" : "",
> +               regs->rax & EFER_NX  ? " nx"  : "");
> +        /* fall through */
> +    case MSR_FS_BASE:
> +    case MSR_GS_BASE:
> +    case MSR_KERNEL_GS_BASE:
> +        wrmsr(regs->rcx, regs->rax, regs->rdx);
> +        return;
> +    }
> +
> +out:
> +    printk(1, "%s: ignore: 0x%" PRIxREG " 0x%" PRIxREG ":0x%" PRIxREG "\n",
> +           __FUNCTION__, regs->rcx, regs->rdx, regs->rax);
> +}
> +
> +void print_emu_instr(int level, const char *prefix, uint8_t *instr)
> +{
> +    printk(level, "%s: rip %p bytes %02x %02x %02x %02x  %02x %02x %02x %02x\n",
> +           prefix, instr,
> +           instr[0], instr[1], instr[2], instr[3],
> +           instr[4], instr[5], instr[6], instr[7]);
> +}
> +
> +static ureg_t *decode_reg(struct regs *regs, uint8_t modrm, int rm)
> +{
> +    int shift = rm ? 0 : 3;
> +    ureg_t *reg = NULL;
> +
> +    switch ((modrm >> shift) & 0x07) {
> +    case 0: reg = (ureg_t*)&regs->rax; break;
> +    case 1: reg = (ureg_t*)&regs->rcx; break;
> +    case 2: reg = (ureg_t*)&regs->rdx; break;
> +    case 3: reg = (ureg_t*)&regs->rbx; break;
> +    case 4: reg = (ureg_t*)&regs->rsp; break;
> +    case 5: reg = (ureg_t*)&regs->rbp; break;
> +    case 6: reg = (ureg_t*)&regs->rsi; break;
> +    case 7: reg = (ureg_t*)&regs->rdi; break;
> +    }
> +    return reg;
> +}
> +
> +void print_bits(int level, const char *msg, uint32_t old, uint32_t new,
> +                const char *names[])
> +{
> +    char buf[128];
> +    int pos = 0;
> +    uint32_t mask;
> +    char *mod;
> +    int i;
> +
> +    pos += snprintf(buf+pos, sizeof(buf)-pos, "%s:", msg);
> +    for (i = 0; i < 32; i++) {
> +        mask = 1 << i;
> +        if (new&mask) {
> +            if (old&mask) {
> +                /* bit present */
> +                mod = "";
> +            } else {
> +                /* bit added */
> +                mod = "+";
> +            }
> +        } else {
> +            if (old&mask) {
> +                /* bit removed */
> +                mod = "-";
> +            } else {
> +                /* bit not present */
> +                continue;
> +            }
> +        }
> +        pos += snprintf(buf+pos, sizeof(buf)-pos, " %s%s",
> +                        mod, names[i] ? names[i] : "???");
> +    }
> +    pos += snprintf(buf+pos, sizeof(buf)-pos, "\n");
> +    printk(level, "%s", buf);
> +}
> +
> +int emulate(struct xen_cpu *cpu, struct regs *regs)
> +{
> +    static const uint8_t xen_emu_prefix[5] = {0x0f, 0x0b, 'x','e','n'};
> +    uint8_t *instr;
> +    int skip = 0;
> +    int in = 0;
> +    int shift = 0;
> +    int port = 0;
> +
> +restart:
> +    instr = (void*)regs->rip;
> +
> +    /* prefixes */
> +    if (instr[skip] == 0x66) {
> +        shift = 16;
> +        skip++;
> +    }
> +
> +    /* instructions */
> +    switch (instr[skip]) {
> +    case 0x0f:
> +        switch (instr[skip+1]) {
> +        case 0x06:
> +            /* clts */
> +            clts();
> +            skip += 2;
> +            break;
> +        case 0x09:
> +            /* wbinvd */
> +            __asm__("wbinvd" ::: "memory");
> +            skip += 2;
> +            break;
> +        case 0x0b:
> +            /* ud2a */
> +            if (xen_emu_prefix[2] == instr[skip+2] &&
> +                xen_emu_prefix[3] == instr[skip+3] &&
> +                xen_emu_prefix[4] == instr[skip+4]) {
> +                printk(2, "%s: xen emu prefix\n", __FUNCTION__);
> +                regs->rip += 5;
> +                goto restart;
> +            }
> +            printk(1, "%s: ud2a -- linux kernel BUG()?\n", __FUNCTION__);
> +            /* bounce to guest, hoping it prints more info */
> +            return 0;
> +        case 0x20:
> +        {
> +            /* read control registers */
> +            ureg_t *reg = decode_reg(regs, instr[skip+2], 1);
> +            switch (((instr[skip+2]) >> 3) & 0x07) {
> +            case 0:
> +                *reg = read_cr0();
> +                skip = 3;
> +                break;
> +            case 3:
> +                *reg = frame_to_addr(read_cr3_mfn(cpu));
> +                skip = 3;
> +                break;
> +            case 4:
> +                *reg = read_cr4();
> +                skip = 3;
> +                break;
> +            }
> +            break;
> +        }
> +        case 0x22:
> +        {
> +            /* write control registers */
> +            static const ureg_t cr0_fixed = ~(X86_CR0_TS);
> +            static const ureg_t cr4_fixed = X86_CR4_TSD;
> +            ureg_t *reg = decode_reg(regs, instr[skip+2], 1);
> +            ureg_t cr;
> +            switch (((instr[skip+2]) >> 3) & 0x07) {
> +            case 0:
> +                cr = read_cr0();
> +                if (cr != *reg) {
> +                    if ((cr & cr0_fixed) == (*reg & cr0_fixed)) {
> +                        print_bits(2, "apply cr0 update", cr, *reg, cr0_bits);
> +                        write_cr0(*reg);
> +                    } else {
> +                        print_bits(1, "IGNORE cr0 update", cr, *reg, cr0_bits);
> +                    }
> +                }
> +                skip = 3;
> +                break;
> +            case 4:
> +                cr = read_cr4();
> +                if (cr != *reg) {
> +                    if ((cr & cr4_fixed) == (*reg & cr4_fixed)) {
> +                        print_bits(1, "apply cr4 update", cr, *reg, cr4_bits);
> +                        write_cr4(*reg);
> +                    } else {
> +                        print_bits(1, "IGNORE cr4 update", cr, *reg, cr4_bits);
> +                    }
> +                }
> +                skip = 3;
> +                break;
> +            }
> +            break;
> +        }
> +        case 0x30:
> +            /* wrmsr */
> +            emulate_wrmsr(regs);
> +            skip += 2;
> +            break;
> +        case 0x32:
> +            /* rdmsr */
> +            emulate_rdmsr(regs);
> +            skip += 2;
> +            break;
> +        case 0xa2:
> +            /* cpuid */
> +            emulate_cpuid(regs);
> +            skip += 2;
> +            break;
> +        }
> +        break;
> +
> +    case 0xe4: /* in     <next byte>,%al */
> +    case 0xe5:
> +        in = (instr[skip] & 1) ? 2 : 1;
> +        port = instr[skip+1];
> +        skip += 2;
> +        break;
> +    case 0xec: /* in     (%dx),%al */
> +    case 0xed:
> +        in = (instr[skip] & 1) ? 2 : 1;
> +        port = regs->rdx & 0xffff;
> +        skip += 1;
> +        break;
> +    case 0xe6: /* out    %al,<next byte> */
> +    case 0xe7:
> +        port = instr[skip+1];
> +        skip += 2;
> +        break;
> +    case 0xee: /* out    %al,(%dx) */
> +    case 0xef:
> +        port = regs->rdx & 0xffff;
> +        skip += 1;
> +        break;
> +
> +    case 0xfa:
> +        /* cli */
> +        guest_cli(cpu);
> +        skip += 1;
> +        break;
> +    case 0xfb:
> +        /* sti */
> +        guest_sti(cpu);
> +        skip += 1;
> +        break;
> +    }
> +
> +    /* unknown instruction */
> +    if (!skip) {
> +        print_emu_instr(0, "instr emu failed", instr);
> +        return -1;
> +    }
> +
> +    /* I/O instruction */
> +    if (in == 2) {
> +        regs->rax |= 0xffffffff;
> +    } else if (in == 1) {
> +        regs->rax |= (0xffff << shift);
> +    }
> +
> +    return skip;
> +}
>
Paolo Bonzini Nov. 1, 2010, 6:46 p.m. UTC | #2
On 11/01/2010 04:01 PM, Alexander Graf wrote:
> +    /* I/O instruction */
> +    if (in == 2) {
> +        regs->rax |= 0xffffffff;
> +    } else if (in == 1) {
> +        regs->rax |= (0xffff<<  shift);
> +    }

I don't understand this, and also why it's here rather than near case 
0xe4/0xe5/0xec/0xed.

Paolo
diff mbox

Patch

diff --git a/pc-bios/xenner/xenner-instr.c b/pc-bios/xenner/xenner-instr.c
new file mode 100644
index 0000000..11be2ce
--- /dev/null
+++ b/pc-bios/xenner/xenner-instr.c
@@ -0,0 +1,405 @@ 
+/*
+ *  Copyright (C) Red Hat 2007
+ *  Copyright (C) Novell Inc. 2010
+ *
+ *  Author(s): Gerd Hoffmann <kraxel@redhat.com>
+ *             Alexander Graf <agraf@suse.de>
+ *
+ *  Xenner instruction emulator
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xenner.h"
+#include "msr-index.h"
+#include "cpufeature.h"
+
+/*
+ * Execute CPUID for the leaf in entry->function and store the resulting
+ * EAX/EBX/ECX/EDX values in the entry.
+ *
+ * ECX is loaded with 0 before the instruction.  Several leaves (4, 7, 0xb,
+ * 0xd, ...) take a sub-leaf index in ECX; without an explicit input their
+ * output would depend on whatever value happened to be in the register.
+ * Only sub-leaf 0 is reachable through this interface.
+ */
+void real_cpuid(struct kvm_cpuid_entry *entry)
+{
+    asm volatile("cpuid"
+                 : "=a" (entry->eax),
+                   "=b" (entry->ebx),
+                   "=c" (entry->ecx),
+                   "=d" (entry->edx)
+                 : "a" (entry->function),
+                   "c" (0));
+}
+
+/*
+ * Return x with a single CPUID feature bit cleared.
+ *
+ * bit is a feature number; only its position inside a 32-bit CPUID
+ * register is relevant here, so it is reduced modulo 32 (presumably the
+ * X86_FEATURE_* constants are encoded as word * 32 + bit -- confirm
+ * against cpufeature.h).  This also keeps the shift count below the width
+ * of the shifted operand.
+ */
+static unsigned long clear_cpuid_bit(unsigned long bit, unsigned long x)
+{
+    return x & ~(1UL << (bit % 32));
+}
+
+/*
+ * Clear the EDX feature bits that are masked for both leaf 0x00000001 and
+ * leaf 0x80000001.
+ */
+static void filter_cpuid_shared_edx(struct kvm_cpuid_entry *entry)
+{
+    entry->edx = clear_cpuid_bit(X86_FEATURE_VME, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_PSE, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_PGE, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_MCE, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_MCA, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_MTRR, entry->edx);
+    entry->edx = clear_cpuid_bit(X86_FEATURE_PSE36, entry->edx);
+}
+
+/*
+ * Mask feature bits out of a CPUID result, depending on entry->function.
+ *
+ *  - 0x00000001 and 0x80000001: selected feature flags are cleared.
+ *  - 0x00000005, 0x0000000a, 0x8000000a, 0x8000001b: all four registers
+ *    are zeroed.
+ *  - every other leaf is left unmodified.
+ *
+ * The feature numbers that belong to leaf 0x80000001 only (LM, LAHF_LM,
+ * PAGE1GB, RDTSCP, SVME, OSVW, IBS, SKINIT, WDT) are applied to that leaf
+ * alone.  clear_cpuid_bit() only looks at the bit position, and assuming
+ * the usual word * 32 + bit encoding of X86_FEATURE_* those positions hold
+ * unrelated flags in leaf 1 (e.g. EDX bit 26 is SSE2, ECX bit 13 is
+ * CMPXCHG16B) -- confirm against cpufeature.h.
+ */
+static void filter_cpuid(struct kvm_cpuid_entry *entry)
+{
+    switch (entry->function) {
+    case 0x00000001:
+        entry->edx = clear_cpuid_bit(X86_FEATURE_SEP, entry->edx);
+        entry->edx = clear_cpuid_bit(X86_FEATURE_DS, entry->edx);
+        entry->edx = clear_cpuid_bit(X86_FEATURE_ACC, entry->edx);
+        entry->edx = clear_cpuid_bit(X86_FEATURE_PBE, entry->edx);
+
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_DTES64, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_MWAIT, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_DSCPL, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_VMXE, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_SMXE, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_EST, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_TM2, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_XTPR, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_PDCM, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_DCA, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_XSAVE, entry->ecx);
+
+        filter_cpuid_shared_edx(entry);
+        break;
+
+    case 0x80000001:
+        filter_cpuid_shared_edx(entry);
+
+#ifdef CONFIG_32BIT
+        entry->edx = clear_cpuid_bit(X86_FEATURE_LM, entry->edx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_LAHF_LM, entry->ecx);
+#endif
+        entry->edx = clear_cpuid_bit(X86_FEATURE_PAGE1GB, entry->edx);
+        entry->edx = clear_cpuid_bit(X86_FEATURE_RDTSCP, entry->edx);
+
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_SVME, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_OSVW, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_IBS, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_SKINIT, entry->ecx);
+        entry->ecx = clear_cpuid_bit(X86_FEATURE_WDT, entry->ecx);
+        break;
+
+    case 0x00000005: /* MONITOR/MWAIT */
+    case 0x0000000a: /* Architectural Performance Monitor Features */
+    case 0x8000000a: /* SVM revision and features */
+    case 0x8000001b: /* Instruction Based Sampling */
+        entry->eax = 0;
+        entry->ebx = 0;
+        entry->ecx = 0;
+        entry->edx = 0;
+        break;
+
+    default:
+        /* passed through unmodified */
+        break;
+    }
+}
+
+/*
+ * Emulate CPUID: run the instruction for the leaf in regs->rax, filter the
+ * result, log it at level 2 and store it in the rax/rbx/rcx/rdx slots of
+ * the register frame.
+ */
+static void emulate_cpuid(struct regs *regs)
+{
+    struct kvm_cpuid_entry leaf;
+
+    leaf.function = regs->rax;
+    real_cpuid(&leaf);
+    filter_cpuid(&leaf);
+
+    printk(2, "cpuid 0x%08x: eax 0x%08x ebx 0x%08x ecx 0x%08x edx 0x%08x\n",
+           leaf.function, leaf.eax, leaf.ebx, leaf.ecx, leaf.edx);
+
+    /* hand the filtered values back through the register frame */
+    regs->rdx = leaf.edx;
+    regs->rcx = leaf.ecx;
+    regs->rbx = leaf.ebx;
+    regs->rax = leaf.eax;
+}
+
+/*
+ * Emulate RDMSR for the MSR selected by regs->rcx.
+ *
+ * EFER, FS_BASE, GS_BASE and KERNEL_GS_BASE are read via rdmsr() and
+ * returned in rax (low half) and rdx (high half).  Any other MSR is
+ * logged at level 1 and reads as zero.
+ */
+static void emulate_rdmsr(struct regs *regs)
+{
+    uint32_t lo, hi;
+
+    switch (regs->rcx) {
+    case MSR_EFER:
+    case MSR_FS_BASE:
+    case MSR_GS_BASE:
+    case MSR_KERNEL_GS_BASE:
+        /* white listed, handled below */
+        break;
+    default:
+        printk(1, "%s: ignore: rcx 0x%" PRIxREG "\n", __FUNCTION__, regs->rcx);
+        regs->rax = 0;
+        regs->rdx = 0;
+        return;
+    }
+
+    rdmsr(regs->rcx, &lo, &hi);
+    regs->rax = lo;
+    regs->rdx = hi;
+}
+
+/*
+ * Emulate WRMSR for the MSR selected by regs->rcx, value in rdx:rax.
+ *
+ * FS_BASE, GS_BASE and KERNEL_GS_BASE are written through unchanged.
+ * EFER is written only if no bit outside NX/LMA/LME/SCE is set and the
+ * LMA/LME/SCE bits match the current EFER value, i.e. only NX may be
+ * changed.  Everything else is logged at level 1 and ignored.
+ */
+static void emulate_wrmsr(struct regs *regs)
+{
+    static const uint64_t known = (EFER_NX|EFER_LMA|EFER_LME|EFER_SCE);
+    static const uint64_t fixed = (EFER_LMA|EFER_LME|EFER_SCE);
+    uint32_t ax,dx;
+
+    switch (regs->rcx) {
+    case MSR_EFER:
+        /*
+         * edx carries EFER bits 63:32, none of which is in "known".  It is
+         * passed to wrmsr() below, so it has to be checked along with rax.
+         */
+        if ((regs->rax & ~known) || (uint32_t)regs->rdx) {
+            printk(1, "%s: efer: unknown bit set\n", __FUNCTION__);
+            goto out;
+        }
+
+        rdmsr(regs->rcx, &ax, &dx);
+        if ((regs->rax & fixed) != (ax & fixed)) {
+            printk(1, "%s: efer: modify fixed bit\n", __FUNCTION__);
+            goto out;
+        }
+
+        printk(1, "%s: efer:%s%s%s%s\n", __FUNCTION__,
+               regs->rax & EFER_SCE ? " sce" : "",
+               regs->rax & EFER_LME ? " lme" : "",
+               regs->rax & EFER_LMA ? " lma" : "",
+               regs->rax & EFER_NX  ? " nx"  : "");
+        /* fall through */
+    case MSR_FS_BASE:
+    case MSR_GS_BASE:
+    case MSR_KERNEL_GS_BASE:
+        wrmsr(regs->rcx, regs->rax, regs->rdx);
+        return;
+    }
+
+out:
+    printk(1, "%s: ignore: 0x%" PRIxREG " 0x%" PRIxREG ":0x%" PRIxREG "\n",
+           __FUNCTION__, regs->rcx, regs->rdx, regs->rax);
+}
+
+/*
+ * Log the address of an instruction and its first eight bytes at the
+ * given level, tagged with prefix.  Eight bytes are always read from
+ * instr, whatever the real instruction length is.
+ */
+void print_emu_instr(int level, const char *prefix, uint8_t *instr)
+{
+    const uint8_t *b = instr;
+
+    printk(level, "%s: rip %p bytes %02x %02x %02x %02x  %02x %02x %02x %02x\n",
+           prefix, instr,
+           b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
+}
+
+/*
+ * Map a three-bit register field of a ModRM byte to the matching slot in
+ * the register frame.
+ *
+ * rm != 0 selects bits 2:0 of modrm, rm == 0 selects bits 5:3.  Only the
+ * eight registers rax, rcx, rdx, rbx, rsp, rbp, rsi and rdi are
+ * addressable, so the result is never NULL.
+ */
+static ureg_t *decode_reg(struct regs *regs, uint8_t modrm, int rm)
+{
+    ureg_t *const gpr[8] = {
+        (ureg_t*)&regs->rax,
+        (ureg_t*)&regs->rcx,
+        (ureg_t*)&regs->rdx,
+        (ureg_t*)&regs->rbx,
+        (ureg_t*)&regs->rsp,
+        (ureg_t*)&regs->rbp,
+        (ureg_t*)&regs->rsi,
+        (ureg_t*)&regs->rdi,
+    };
+    unsigned int field = rm ? modrm : (modrm >> 3);
+
+    return gpr[field & 0x07];
+}
+
+/*
+ * Log which of the 32 bits differ between old and new, or are set in both.
+ *
+ * The line starts with "msg:" and lists every bit that is set in old or
+ * new by name: unprefixed if set in both, "+" if only in new, "-" if only
+ * in old.  names is indexed by bit number 0..31; a NULL entry prints as
+ * "???".
+ *
+ * The line is assembled in a fixed 128 byte buffer.  Output that does not
+ * fit is truncated, the write position never moves past the end of the
+ * buffer, and the line always ends in a newline.
+ */
+void print_bits(int level, const char *msg, uint32_t old, uint32_t new,
+                const char *names[])
+{
+    char buf[128];
+    const int size = sizeof(buf);
+    int pos = 0;
+    int len;
+    int i;
+
+    len = snprintf(buf, size, "%s:", msg);
+    if (len > 0) {
+        pos = len < size ? len : size - 1;
+    }
+
+    for (i = 0; i < 32 && pos < size - 1; i++) {
+        const uint32_t mask = (uint32_t)1 << i;
+        const char *mod;
+
+        if (new & mask) {
+            /* bit present or bit added */
+            mod = (old & mask) ? "" : "+";
+        } else if (old & mask) {
+            /* bit removed */
+            mod = "-";
+        } else {
+            /* bit not present */
+            continue;
+        }
+
+        len = snprintf(buf + pos, size - pos, " %s%s",
+                       mod, names[i] ? names[i] : "???");
+        if (len > 0) {
+            /* a result >= the remaining space means the output was cut */
+            pos = len < size - pos ? pos + len : size - 1;
+        }
+    }
+
+    /* terminate with a newline, overwriting the tail if the line was cut */
+    if (pos > size - 2) {
+        pos = size - 2;
+    }
+    buf[pos] = '\n';
+    buf[pos + 1] = '\0';
+    printk(level, "%s", buf);
+}
+
+/*
+ * Emulate the instruction at regs->rip.
+ *
+ * Handled: an optional 0x66 prefix, the "ud2a 'x' 'e' 'n'" emulation
+ * prefix, clts, wbinvd, mov from cr0/cr3/cr4, mov to cr0/cr4, wrmsr,
+ * rdmsr, cpuid, in, out, cli and sti.  No port I/O is performed: reads
+ * fill the destination with ones, writes are dropped.
+ *
+ * Returns the number of instruction bytes that were emulated (the caller
+ * presumably advances rip by that amount -- confirm at the call site),
+ * 0 for a plain ud2a, which is left to the guest, or -1 if the
+ * instruction is not handled.  The xen emulation prefix is consumed by
+ * advancing regs->rip directly and is not part of the return value.
+ */
+int emulate(struct xen_cpu *cpu, struct regs *regs)
+{
+    static const uint8_t xen_emu_prefix[5] = {0x0f, 0x0b, 'x','e','n'};
+    uint8_t *instr;
+    int prefix;     /* bytes taken by the 0x66 prefix, if any */
+    int skip;       /* bytes decoded so far, prefix included */
+    int opsize16;   /* set when the 0x66 prefix was seen */
+    int in = 0;     /* 0: no port read, 1: byte read, 2: word/dword read */
+
+restart:
+    instr = (void*)regs->rip;
+    prefix = 0;
+    skip = 0;
+    opsize16 = 0;
+
+    /* prefixes */
+    if (instr[skip] == 0x66) {
+        opsize16 = 1;
+        skip++;
+        prefix = skip;
+    }
+
+    /* instructions */
+    switch (instr[skip]) {
+    case 0x0f:
+        switch (instr[skip+1]) {
+        case 0x06:
+            /* clts */
+            clts();
+            skip += 2;
+            break;
+        case 0x09:
+            /* wbinvd */
+            __asm__("wbinvd" ::: "memory");
+            skip += 2;
+            break;
+        case 0x0b:
+            /* ud2a */
+            if (xen_emu_prefix[2] == instr[skip+2] &&
+                xen_emu_prefix[3] == instr[skip+3] &&
+                xen_emu_prefix[4] == instr[skip+4]) {
+                printk(2, "%s: xen emu prefix\n", __FUNCTION__);
+                /* step over the marker and decode what follows it */
+                regs->rip += skip + 5;
+                goto restart;
+            }
+            printk(1, "%s: ud2a -- linux kernel BUG()?\n", __FUNCTION__);
+            /* bounce to guest, hoping it prints more info */
+            return 0;
+        case 0x20:
+        {
+            /* read control registers */
+            ureg_t *reg = decode_reg(regs, instr[skip+2], 1);
+            switch (((instr[skip+2]) >> 3) & 0x07) {
+            case 0:
+                *reg = read_cr0();
+                skip += 3;
+                break;
+            case 3:
+                *reg = frame_to_addr(read_cr3_mfn(cpu));
+                skip += 3;
+                break;
+            case 4:
+                *reg = read_cr4();
+                skip += 3;
+                break;
+            default:
+                /* other control registers: reported as unknown below */
+                break;
+            }
+            break;
+        }
+        case 0x22:
+        {
+            /* write control registers */
+            static const ureg_t cr0_fixed = ~(X86_CR0_TS);
+            static const ureg_t cr4_fixed = X86_CR4_TSD;
+            ureg_t *reg = decode_reg(regs, instr[skip+2], 1);
+            ureg_t cr;
+            switch (((instr[skip+2]) >> 3) & 0x07) {
+            case 0:
+                /* only CR0.TS may differ from the current value */
+                cr = read_cr0();
+                if (cr != *reg) {
+                    if ((cr & cr0_fixed) == (*reg & cr0_fixed)) {
+                        print_bits(2, "apply cr0 update", cr, *reg, cr0_bits);
+                        write_cr0(*reg);
+                    } else {
+                        print_bits(1, "IGNORE cr0 update", cr, *reg, cr0_bits);
+                    }
+                }
+                skip += 3;
+                break;
+            case 4:
+                /* CR4.TSD must keep its current value */
+                cr = read_cr4();
+                if (cr != *reg) {
+                    if ((cr & cr4_fixed) == (*reg & cr4_fixed)) {
+                        print_bits(1, "apply cr4 update", cr, *reg, cr4_bits);
+                        write_cr4(*reg);
+                    } else {
+                        print_bits(1, "IGNORE cr4 update", cr, *reg, cr4_bits);
+                    }
+                }
+                skip += 3;
+                break;
+            default:
+                /* other control registers: reported as unknown below */
+                break;
+            }
+            break;
+        }
+        case 0x30:
+            /* wrmsr */
+            emulate_wrmsr(regs);
+            skip += 2;
+            break;
+        case 0x32:
+            /* rdmsr */
+            emulate_rdmsr(regs);
+            skip += 2;
+            break;
+        case 0xa2:
+            /* cpuid */
+            emulate_cpuid(regs);
+            skip += 2;
+            break;
+        default:
+            /* unsupported two-byte opcode: reported as unknown below */
+            break;
+        }
+        break;
+
+    case 0xe4: /* in     <next byte>,%al */
+    case 0xe5:
+        in = (instr[skip] & 1) ? 2 : 1;
+        skip += 2;
+        break;
+    case 0xec: /* in     (%dx),%al */
+    case 0xed:
+        in = (instr[skip] & 1) ? 2 : 1;
+        skip += 1;
+        break;
+    case 0xe6: /* out    %al,<next byte> */
+    case 0xe7:
+        /* the write is dropped */
+        skip += 2;
+        break;
+    case 0xee: /* out    %al,(%dx) */
+    case 0xef:
+        /* the write is dropped */
+        skip += 1;
+        break;
+
+    case 0xfa:
+        /* cli */
+        guest_cli(cpu);
+        skip += 1;
+        break;
+    case 0xfb:
+        /* sti */
+        guest_sti(cpu);
+        skip += 1;
+        break;
+
+    default:
+        break;
+    }
+
+    /* unknown instruction: nothing beyond the prefix was decoded */
+    if (skip == prefix) {
+        print_emu_instr(0, "instr emu failed", instr);
+        return -1;
+    }
+
+    /*
+     * I/O instruction: nothing is read from hardware, the destination is
+     * filled with ones, limited to the operand width (al, ax or eax).
+     */
+    if (in == 1) {
+        regs->rax |= 0xff;
+    } else if (in == 2) {
+        regs->rax |= opsize16 ? 0xffff : 0xffffffff;
+    }
+
+    return skip;
+}