Patchwork [REPOST] fw_cfg: Allow guest to read kernel etc via fast, synchronous "DMA"-type operation.

login
register
mail settings
Submitter Richard W.M. Jones
Date Aug. 3, 2010, 2:49 p.m.
Message ID <20100803144931.GE22211@amd.home.annexia.org>
Download mbox | patch
Permalink /patch/60765/
State New
Headers show

Comments

Richard W.M. Jones - Aug. 3, 2010, 2:49 p.m.
I rebased this and rechecked it.  The *total* libguestfs boot time
goes from 86 seconds down to 7.7 seconds.  The proportion of that
attributable to loading the appliance is approximately 650 times [sic]
faster.

Rich.
Paul Brook - Nov. 25, 2010, 5:52 p.m.
> I rebased this and rechecked it.  The *total* libguestfs boot time
> goes from 86 seconds down to 7.7 seconds.  The proportion of that
> attributable to loading the appliance is approximately 650 times [sic]
> faster.

Nack.

> +/* Target address and size for DMA operations.  This is only used
> + * during boot and across 32 and 64 bit architectures, so only writes
> + * to lower 4GB addresses are supported.
> + */
> +static uint32_t dma_addr = 0;
> +static uint32_t dma_size = 0;

Storing per-device state in global variables is just plain wrong.
Limiting addresses to 32-bits is just lame. You can do better.
It's also completely broken on big-endian hosts.

Paul

Patch

diff --git a/hw/fw_cfg.c b/hw/fw_cfg.c
index 37e6f1f..a881bd5 100644
--- a/hw/fw_cfg.c
+++ b/hw/fw_cfg.c
@@ -55,6 +55,13 @@  struct FWCfgState {
     uint32_t cur_offset;
 };
 
+/* Target address and size for DMA operations.  This is only used
+ * during boot and across 32 and 64 bit architectures, so only writes
+ * to lower 4GB addresses are supported.
+ */
+static uint32_t dma_addr = 0;
+static uint32_t dma_size = 0;
+
 static void fw_cfg_write(FWCfgState *s, uint8_t value)
 {
     int arch = !!(s->cur_entry & FW_CFG_ARCH_LOCAL);
@@ -98,7 +105,22 @@  static uint8_t fw_cfg_read(FWCfgState *s)
 
     if (s->cur_entry == FW_CFG_INVALID || !e->data || s->cur_offset >= e->len)
         ret = 0;
-    else
+    else if (s->cur_entry & FW_CFG_DMA) {
+        if (dma_size > e->len - s->cur_offset)
+            dma_size = e->len - s->cur_offset;
+
+        cpu_physical_memory_write ((target_phys_addr_t) dma_addr,
+                                   &e->data[s->cur_offset],
+                                   dma_size);
+        s->cur_offset += e->len;
+        /* Returns 0 if there was an error, 1 if the DMA operation
+         * started.  Callers *must* poll for completion of the
+         * operation (waiting for FW_CFG_DMA_DONE == 0), even though
+         * in the current implementation the operation completes
+         * instantaneously from the p.o.v of the current guest vCPU.
+         */
+        ret = 1;
+    } else
         ret = e->data[s->cur_offset++];
 
     FW_CFG_DPRINTF("read %d\n", ret);
@@ -352,6 +374,19 @@  FWCfgState *fw_cfg_init(uint32_t ctl_port, uint32_t data_port,
     fw_cfg_add_i16(s, FW_CFG_MAX_CPUS, (uint16_t)max_cpus);
     fw_cfg_add_i16(s, FW_CFG_BOOT_MENU, (uint16_t)boot_menu);
 
+    fw_cfg_add_i32(s, FW_CFG_FEATURES, FW_CFG_FEATURES_DMA);
+
+    fw_cfg_add_bytes(s, FW_CFG_DMA_ADDR | FW_CFG_WRITE_CHANNEL,
+                     (uint8_t *)&dma_addr, sizeof dma_addr);
+    fw_cfg_add_bytes(s, FW_CFG_DMA_SIZE | FW_CFG_WRITE_CHANNEL,
+                     (uint8_t *)&dma_size, sizeof dma_size);
+    /* Current implementation is synchronous, so this value always reads
+     * as 0 (meaning "done").  In other possible implementations, this
+     * could return > 0 indicating that the caller should continue polling
+     * for completion of the operation.
+     */
+    fw_cfg_add_i32(s, FW_CFG_DMA_DONE, 0);
+
     return s;
 }
 
diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index 4d13a4f..31ee4ff 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -27,17 +27,31 @@ 
 #define FW_CFG_SETUP_SIZE       0x17
 #define FW_CFG_SETUP_DATA       0x18
 #define FW_CFG_FILE_DIR         0x19
-
 #define FW_CFG_FILE_FIRST       0x20
 #define FW_CFG_FILE_SLOTS       0x10
-#define FW_CFG_MAX_ENTRY        (FW_CFG_FILE_FIRST+FW_CFG_FILE_SLOTS)
+#define FW_CFG_FEATURES         0x30
+#define FW_CFG_DMA_ADDR         0x31
+#define FW_CFG_DMA_SIZE         0x32
+#define FW_CFG_DMA_DONE         0x33
+
+#define FW_CFG_MAX_ENTRY        (FW_CFG_DMA_DONE+1)
 
+#define FW_CFG_DMA              0x2000
 #define FW_CFG_WRITE_CHANNEL    0x4000
 #define FW_CFG_ARCH_LOCAL       0x8000
-#define FW_CFG_ENTRY_MASK       ~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)
+#define FW_CFG_ENTRY_MASK       ~(FW_CFG_DMA | FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)
 
 #define FW_CFG_INVALID          0xffff
 
+/* Bitmask of features returned by reading FW_CFG_FEATURES entry.
+ * Older versions of qemu didn't implement this, but they returned 0
+ * when you read an invalid entry.  So 0 indicates that only the basic
+ * features < entry 0x30 are supported (in reality it's not well
+ * defined because features were incrementally added over time with no
+ * feature detection interface).
+ */
+#define FW_CFG_FEATURES_DMA     0x00000001
+
 #ifndef NO_QEMU_PROTOS
 typedef struct FWCfgFile {
     uint32_t  size;        /* file size */
diff --git a/pc-bios/optionrom/linuxboot.S b/pc-bios/optionrom/linuxboot.S
index c109363..d86973a 100644
--- a/pc-bios/optionrom/linuxboot.S
+++ b/pc-bios/optionrom/linuxboot.S
@@ -106,11 +106,20 @@  copy_kernel:
 	/* We're now running in 16-bit CS, but 32-bit ES! */
 
 	/* Load kernel and initrd */
-	read_fw_blob_addr32(FW_CFG_KERNEL)
+	read_fw         FW_CFG_FEATURES
+	andl            $FW_CFG_FEATURES_DMA, %eax
+	jz no_dma
+	read_fw_blob_dma(FW_CFG_KERNEL)
+	read_fw_blob_dma(FW_CFG_INITRD)
+	read_fw_blob_dma(FW_CFG_CMDLINE)
+	read_fw_blob_dma(FW_CFG_SETUP)
+	jz jmp_linux
+no_dma:	read_fw_blob_addr32(FW_CFG_KERNEL)
 	read_fw_blob_addr32(FW_CFG_INITRD)
 	read_fw_blob_addr32(FW_CFG_CMDLINE)
 	read_fw_blob_addr32(FW_CFG_SETUP)
 
+jmp_linux:
 	/* And now jump into Linux! */
 	mov		$0, %eax
 	mov		%eax, %cr0
diff --git a/pc-bios/optionrom/optionrom.h b/pc-bios/optionrom/optionrom.h
index fbdd48a..b8c2a76 100644
--- a/pc-bios/optionrom/optionrom.h
+++ b/pc-bios/optionrom/optionrom.h
@@ -50,6 +50,27 @@ 
 	bswap		%eax
 .endm
 
+/*
+ * Write %eax to a variable in the fw_cfg device.
+ * In:          %eax
+ * Clobbers:	%edx
+ */
+.macro write_fw VAR
+        push            %eax
+        mov             $(\VAR|FW_CFG_WRITE_CHANNEL), %ax
+        mov             $BIOS_CFG_IOPORT_CFG, %dx
+        outw            %ax, (%dx)
+        pop             %eax
+        mov             $BIOS_CFG_IOPORT_DATA, %dx
+        outb            %al, (%dx)
+        shr             $8, %eax
+        outb            %al, (%dx)
+        shr             $8, %eax
+        outb            %al, (%dx)
+        shr             $8, %eax
+        outb            %al, (%dx)
+.endm
+
 #define read_fw_blob_pre(var)				\
 	read_fw		var ## _ADDR;			\
 	mov		%eax, %edi;			\
@@ -87,6 +108,28 @@ 
 	*/						\
 	.dc.b		0x67,0xf3,0x6c
 
+/*
+ * Fast DMA of data from fw_cfg device into physical memory.
+ *
+ * Check FW_CFG_FEATURES contains FW_CFG_FEATURES_DMA before
+ * using this macro.
+ */
+#define read_fw_blob_dma(var)                           \
+        read_fw         var ## _ADDR;                   \
+        write_fw        FW_CFG_DMA_ADDR;                \
+        read_fw         var ## _SIZE;                   \
+        write_fw        FW_CFG_DMA_SIZE;                \
+        mov             $(var ## _DATA|FW_CFG_DMA), %ax; \
+        mov             $BIOS_CFG_IOPORT_CFG, %edx;     \
+        outw            %ax, (%dx);                     \
+        mov             $BIOS_CFG_IOPORT_DATA, %dx;     \
+        inb             (%dx), %al;                     \
+     1: test            %al, %al;                       \
+        jz              1f;                             \
+        read_fw         FW_CFG_DMA_DONE;                \
+        jmp 1b;                                         \
+     1:
+
 #define OPTION_ROM_START					\
     .code16;						\
     .text;						\