diff mbox series

multiboot: Use DMA instead port-based transfer

Message ID YWM6jboU9fUib6Iy@os.inf.tu-dresden.de
State New
Headers show
Series multiboot: Use DMA instead port-based transfer | expand

Commit Message

Adam Lackorzynski Oct. 10, 2021, 7:10 p.m. UTC
Use DMA transfers in the multiboot loader to copy
data.

This significantly lowers QEMU's startup latency by
a factor of about 40, for example, going from 30sec
to 0.8sec when loading modules of 120MB in size.
This change has been used successfully for some time.

Signed-off-by: Marcus Hähnel <marcus.haehnel@kernkonzept.com>
Signed-off-by: Adam Lackorzynski <adam@l4re.org>
---
 pc-bios/multiboot.bin         | Bin 1024 -> 1536 bytes
 pc-bios/optionrom/multiboot.S |  10 ++---
 pc-bios/optionrom/optionrom.h |  77 ++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+), 5 deletions(-)

Comments

Stefano Garzarella Oct. 19, 2021, 8:28 a.m. UTC | #1
CCing Paolo since kvm-unit-tests use multiboot.

On Sun, Oct 10, 2021 at 09:10:05PM +0200, Adam Lackorzynski wrote:
>Use DMA transfers in the multiboot loader to copy
>data.
>
>This significantly lowers QEMU's startup latency by
>a factor of about 40, for example, going from 30sec
>to 0.8sec when loading modules of 120MB in size.
>This change has been used successfully for some time.
>
>Signed-off-by: Marcus Hähnel <marcus.haehnel@kernkonzept.com>
>Signed-off-by: Adam Lackorzynski <adam@l4re.org>
>---
> pc-bios/multiboot.bin         | Bin 1024 -> 1536 bytes
> pc-bios/optionrom/multiboot.S |  10 ++---
> pc-bios/optionrom/optionrom.h |  77 ++++++++++++++++++++++++++++++++++
> 3 files changed, 82 insertions(+), 5 deletions(-)

I tested with https://gitlab.com/kvm-unit-tests/kvm-unit-tests running:
  export QEMU=`/path/to/qemu-system-x86_64 -L path/to/pc-bios/optionrom`
  time ./run_test.sh

Before this patch:
real	3m5.578s
user	2m32.231s
sys	0m50.390s

After this patch:
real	2m55.614s
user	2m20.536s
sys	0m50.046s

It's still a good improvement, considering the fact that the images 
aren't big.

Just a consideration, for blobs (kernel and initrd) I fully agree to use 
DMA, but for 32bit variables is it useful?

However, this patch LGTM:

Tested-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Paolo Bonzini Oct. 19, 2021, 4:45 p.m. UTC | #2
On 10/10/21 21:10, Adam Lackorzynski wrote:
> Use DMA transfers in the multiboot loader to copy
> data.
> 
> This significantly lowers QEMU's startup latency by
> a factor of about 40, for example, going from 30sec
> to 0.8sec when loading modules of 120MB in size.
> This change has been used successfully for some time.
> 
> Signed-off-by: Marcus Hähnel<marcus.haehnel@kernkonzept.com>
> Signed-off-by: Adam Lackorzynski<adam@l4re.org>
> ---
>   pc-bios/multiboot.bin         | Bin 1024 -> 1536 bytes
>   pc-bios/optionrom/multiboot.S |  10 ++---
>   pc-bios/optionrom/optionrom.h |  77 ++++++++++++++++++++++++++++++++++
>   3 files changed, 82 insertions(+), 5 deletions(-)

This would break migration to QEMU 2.4 and earlier - which is in fact 
why linuxboot has a completely different option ROM when DMA is in use.

On my system (a relatively recent laptop) I get 15-20 MiB per second, 
which is slow but not as slow as what you got.  Out of curiosity, can 
you test what you get with the following kernel patch?

diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 798508e8b6f5..5853ae93bcb2 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -272,7 +272,7 @@ struct fetch_cache {
  };

  struct read_cache {
-	u8 data[1024];
+	u8 data[4096];
  	unsigned long pos;
  	unsigned long end;
  };


Paolo
Marcus Haehnel Oct. 21, 2021, 9:55 p.m. UTC | #3
On Tuesday, October 19, 2021 6:45:44 PM CEST Paolo Bonzini wrote:
> On my system (a relatively recent laptop) I get 15-20 MiB per second, 
> which is slow but not as slow as what you got.  Out of curiosity, can 
> you test what you get with the following kernel patch?
> 
> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
> index 798508e8b6f5..5853ae93bcb2 100644
> --- a/arch/x86/kvm/kvm_emulate.h
> +++ b/arch/x86/kvm/kvm_emulate.h
> @@ -272,7 +272,7 @@ struct fetch_cache {
>   };
> 
>   struct read_cache {
> -	u8 data[1024];
> +	u8 data[4096];
>   	unsigned long pos;
>   	unsigned long end;
>   };

Hi Paolo,

Thank you very much for the cleaned up and improved patch in the other
thread, which solves our issue perfectly! Your work is much appreciated.

Regarding your question above I made some quick benchmark runs. Using a
195kB kernel image and measuring from QEmu start until the first complete
line is sent over the serial output I get the following timings, all 
numbers in seconds:

 kvm?  | DMA Multiboot | Old Multiboot |
-------|---------------|---------------|
no-kvm |  0.209 ± 0.01 | 15.283 ± 0.19 |
kvm    |  0.207 ± 0.01 | 20.771 ± 0.26 |
kvm-4k |  0.208 ± 0.01 | 19.878 ± 0.22 |

The tests were run 10 times using perf stat -r10. The table shows the
averages and standard deviation. While perf does have some overhead
the general issue is independent of that.

Changing the read cache to 4k has a negligible impact compared to running
without kvm. The numbers for DMA are roughly two orders of magnitude better
in all cases.

Hardware: Lenovo T480, Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
Software: Gentoo, Custom Linux 5.14.9 kernel
          QEmu master with your DMA multiboot patches applied, v6.1.0-1564-g1a510366d8-dirty
          configured without any options, only the x86_64 softmmu as target
Commandline: qemu-system-x86_64 -enable-kvm -kernel /path/to/kernel -serial stdio -nographic -monitor none -m 512

General ballpark of results was confirmed on a different T480 with
another OS and QEmu version and on a server system. As you can see we get
not MiB/s when booting through multiboot without DMA but KiB/s.


- Marcus
diff mbox series

Patch

diff --git a/pc-bios/multiboot.bin b/pc-bios/multiboot.bin
index e772713c95749bee82c20002b50ec6d05b2d4987..94dc3fc9644ad2e1c87a3ac62a60cce659ada04f 100644
GIT binary patch
literal 1536
zcmds1&ubG=5S~qT?GnMHKP*K`h!WyKk>(<ZA_&c~7}5R#%H}IVH4u|y53=;|NZ5lX
z@gRsdLGZZ7%WiV$qLfDP5b0%GFG*Uchf0G4*4a(gOZA{s1t09r_ukC=X6C(bckxB^
z9sO|3QaXP>FGWJ<31w0V@s$A79HaepfSLo_l~H>SdrOA+1nDF6ZVLMD9PF-abWu&V
zWk}5eM3U%@N(GUWshU(32FHlqF=gad30cdqAhx0RreGfDM9iLm?nL-e5qFj#=F^{H
z%D!tf-G^v}cmUI{E(q&GxX?QZ_SI(;?7@`$T?fP9>MiCu-Kcz7LUlg93t@lH8}q~d
zTHZC76F*HToUk+QvLE8#=<?q*XLn@7U7x^X4bNrL@NA9`me(v~H<9)9H5sp`{N~M6
zhu`)5wS}AOCrgtVzQF4PNE@2CdY-VlS^1FlP$x=|c9~bo7kO(2`bt8oq%)}IWUN}1
zQ;ZZ`YLvz^h~`vA55^!GS#RepFRTo9nt0grIbio<8pJe<5pijnRTxMke#R=myFg?n
zxF(}zv<=H}jgHu6Ihrc3u_#!JEXBCv>oAaE0QW^Ju!BH;?CT4cr5AU=(%TEz4>-aX
z9c!-dD^A!$-m#<RT{FiIQwnEPXa85~99HO`#-Hfi|IUt-w`oRRy*%e_%|wa(>k5bG
uF?zB=fqgWxPiZtKI!iA9P;TTF7U^DPxqSCdu~<$m7jJ8YQb_SX+SzZT$^;((

delta 357
zcmZqRY2aW9UBz^IvLUP3L@5E5X-u=3ChA$%CorWk9!^c`lw)An&B2h`DaXRlEjNJ)
z$Y5hg>z11YljdMZGd$3DU<Vh&E&<l}y98O^q#b;~flTn9Nb5YlgB!2>!5u;jX+Q%Q
z7(hn+`J5w**TBgMOvmffcCdr>q`glASqriihWSr)9#7i=bQ;h|?9vC*b_jx8#J~WN
w2U?l7Ljb=9E);1=RQnF^VA`-&`~O3R-HelOu^2FAFiqxR{XE%$wSw^v0Movl^Z)<=

diff --git a/pc-bios/optionrom/multiboot.S b/pc-bios/optionrom/multiboot.S
index b7efe4de34..5fe4da6486 100644
--- a/pc-bios/optionrom/multiboot.S
+++ b/pc-bios/optionrom/multiboot.S
@@ -68,10 +68,10 @@  run_multiboot:
 	mov		%eax, %es
 
 	/* Read the bootinfo struct into RAM */
-	read_fw_blob(FW_CFG_INITRD)
+	read_fw_blob_dma(FW_CFG_INITRD)
 
 	/* FS = bootinfo_struct */
-	read_fw		FW_CFG_INITRD_ADDR
+	read_fw_dma_var32	FW_CFG_INITRD_ADDR
 	shr		$4, %eax
 	mov		%ax, %fs
 
@@ -188,14 +188,14 @@  prot_mode:
 	movl		%eax, %gs
 
 	/* Read the kernel and modules into RAM */
-	read_fw_blob(FW_CFG_KERNEL)
+	read_fw_blob_dma(FW_CFG_KERNEL)
 
 	/* Jump off to the kernel */
-	read_fw		FW_CFG_KERNEL_ENTRY
+	read_fw_dma_var32 FW_CFG_KERNEL_ENTRY
 	mov		%eax, %ecx
 
 	/* EBX contains a pointer to the bootinfo struct */
-	read_fw		FW_CFG_INITRD_ADDR
+	read_fw_dma_var32	FW_CFG_INITRD_ADDR
 	movl		%eax, %ebx
 
 	/* EAX has to contain the magic */
diff --git a/pc-bios/optionrom/optionrom.h b/pc-bios/optionrom/optionrom.h
index a2b612f1a7..5a88c7c5a3 100644
--- a/pc-bios/optionrom/optionrom.h
+++ b/pc-bios/optionrom/optionrom.h
@@ -37,6 +37,17 @@ 
 #define BIOS_CFG_IOPORT_CFG	0x510
 #define BIOS_CFG_IOPORT_DATA	0x511
 
+#define FW_CFG_DMA_CTL_ERROR   0x01
+#define FW_CFG_DMA_CTL_READ    0x02
+#define FW_CFG_DMA_CTL_SKIP    0x04
+#define FW_CFG_DMA_CTL_SELECT  0x08
+#define FW_CFG_DMA_CTL_WRITE   0x10
+
+#define FW_CFG_DMA_SIGNATURE 0x51454d5520434647ULL /* "QEMU CFG" */
+
+#define BIOS_CFG_DMA_ADDR_HIGH  0x514
+#define BIOS_CFG_DMA_ADDR_LOW   0x518
+
 /* Break the translation block flow so -d cpu shows us values */
 #define DEBUG_HERE \
 	jmp		1f;				\
@@ -62,6 +73,72 @@ 
 	bswap		%eax
 .endm
 
+
+/*
+ * Read data from the fw_cfg device using DMA.
+ * Clobbers:	%eax, %edx, memory[%esp-16] to memory[%esp]
+ */
+.macro read_fw_dma VAR, SIZE, ADDR
+	movl		$\VAR, %eax /* Control */
+	shl		$16, %eax
+	or		$FW_CFG_DMA_CTL_READ, %eax
+	or		$FW_CFG_DMA_CTL_SELECT, %eax
+	bswapl		%eax
+	movl		%eax, -16(%esp)
+
+	movl		\SIZE, %eax /* Length */
+	bswapl		%eax
+	mov		%eax, -12(%esp)
+
+	mov		\ADDR, %eax /* Address to write to */
+	bswapl		%eax
+	mov		%eax, -4(%esp)
+
+	movl		$0, %eax  /* We only support 32 bit target addresses */
+	mov		%eax, -8(%esp)
+
+	mov		%esp, %eax /* Address of the struct we generated */
+	subl		$16, %eax
+	bswapl		%eax
+
+	mov		$BIOS_CFG_DMA_ADDR_LOW, %dx
+	outl		%eax, (%dx) /* Initiate DMA */
+
+	movl		$FW_CFG_DMA_CTL_ERROR, %eax
+	not		%eax
+	bswapl		%eax
+
+1:  mov		-16(%esp), %edx /* Wait for completion */
+	and		%eax, %edx
+	jnz		1b
+.endm
+
+/*
+ * Read a single 32 bit value from the fw_cfg device using DMA
+ * Clobbers: %edx, memory[%esp-20] to memory[%esp]
+ * Out:		%eax
+ */
+.macro read_fw_dma_var32 VAR
+	mov		%esp, %edx
+	subl		$20, %edx
+	read_fw_dma	\VAR, $4, %edx
+	mov		-20(%esp), %eax
+.endm
+
+
+/*
+ * Read a blob from the fw_cfg device using DMA
+ * Requires _ADDR, _SIZE and _DATA values for the parameter.
+ *
+ * Clobbers:	%eax, %edx, %es, %ecx, %edi and adresses %esp-20 to %esp
+ */
+#define read_fw_blob_dma(var) \
+	read_fw_dma_var32	var ## _SIZE; \
+	mov		%eax, %ecx; \
+	read_fw_dma_var32	var ## _ADDR; \
+	mov		%eax, %edi ;\
+	read_fw_dma	var ## _DATA, %ecx, %edi
+
 #define read_fw_blob_pre(var)				\
 	read_fw		var ## _SIZE;			\
 	mov		%eax, %ecx;			\