Message ID | m38vyit1ao.fsf@blackfin.pond.sub.org |
---|---|
State | New |
Headers | show |
On 01/18/2011 02:16 PM, Markus Armbruster wrote: > The problem: you want to do serious scalability testing (1000s of VMs) > of your management stack. If each guest eats up a few 100MiB and > competes for CPU, that requires a serious host machine. Which you don't > have. You also don't want to modify the management stack at all, if you > can help it. > > The solution: a perfectly normal-looking QEMU that uses minimal > resources. Ability to execute any guest code is strictly optional ;) > > New option -fake-machine creates a fake machine incapable of running > guest code. Completely compiled out by default, enable with configure > --enable-fake-machine. > > With -fake-machine, CPU use is negligible, and memory use is rather > modest. > > Non-fake VM running F-14 live, right after boot: > UID PID PPID C SZ RSS PSR STIME TTY TIME CMD > armbru 15707 2558 53 191837 414388 1 21:05 pts/3 00:00:29 [...] > > Same VM -fake-machine, after similar time elapsed: > UID PID PPID C SZ RSS PSR STIME TTY TIME CMD > armbru 15742 2558 0 85129 9412 0 21:07 pts/3 00:00:00 [...] > > We're using a very similar patch for RHEL scalability testing. > Interesting, but: 9432 anthony 20 0 153m 14m 5384 S 0 0.2 0:00.22 qemu-system-x86 That's qemu-system-x86 -m 4 In terms of memory overhead, the largest source is not really going to be addressed by -fake-machine (l1_phys_map and phys_ram_dirty). I don't really understand the point of not creating a VCPU with KVM. Is there some type of overhead in doing that? Regards, Anthony Liguori > HACK ALERT: Works by hacking the main loop so it never executes any > guest code. Not implemented for KVM's main loop at this time, thus > -fake-machine needs to force KVM off. It also replaces guest RAM by a > token amount (pc machine only at this time), and forces -vga none, > because VGA eats too much memory. > > Note the TODO and FIXME comments. 
> > Dan Berrange explored a different solution a while ago: a new do-nothing > target, patterned after i386, and a new do-nothing machine, patterned > after pc. His patch works. But it duplicates much target and machine > code --- adds more than ten times as many lines as this patch. Keeping > the duplicated code reasonably in sync would be bothersome. I didn't > like that, talked it over with Dan, and we came up with this idea > instead. > > Comments? Better ideas? > --- > configure | 12 ++++++++++++ > cpu-exec.c | 2 +- > cpus.c | 3 +++ > hw/pc.c | 30 ++++++++++++++++++++---------- > qemu-options.hx | 7 +++++++ > targphys.h | 7 +++++++ > vl.c | 21 +++++++++++++++++++++ > 7 files changed, 71 insertions(+), 11 deletions(-) > > diff --git a/configure b/configure > index d68f862..98b0a5f 100755 > --- a/configure > +++ b/configure > @@ -174,6 +174,7 @@ trace_backend="nop" > trace_file="trace" > spice="" > rbd="" > +fake_machine="no" > > # parse CC options first > for opt do > @@ -719,6 +720,10 @@ for opt do > ;; > --enable-rbd) rbd="yes" > ;; > + --disable-fake-machine) fake_machine="no" > + ;; > + --enable-fake-machine) fake_machine="yes" > + ;; > *) echo "ERROR: unknown option $opt"; show_help="yes" > ;; > esac > @@ -913,6 +918,8 @@ echo " Default:trace-<pid>" > echo " --disable-spice disable spice" > echo " --enable-spice enable spice" > echo " --enable-rbd enable building the rados block device (rbd)" > +echo " --disable-fake-machine disable -fake-machine option" > +echo " --enable-fake-machine enable -fake-machine option" > echo "" > echo "NOTE: The object files are built at the place where configure is launched" > exit 1 > @@ -2455,6 +2462,7 @@ echo "Trace output file $trace_file-<pid>" > echo "spice support $spice" > echo "rbd support $rbd" > echo "xfsctl support $xfs" > +echo "-fake-machine $fake_machine" > > if test $sdl_too_old = "yes"; then > echo "-> Your SDL version is too old - please upgrade to have SDL support" > @@ -2727,6 +2735,10 @@ if test "$spice" 
= "yes" ; then > echo "CONFIG_SPICE=y">> $config_host_mak > fi > > +if test $fake_machine = "yes" ; then > + echo "CONFIG_FAKE_MACHINE=y">> $config_host_mak > +fi > + > # XXX: suppress that > if [ "$bsd" = "yes" ] ; then > echo "CONFIG_BSD=y">> $config_host_mak > diff --git a/cpu-exec.c b/cpu-exec.c > index 8c9fb8b..cd1259a 100644 > --- a/cpu-exec.c > +++ b/cpu-exec.c > @@ -230,7 +230,7 @@ int cpu_exec(CPUState *env1) > uint8_t *tc_ptr; > unsigned long next_tb; > > - if (cpu_halted(env1) == EXCP_HALTED) > + if (fake_machine || cpu_halted(env1) == EXCP_HALTED) > return EXCP_HALTED; > > cpu_single_env = env1; > diff --git a/cpus.c b/cpus.c > index 0309189..91e708f 100644 > --- a/cpus.c > +++ b/cpus.c > @@ -128,6 +128,9 @@ static int cpu_can_run(CPUState *env) > > static int cpu_has_work(CPUState *env) > { > + if (fake_machine) { > + return 0; > + } > if (env->stop) > return 1; > if (env->queued_work_first) > diff --git a/hw/pc.c b/hw/pc.c > index fface7d..809f53e 100644 > --- a/hw/pc.c > +++ b/hw/pc.c > @@ -993,18 +993,28 @@ void pc_memory_init(ram_addr_t ram_size, > linux_boot = (kernel_filename != NULL); > > /* allocate RAM */ > - ram_addr = qemu_ram_alloc(NULL, "pc.ram", > - below_4g_mem_size + above_4g_mem_size); > - cpu_register_physical_memory(0, 0xa0000, ram_addr); > - cpu_register_physical_memory(0x100000, > - below_4g_mem_size - 0x100000, > - ram_addr + 0x100000); > + if (fake_machine) { > + /* If user boots with -m 1000 We don't actually want to > + * allocate a GB of RAM, so lets force all RAM allocs to one > + * page to keep our memory footprint nice and low. 
> + * > + * TODO try to use -m 1k instead > + */ > + ram_addr = qemu_ram_alloc(NULL, "pc.ram", 1); > + } else { > + ram_addr = qemu_ram_alloc(NULL, "pc.ram", > + below_4g_mem_size + above_4g_mem_size); > + cpu_register_physical_memory(0, 0xa0000, ram_addr); > + cpu_register_physical_memory(0x100000, > + below_4g_mem_size - 0x100000, > + ram_addr + 0x100000); > #if TARGET_PHYS_ADDR_BITS> 32 > - if (above_4g_mem_size> 0) { > - cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, > - ram_addr + below_4g_mem_size); > - } > + if (above_4g_mem_size> 0) { > + cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, > + ram_addr + below_4g_mem_size); > + } > #endif > + } > > /* BIOS load */ > if (bios_name == NULL) > diff --git a/qemu-options.hx b/qemu-options.hx > index 898561d..8a8ef4b 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -2324,6 +2324,13 @@ Specify a trace file to log output traces to. > ETEXI > #endif > > +#ifdef CONFIG_FAKE_MACHINE > +DEF("fake-machine", 0, QEMU_OPTION_fake_machine, > + "-fake-machine create a fake machine incapable of running guest code\n" > + " mimimal resource use, use for scalability testing\n", > + QEMU_ARCH_ALL) > +#endif > + > HXCOMM This is the last statement. Insert new options before this line! > STEXI > @end table > diff --git a/targphys.h b/targphys.h > index 95648d6..f30530c 100644 > --- a/targphys.h > +++ b/targphys.h > @@ -18,4 +18,11 @@ typedef uint64_t target_phys_addr_t; > #endif > #endif > > +/* FIXME definitely in the wrong place here; where should it go? 
*/ > +#ifdef CONFIG_FAKE_MACHINE > +extern int fake_machine; > +#else > +#define fake_machine 0 > +#endif > + > #endif > diff --git a/vl.c b/vl.c > index 0292184..bcc60b0 100644 > --- a/vl.c > +++ b/vl.c > @@ -240,6 +240,10 @@ struct FWBootEntry { > > QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); > > +#ifdef CONFIG_FAKE_MACHINE > +int fake_machine = 0; > +#endif > + > int nb_numa_nodes; > uint64_t node_mem[MAX_NODES]; > uint64_t node_cpumask[MAX_NODES]; > @@ -2727,6 +2731,11 @@ int main(int argc, char **argv, char **envp) > fclose(fp); > break; > } > +#ifdef CONFIG_FAKE_MACHINE > + case QEMU_OPTION_fake_machine: > + fake_machine = 1; > + break; > +#endif > default: > os_parse_cmd_args(popt->index, optarg); > } > @@ -2817,6 +2826,15 @@ int main(int argc, char **argv, char **envp) > } > if (default_vga) > vga_interface_type = VGA_CIRRUS; > + if (fake_machine) { > + /* HACK: Ideally we'd configure VGA as usual, but this causes > + * several MB of VGA RAM to be allocated, and we can't do the > + * tricks we use elsewhere to just return a single 4k page, > + * because the VGA driver immediately memsets() the entire > + * allocation it requested. > + */ > + vga_interface_type = VGA_NONE; > + } > > socket_init(); > > @@ -2835,6 +2853,9 @@ int main(int argc, char **argv, char **envp) > exit(1); > } > > + if (fake_machine) { > + kvm_allowed = 0; > + } > if (kvm_allowed) { > int ret = kvm_init(smp_cpus); > if (ret< 0) { >
Anthony Liguori <anthony@codemonkey.ws> writes:

> On 01/18/2011 02:16 PM, Markus Armbruster wrote:
>> The problem: you want to do serious scalability testing (1000s of VMs)
>> of your management stack. If each guest eats up a few 100MiB and
>> competes for CPU, that requires a serious host machine. Which you don't
>> have. You also don't want to modify the management stack at all, if you
>> can help it.
>>
>> The solution: a perfectly normal-looking QEMU that uses minimal
>> resources. Ability to execute any guest code is strictly optional ;)
>>
>> New option -fake-machine creates a fake machine incapable of running
>> guest code. Completely compiled out by default, enable with configure
>> --enable-fake-machine.
>>
>> With -fake-machine, CPU use is negligible, and memory use is rather
>> modest.
>>
>> Non-fake VM running F-14 live, right after boot:
>> UID PID PPID C SZ RSS PSR STIME TTY TIME CMD
>> armbru 15707 2558 53 191837 414388 1 21:05 pts/3 00:00:29 [...]
>>
>> Same VM -fake-machine, after similar time elapsed:
>> UID PID PPID C SZ RSS PSR STIME TTY TIME CMD
>> armbru 15742 2558 0 85129 9412 0 21:07 pts/3 00:00:00 [...]
>>
>> We're using a very similar patch for RHEL scalability testing.
>>
>
> Interesting, but:
>
> 9432 anthony 20 0 153m 14m 5384 S 0 0.2 0:00.22
> qemu-system-x86
>
> That's qemu-system-x86 -m 4

Sure you ran qemu-system-x86 -fake-machine?

> In terms of memory overhead, the largest source is not really going to
> be addressed by -fake-machine (l1_phys_map and phys_ram_dirty).

git-grep phys_ram_dirty finds nothing.

> I don't really understand the point of not creating a VCPU with KVM.
> Is there some type of overhead in doing that?

I briefly looked at both main loops, TCG's was the first one I happened to crack, and I didn't feel like doing both then. If the general approach is okay, I'll gladly investigate how to do it with KVM.
diff --git a/configure b/configure index d68f862..98b0a5f 100755 --- a/configure +++ b/configure @@ -174,6 +174,7 @@ trace_backend="nop" trace_file="trace" spice="" rbd="" +fake_machine="no" # parse CC options first for opt do @@ -719,6 +720,10 @@ for opt do ;; --enable-rbd) rbd="yes" ;; + --disable-fake-machine) fake_machine="no" + ;; + --enable-fake-machine) fake_machine="yes" + ;; *) echo "ERROR: unknown option $opt"; show_help="yes" ;; esac @@ -913,6 +918,8 @@ echo " Default:trace-<pid>" echo " --disable-spice disable spice" echo " --enable-spice enable spice" echo " --enable-rbd enable building the rados block device (rbd)" +echo " --disable-fake-machine disable -fake-machine option" +echo " --enable-fake-machine enable -fake-machine option" echo "" echo "NOTE: The object files are built at the place where configure is launched" exit 1 @@ -2455,6 +2462,7 @@ echo "Trace output file $trace_file-<pid>" echo "spice support $spice" echo "rbd support $rbd" echo "xfsctl support $xfs" +echo "-fake-machine $fake_machine" if test $sdl_too_old = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -2727,6 +2735,10 @@ if test "$spice" = "yes" ; then echo "CONFIG_SPICE=y" >> $config_host_mak fi +if test $fake_machine = "yes" ; then + echo "CONFIG_FAKE_MACHINE=y" >> $config_host_mak +fi + # XXX: suppress that if [ "$bsd" = "yes" ] ; then echo "CONFIG_BSD=y" >> $config_host_mak diff --git a/cpu-exec.c b/cpu-exec.c index 8c9fb8b..cd1259a 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -230,7 +230,7 @@ int cpu_exec(CPUState *env1) uint8_t *tc_ptr; unsigned long next_tb; - if (cpu_halted(env1) == EXCP_HALTED) + if (fake_machine || cpu_halted(env1) == EXCP_HALTED) return EXCP_HALTED; cpu_single_env = env1; diff --git a/cpus.c b/cpus.c index 0309189..91e708f 100644 --- a/cpus.c +++ b/cpus.c @@ -128,6 +128,9 @@ static int cpu_can_run(CPUState *env) static int cpu_has_work(CPUState *env) { + if (fake_machine) { + return 0; + } if (env->stop) return 
1; if (env->queued_work_first) diff --git a/hw/pc.c b/hw/pc.c index fface7d..809f53e 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -993,18 +993,28 @@ void pc_memory_init(ram_addr_t ram_size, linux_boot = (kernel_filename != NULL); /* allocate RAM */ - ram_addr = qemu_ram_alloc(NULL, "pc.ram", - below_4g_mem_size + above_4g_mem_size); - cpu_register_physical_memory(0, 0xa0000, ram_addr); - cpu_register_physical_memory(0x100000, - below_4g_mem_size - 0x100000, - ram_addr + 0x100000); + if (fake_machine) { + /* If user boots with -m 1000 We don't actually want to + * allocate a GB of RAM, so lets force all RAM allocs to one + * page to keep our memory footprint nice and low. + * + * TODO try to use -m 1k instead + */ + ram_addr = qemu_ram_alloc(NULL, "pc.ram", 1); + } else { + ram_addr = qemu_ram_alloc(NULL, "pc.ram", + below_4g_mem_size + above_4g_mem_size); + cpu_register_physical_memory(0, 0xa0000, ram_addr); + cpu_register_physical_memory(0x100000, + below_4g_mem_size - 0x100000, + ram_addr + 0x100000); #if TARGET_PHYS_ADDR_BITS > 32 - if (above_4g_mem_size > 0) { - cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, - ram_addr + below_4g_mem_size); - } + if (above_4g_mem_size > 0) { + cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, + ram_addr + below_4g_mem_size); + } #endif + } /* BIOS load */ if (bios_name == NULL) diff --git a/qemu-options.hx b/qemu-options.hx index 898561d..8a8ef4b 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2324,6 +2324,13 @@ Specify a trace file to log output traces to. ETEXI #endif +#ifdef CONFIG_FAKE_MACHINE +DEF("fake-machine", 0, QEMU_OPTION_fake_machine, + "-fake-machine create a fake machine incapable of running guest code\n" + " mimimal resource use, use for scalability testing\n", + QEMU_ARCH_ALL) +#endif + HXCOMM This is the last statement. Insert new options before this line! 
STEXI @end table diff --git a/targphys.h b/targphys.h index 95648d6..f30530c 100644 --- a/targphys.h +++ b/targphys.h @@ -18,4 +18,11 @@ typedef uint64_t target_phys_addr_t; #endif #endif +/* FIXME definitely in the wrong place here; where should it go? */ +#ifdef CONFIG_FAKE_MACHINE +extern int fake_machine; +#else +#define fake_machine 0 +#endif + #endif diff --git a/vl.c b/vl.c index 0292184..bcc60b0 100644 --- a/vl.c +++ b/vl.c @@ -240,6 +240,10 @@ struct FWBootEntry { QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); +#ifdef CONFIG_FAKE_MACHINE +int fake_machine = 0; +#endif + int nb_numa_nodes; uint64_t node_mem[MAX_NODES]; uint64_t node_cpumask[MAX_NODES]; @@ -2727,6 +2731,11 @@ int main(int argc, char **argv, char **envp) fclose(fp); break; } +#ifdef CONFIG_FAKE_MACHINE + case QEMU_OPTION_fake_machine: + fake_machine = 1; + break; +#endif default: os_parse_cmd_args(popt->index, optarg); } @@ -2817,6 +2826,15 @@ int main(int argc, char **argv, char **envp) } if (default_vga) vga_interface_type = VGA_CIRRUS; + if (fake_machine) { + /* HACK: Ideally we'd configure VGA as usual, but this causes + * several MB of VGA RAM to be allocated, and we can't do the + * tricks we use elsewhere to just return a single 4k page, + * because the VGA driver immediately memsets() the entire + * allocation it requested. + */ + vga_interface_type = VGA_NONE; + } socket_init(); @@ -2835,6 +2853,9 @@ int main(int argc, char **argv, char **envp) exit(1); } + if (fake_machine) { + kvm_allowed = 0; + } if (kvm_allowed) { int ret = kvm_init(smp_cpus); if (ret < 0) {