
[RFC,3/4] KVM: stats: Add ioctl commands to pull statistics in binary format

Message ID 20210310003024.2026253-4-jingzhangos@google.com
State Not Applicable
Series [RFC,1/4] KVM: stats: Separate statistics name strings from debugfs code

Commit Message

Jing Zhang March 10, 2021, 12:30 a.m. UTC
Three ioctl commands are added to support retrieval of statistics data in
binary form: KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES and KVM_STATS_GET_DATA.
The KVM_CAP_STATS_BINARY_FORM capability indicates their availability.
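
A rough sketch of the intended userspace usage (the structure layouts come
from this series and the buffer layout from the code below; names and error
handling here are illustrative only):

#include <err.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Pull a vCPU's statistics values: the struct kvm_stats_data header is
 * immediately followed by one 64-bit value per statistic. */
static __u64 *get_vcpu_stats(int vcpu_fd, __u32 *num_stats)
{
	struct kvm_stats_info info;
	struct kvm_stats_data *data;
	size_t payload;

	if (ioctl(vcpu_fd, KVM_STATS_GET_INFO, &info) < 0)
		err(1, "KVM_STATS_GET_INFO");

	payload = info.num_stats * sizeof(__u64);
	data = malloc(sizeof(*data) + payload);
	if (!data)
		err(1, "malloc");

	data->size = payload;
	if (ioctl(vcpu_fd, KVM_STATS_GET_DATA, data) < 0)
		err(1, "KVM_STATS_GET_DATA");

	*num_stats = info.num_stats;
	return (__u64 *)(data + 1);	/* the values follow the header */
}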

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

Comments

Paolo Bonzini March 10, 2021, 2:55 p.m. UTC | #1
On 10/03/21 01:30, Jing Zhang wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 383df23514b9..87dd62516c8b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp,
>   		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
>   		break;
>   	}
> +	case KVM_STATS_GET_INFO: {
> +		struct kvm_stats_info stats_info;
> +
> +		r = -EFAULT;
> +		stats_info.num_stats = VCPU_STAT_COUNT;
> +		if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
> +			goto out;
> +		r = 0;
> +		break;
> +	}
> +	case KVM_STATS_GET_NAMES: {
> +		struct kvm_stats_names stats_names;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
> +			goto out;
> +		r = -EINVAL;
> +		if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
> +			goto out;
> +
> +		r = -EFAULT;
> +		if (copy_to_user(argp + sizeof(stats_names),
> +				kvm_vcpu_stat_strings,
> +				VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))

The only reason to separate the strings in patch 1 is to pass them here. 
But this is a poor API because it imposes a limit on the length of the 
statistics names, and makes that length part of the binary interface.

I would prefer a completely different interface, where you have a file 
descriptor that can be created and associated with a vCPU or VM (or even 
with /dev/kvm).  Having a file descriptor is important because the fd can 
be passed to a less-privileged process that takes care of gathering the 
metrics.

The result of reading the file descriptor could be either ASCII or 
binary.  IMO the real cost lies in opening and reading a multitude of 
files rather than in the ASCII<->binary conversion.

The format could be one of the following:

* binary:

4 bytes flags (always zero)
4 bytes number of statistics
4 bytes offset of the first stat description
4 bytes offset of the first stat value
stat descriptions:
   - 4 bytes for the type (for now always zero: uint64_t)
   - 4 bytes for the flags (for now always zero)
   - length of name
   - name
statistics in 64-bit format

* text:

stat1_name uint64 123
stat2_name uint64 456
...
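
Expressed as C structures, a sketch of that binary layout could look like
this (the struct and field names are invented here, and the width of the
name-length field is a guess):

struct stats_header {
	__u32 flags;		/* always zero for now */
	__u32 num_stats;
	__u32 desc_offset;	/* offset of the first stat description */
	__u32 value_offset;	/* offset of the first stat value */
};

struct stats_desc {		/* repeated num_stats times at desc_offset */
	__u32 type;		/* for now always zero: uint64_t */
	__u32 flags;		/* for now always zero */
	__u32 name_len;
	char name[];		/* name_len bytes, then the next description */
};

/* num_stats consecutive 64-bit values start at value_offset */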

What do you think?

Paolo
Marc Zyngier March 10, 2021, 3:51 p.m. UTC | #2
On Wed, 10 Mar 2021 00:30:23 +0000,
Jing Zhang <jingzhangos@google.com> wrote:
> 
> Three ioctl commands are added to support binary form statistics data
> retrieval. KVM_STATS_GET_INFO, KVM_STATS_GET_NAMES, KVM_STATS_GET_DATA.
> KVM_CAP_STATS_BINARY_FORM indicates the capability.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  virt/kvm/kvm_main.c | 115 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 115 insertions(+)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 383df23514b9..87dd62516c8b 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
>  		break;
>  	}
> +	case KVM_STATS_GET_INFO: {
> +		struct kvm_stats_info stats_info;
> +
> +		r = -EFAULT;
> +		stats_info.num_stats = VCPU_STAT_COUNT;
> +		if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
> +			goto out;
> +		r = 0;
> +		break;
> +	}
> +	case KVM_STATS_GET_NAMES: {
> +		struct kvm_stats_names stats_names;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
> +			goto out;
> +		r = -EINVAL;
> +		if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
> +			goto out;
> +
> +		r = -EFAULT;
> +		if (copy_to_user(argp + sizeof(stats_names),
> +				kvm_vcpu_stat_strings,
> +				VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
> +			goto out;
> +		r = 0;
> +		break;
> +	}
> +	case KVM_STATS_GET_DATA: {
> +		struct kvm_stats_data stats_data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
> +			goto out;
> +		r = -EINVAL;
> +		if (stats_data.size < sizeof(vcpu->stat))
> +			goto out;
> +
> +		r = -EFAULT;
> +		argp += sizeof(stats_data);
> +		if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat)))
> +			goto out;
> +		r = 0;
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
>  	}
> @@ -3695,6 +3740,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  	case KVM_CAP_CHECK_EXTENSION_VM:
>  	case KVM_CAP_ENABLE_CAP_VM:
>  	case KVM_CAP_HALT_POLL:
> +	case KVM_CAP_STATS_BINARY_FORM:
>  		return 1;
>  #ifdef CONFIG_KVM_MMIO
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3825,6 +3871,40 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
>  	}
>  }
>  
> +static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg)
> +{
> +	void __user *argp = (void __user *)arg;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_stats_data stats_data;
> +	u64 *data = NULL, *pdata;
> +	int i, j, ret = 0;
> +	size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data);
> +
> +
> +	if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
> +		return -EFAULT;
> +	if (stats_data.size < dsize)
> +		return -EINVAL;
> +	data = kzalloc(dsize, GFP_KERNEL_ACCOUNT);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < VM_STAT_COUNT; i++)
> +		*(data + i) = *((ulong *)&kvm->stat + i);

This kind of dance could be avoided if your stats were just an array,
or a union of the current data structure and an array.
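
For instance (purely illustrative, with invented field names), making the
stat structure a union over an array would let it be copied out in one go:

struct kvm_vm_stat {
	union {
		struct {
			u64 stat_a;
			u64 stat_b;
			/* ... one u64 per statistic ... */
		};
		u64 array[VM_STAT_COUNT];
	};
};

kvm->stat.array could then be handed to copy_to_user() directly (and
similarly for the per-vCPU structure).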

> +
> +	kvm_for_each_vcpu(j, vcpu, kvm) {
> +		pdata = data + VM_STAT_COUNT;
> +		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
> +			*pdata += *((u64 *)&vcpu->stat + i);

Do you really need the in-kernel copy? Why not directly organise the
data structures in a way that would allow a bulk copy using
copy_to_user()?

Another thing is the atomicity of what you are reporting. Don't you
care about the consistency of the counters?

Thanks,

	M.
Paolo Bonzini March 10, 2021, 4:03 p.m. UTC | #3
On 10/03/21 16:51, Marc Zyngier wrote:
>> +	kvm_for_each_vcpu(j, vcpu, kvm) {
>> +		pdata = data + VM_STAT_COUNT;
>> +		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
>> +			*pdata += *((u64 *)&vcpu->stat + i);
> Do you really need the in-kernel copy? Why not directly organise the
> data structures in a way that would allow a bulk copy using
> copy_to_user()?

The result is built by summing per-vCPU counters, so that the counter 
updates are fast and do not require a lock.  So consistency basically 
cannot be guaranteed.

Paolo
Marc Zyngier March 10, 2021, 5:05 p.m. UTC | #4
On Wed, 10 Mar 2021 16:03:42 +0000,
Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 10/03/21 16:51, Marc Zyngier wrote:
> >> +	kvm_for_each_vcpu(j, vcpu, kvm) {
> >> +		pdata = data + VM_STAT_COUNT;
> >> +		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
> >> +			*pdata += *((u64 *)&vcpu->stat + i);
> > Do you really need the in-kernel copy? Why not directly organise the
> > data structures in a way that would allow a bulk copy using
> > copy_to_user()?
> 
> The result is built by summing per-vCPU counters, so that the counter
> updates are fast and do not require a lock.  So consistency basically
> cannot be guaranteed.

Sure, but I wonder whether there is scope for VM-global counters to be
maintained in parallel with per-vCPU counters if speed/efficiency is
of the essence (and this seems to be how it is sold in the cover
letter).

Thanks,

	M.
Paolo Bonzini March 10, 2021, 5:11 p.m. UTC | #5
On 10/03/21 18:05, Marc Zyngier wrote:
> On Wed, 10 Mar 2021 16:03:42 +0000,
> Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> On 10/03/21 16:51, Marc Zyngier wrote:
>>>> +	kvm_for_each_vcpu(j, vcpu, kvm) {
>>>> +		pdata = data + VM_STAT_COUNT;
>>>> +		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
>>>> +			*pdata += *((u64 *)&vcpu->stat + i);
>>> Do you really need the in-kernel copy? Why not directly organise the
>>> data structures in a way that would allow a bulk copy using
>>> copy_to_user()?
>>
>> The result is built by summing per-vCPU counters, so that the counter
>> updates are fast and do not require a lock.  So consistency basically
>> cannot be guaranteed.
> 
> Sure, but I wonder whether there is scope for VM-global counters to be
> maintained in parallel with per-vCPU counters if speed/efficiency is
> of the essence (and this seems to be how it is sold in the cover
> letter).

Maintaining VM-global counters would require an atomic instruction and 
would suffer lots of cacheline bouncing even on architectures that have 
relaxed atomic memory operations.

Speed/efficiency of retrieving statistics is important, but let's keep 
in mind that the baseline for comparison is hundreds of syscalls and 
filesystem lookups.

Paolo
Marc Zyngier March 10, 2021, 5:31 p.m. UTC | #6
On Wed, 10 Mar 2021 17:11:47 +0000,
Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 10/03/21 18:05, Marc Zyngier wrote:
> > On Wed, 10 Mar 2021 16:03:42 +0000,
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> >> 
> >> On 10/03/21 16:51, Marc Zyngier wrote:
> >>>> +	kvm_for_each_vcpu(j, vcpu, kvm) {
> >>>> +		pdata = data + VM_STAT_COUNT;
> >>>> +		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
> >>>> +			*pdata += *((u64 *)&vcpu->stat + i);
> >>> Do you really need the in-kernel copy? Why not directly organise the
> >>> data structures in a way that would allow a bulk copy using
> >>> copy_to_user()?
> >> 
> >> The result is built by summing per-vCPU counters, so that the counter
> >> updates are fast and do not require a lock.  So consistency basically
> >> cannot be guaranteed.
> > 
> > Sure, but I wonder whether there is scope for VM-global counters to be
> > maintained in parallel with per-vCPU counters if speed/efficiency is
> > of the essence (and this seems to be how it is sold in the cover
> > letter).
> 
> Maintaining VM-global counters would require an atomic instruction and
> would suffer lots of cacheline bouncing even on architectures that
> have relaxed atomic memory operations.

Which is why we have per-cpu counters already. Making use of them
doesn't seem that outlandish.

> Speed/efficiency of retrieving statistics is important, but let's keep
> in mind that the baseline for comparison is hundreds of syscalls and
> filesystem lookups.

Having that baseline in the cover letter would be a good start, as
well as an indication of the frequency this is used at.

	M.
Paolo Bonzini March 10, 2021, 5:44 p.m. UTC | #7
On 10/03/21 18:31, Marc Zyngier wrote:
>> Maintaining VM-global counters would require an atomic instruction and
>> would suffer lots of cacheline bouncing even on architectures that
>> have relaxed atomic memory operations.
> Which is why we have per-cpu counters already. Making use of them
> doesn't seem that outlandish.

But you wouldn't be able to guarantee consistency anyway, would you? 
You *could* copy N*M counters to userspace, but there's no guarantee 
that they are consistent, neither within a single vCPU nor within a 
single counter.

>> Speed/efficiency of retrieving statistics is important, but let's keep
>> in mind that the baseline for comparison is hundreds of syscalls and
>> filesystem lookups.
>
> Having that baseline in the cover letter would be a good start, as
> well as an indication of the frequency this is used at.

Can't disagree, especially on the latter which I have no idea about.

Paolo
Jing Zhang March 10, 2021, 9:41 p.m. UTC | #8
Hi Paolo,

On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 10/03/21 01:30, Jing Zhang wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 383df23514b9..87dd62516c8b 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
> >               break;
> >       }
> > +     case KVM_STATS_GET_INFO: {
> > +             struct kvm_stats_info stats_info;
> > +
> > +             r = -EFAULT;
> > +             stats_info.num_stats = VCPU_STAT_COUNT;
> > +             if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
> > +                     goto out;
> > +             r = 0;
> > +             break;
> > +     }
> > +     case KVM_STATS_GET_NAMES: {
> > +             struct kvm_stats_names stats_names;
> > +
> > +             r = -EFAULT;
> > +             if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
> > +                     goto out;
> > +             r = -EINVAL;
> > +             if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
> > +                     goto out;
> > +
> > +             r = -EFAULT;
> > +             if (copy_to_user(argp + sizeof(stats_names),
> > +                             kvm_vcpu_stat_strings,
> > +                             VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
>
> The only reason to separate the strings in patch 1 is to pass them here.
>   But this is a poor API because it imposes a limit on the length of the
> statistics, and makes that length part of the binary interface.
Agreed. I am considering returning the length of the stats name strings in
the kvm_stats_info structure instead of exporting it as a constant in the
uapi, which would put no limit on the length of the stats name strings.
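
Concretely, something like this (hypothetical, not part of this series yet)
is what I have in mind:

struct kvm_stats_info {
	__u32 num_stats;
	__u32 name_len;	/* would replace the KVM_STATS_NAME_LEN uapi constant */
};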
>
> I would prefer a completely different interface, where you have a file
> descriptor that can be created and associated to a vCPU or VM (or even
> to /dev/kvm).  Having a file descriptor is important because the fd can
> be passed to a less-privileged process that takes care of gathering the
> metrics
The separate file descriptor solution is very tempting. We are still considering it
seriously. Our biggest concern is that the metrics gathering/handling process
is not necessarily running on the same node as the one the file descriptor belongs to.
It scales better to pass metrics data directly than to pass file descriptors.
>
> The result of reading the file descriptor could be either ASCII or
> binary.  IMO the real cost lies in opening and reading a multitude of
> files rather than in the ASCII<->binary conversion.
Agreed.
>
> The format could be one of the following:
>
> * binary:
>
> 4 bytes flags (always zero)
> 4 bytes number of statistics
> 4 bytes offset of the first stat description
> 4 bytes offset of the first stat value
> stat descriptions:
>    - 4 bytes for the type (for now always zero: uint64_t)
>    - 4 bytes for the flags (for now always zero)
>    - length of name
>    - name
> statistics in 64-bit format
>
> * text:
>
> stat1_name uint64 123
> stat2_name uint64 456
> ...
>
> What do you think?
The binary format presented above is very flexible. I understand why it is
organized this way.
In our situation, the metrics data could be pulled periodically, as often as
every half second. They are used by different kinds of monitors/triggers/alerts.
To enhance efficiency and reduce the traffic caused by passing metrics, we
treat all metrics info/data as two kinds. One is immutable information,
which doesn't change within a given system boot. The other is mutable
data (statistics data), which is pulled/transferred periodically at a high
frequency.
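
In terms of the ioctls in this patch, the intended split would look roughly
like this (fetch_names_once() and fetch_data() are hypothetical wrappers
around the ioctls, shown only to illustrate the two kinds of data):

#include <unistd.h>

/* Hypothetical wrappers around KVM_STATS_GET_INFO/NAMES and KVM_STATS_GET_DATA. */
void fetch_names_once(int vm_fd);	/* immutable: info + name strings */
void fetch_data(int vm_fd);		/* mutable: statistics values only */

void poll_stats(int vm_fd)
{
	fetch_names_once(vm_fd);	/* fetched once per boot */
	for (;;) {
		fetch_data(vm_fd);	/* pulled at high frequency */
		usleep(500 * 1000);	/* e.g. every half second */
	}
}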

>
> Paolo
>

Thanks,
Jing
Jing Zhang March 10, 2021, 9:43 p.m. UTC | #9
On Wed, Mar 10, 2021 at 11:44 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 10/03/21 18:31, Marc Zyngier wrote:
> >> Maintaining VM-global counters would require an atomic instruction and
> >> would suffer lots of cacheline bouncing even on architectures that
> >> have relaxed atomic memory operations.
> > Which is why we have per-cpu counters already. Making use of them
> > doesn't seem that outlandish.
>
> But you wouldn't be able to guarantee consistency anyway, would you?
> You *could* copy N*M counters to userspace, but there's no guarantee
> that they are consistent, neither within a single vCPU nor within a
> single counter.
>
> >> Speed/efficiency of retrieving statistics is important, but let's keep
> >> in mind that the baseline for comparison is hundreds of syscalls and
> >> filesystem lookups.
> >
> > Having that baseline in the cover letter would be a good start, as
> > well as an indication of the frequency this is used at.
>
> Can't disagree, especially on the latter which I have no idea about.
>
> Paolo
>
Marc, Paolo, thanks for the comments. I will add some more information
in the cover letter.

Thanks,
Jing
Paolo Bonzini March 12, 2021, 6:11 p.m. UTC | #10
On 10/03/21 22:41, Jing Zhang wrote:
>> I would prefer a completely different interface, where you have a file
>> descriptor that can be created and associated to a vCPU or VM (or even
>> to /dev/kvm).  Having a file descriptor is important because the fd can
>> be passed to a less-privileged process that takes care of gathering the
>> metrics
> Separate file descriptor solution is very tempting. We are still considering it
> seriously. Our biggest concern is that the metrics gathering/handling process
> is not necessary running on the same node as the one file descriptor belongs to.
> It scales better to pass metrics data directly than to pass file descriptors.

If you want to pass metrics data directly, you can just read the file 
descriptor from your VMM, just like you're using the ioctls now. 
However the file descriptor also allows a privilege-separated same-host 
interface.

>> 4 bytes flags (always zero)
>> 4 bytes number of statistics
>> 4 bytes offset of the first stat description
>> 4 bytes offset of the first stat value
>> stat descriptions:
>>    - 4 bytes for the type (for now always zero: uint64_t)
>>    - 4 bytes for the flags (for now always zero)
>>    - length of name
>>    - name
>> statistics in 64-bit format
> 
> The binary format presented above is very flexible. I understand why it is
> organized this way.
> In our situation, the metrics data could be pulled periodically as short as
> half second. They are used by different kinds of monitors/triggers/alerts.
> To enhance efficiency and reduce traffic caused by metrics passing, we
> treat all metrics info/data as two kinds. One is immutable information,
> which doesn't change in a given system boot. The other is mutable
> data (statistics data), which is pulled/transferred periodically at a high
> frequency.

The format allows placing the values before the descriptions.  So you 
could use pread() to read only the first part of the file descriptor, and 
the file_operations implementation would then skip the work of building 
the immutable data.  It doesn't have to be implemented like that from the
beginning, but the above format supports it.
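
For example, a userspace refresh path could then be as simple as this
(a sketch that assumes the value offset and count were parsed once from
the header/descriptions):

#include <err.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Re-read only the mutable values; the immutable part is never touched. */
static void refresh_stats(int stats_fd, off_t value_offset,
			  uint64_t *values, size_t num_stats)
{
	ssize_t want = num_stats * sizeof(*values);

	if (pread(stats_fd, values, want, value_offset) != want)
		err(1, "pread stats values");
}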

Paolo
Jing Zhang March 12, 2021, 10:27 p.m. UTC | #11
Hi Paolo,

On Fri, Mar 12, 2021 at 12:11 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 10/03/21 22:41, Jing Zhang wrote:
> >> I would prefer a completely different interface, where you have a file
> >> descriptor that can be created and associated to a vCPU or VM (or even
> >> to /dev/kvm).  Having a file descriptor is important because the fd can
> >> be passed to a less-privileged process that takes care of gathering the
> >> metrics
> > Separate file descriptor solution is very tempting. We are still considering it
> > seriously. Our biggest concern is that the metrics gathering/handling process
> > is not necessary running on the same node as the one file descriptor belongs to.
> > It scales better to pass metrics data directly than to pass file descriptors.
>
> If you want to pass metrics data directly, you can just read the file
> descriptor from your VMM, just like you're using the ioctls now.
> However the file descriptor also allows a privilege-separated same-host
> interface.
It makes sense.
>
> >> 4 bytes flags (always zero)
Could you give some potential uses for this flag?
> >> 4 bytes number of statistics
> >> 4 bytes offset of the first stat description
> >> 4 bytes offset of the first stat value
> >> stat descriptions:
> >>    - 4 bytes for the type (for now always zero: uint64_t)
What is the potential use for this type? Should we move this outside the
descriptor, since all stats probably have the same size?
> >>    - 4 bytes for the flags (for now always zero)
Potential use for this flag?
> >>    - length of name
> >>    - name
> >> statistics in 64-bit format
> >
> > The binary format presented above is very flexible. I understand why it is
> > organized this way.
> > In our situation, the metrics data could be pulled periodically as short as
> > half second. They are used by different kinds of monitors/triggers/alerts.
> > To enhance efficiency and reduce traffic caused by metrics passing, we
> > treat all metrics info/data as two kinds. One is immutable information,
> > which doesn't change in a given system boot. The other is mutable
> > data (statistics data), which is pulled/transferred periodically at a high
> > frequency.
>
> The format allows to place the values before the descriptions.  So you
> could use pread to only read the first part of the file descriptor, and
> the file_operations implementation would then skip the work of building
> the immutable data.  It doesn't have to be implemented from the
> beginning like that, but the above format supports it.
Good point! I'll be working on the new fd-based interface and come back
with new patchset.
>
> Paolo
>
Thanks,
Jing
Paolo Bonzini March 13, 2021, 9:35 a.m. UTC | #12
On 12/03/21 23:27, Jing Zhang wrote:
>>>> 4 bytes flags (always zero)
> Could you give some potential use for this flag?

No idea, honestly.  It probably would signal the presence of more fields 
after "offset of the first stat value".  In general it's better to leave 
some room for extension.

>>>> 4 bytes number of statistics
>>>> 4 bytes offset of the first stat description
>>>> 4 bytes offset of the first stat value
>>>> stat descriptions:
>>>>     - 4 bytes for the type (for now always zero: uint64_t)
> Potential use for this type? Should we move this outside descriptor? Since
> all stats probably have the same size.

Yes, all stats should be 8 bytes.  But for example:

- 0 = uint64_t

- 1 = int64_t

- 0x80000000 | n: enum with n different values, which are stored after 
the name

>>>>     - 4 bytes for the flags (for now always zero)
> Potential use for this flag?

Looking back at Emanuele's statsfs, it could be:

- bit 0: can be cleared (by writing eight zero bytes in the statistics' 
offset)

- bit 1: cumulative value (count of events, can only grow) vs. 
instantaneous value (can go up or down)

This is currently stored in the debugfs file mode, so we can already use 
these flags.
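
So, as a sketch (the names are invented and nothing is defined in a uapi
yet), the two descriptor fields could be encoded as:

/* type */
#define KVM_STATS_TYPE_UINT64	0x0
#define KVM_STATS_TYPE_INT64	0x1
#define KVM_STATS_TYPE_ENUM	0x80000000	/* ORed with n, the number of values stored after the name */

/* flags */
#define KVM_STATS_FLAG_CLEARABLE	(1 << 0)	/* cleared by writing eight zero bytes at the value's offset */
#define KVM_STATS_FLAG_CUMULATIVE	(1 << 1)	/* only grows; otherwise an instantaneous value */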

Paolo
Jing Zhang March 15, 2021, 10:31 p.m. UTC | #13
Hi Paolo,

On Wed, Mar 10, 2021 at 8:55 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 10/03/21 01:30, Jing Zhang wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 383df23514b9..87dd62516c8b 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3464,6 +3464,51 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
> >               break;
> >       }
> > +     case KVM_STATS_GET_INFO: {
> > +             struct kvm_stats_info stats_info;
> > +
> > +             r = -EFAULT;
> > +             stats_info.num_stats = VCPU_STAT_COUNT;
> > +             if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
> > +                     goto out;
> > +             r = 0;
> > +             break;
> > +     }
> > +     case KVM_STATS_GET_NAMES: {
> > +             struct kvm_stats_names stats_names;
> > +
> > +             r = -EFAULT;
> > +             if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
> > +                     goto out;
> > +             r = -EINVAL;
> > +             if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
> > +                     goto out;
> > +
> > +             r = -EFAULT;
> > +             if (copy_to_user(argp + sizeof(stats_names),
> > +                             kvm_vcpu_stat_strings,
> > +                             VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
>
> The only reason to separate the strings in patch 1 is to pass them here.
>   But this is a poor API because it imposes a limit on the length of the
> statistics, and makes that length part of the binary interface.
>
> I would prefer a completely different interface, where you have a file
> descriptor that can be created and associated to a vCPU or VM (or even
> to /dev/kvm).  Having a file descriptor is important because the fd can
We are considering how to create the file descriptor. It might be risky
to create an extra fd for every vCPU: it would easily hit the fd limit for the
process or the system on machines running a ton of small VMs.
Creating an extra file descriptor for every VM looks like a better option.
We could then check per-vCPU stats through an ioctl on this VM fd by
passing the vCPU index.
What do you think?
> be passed to a less-privileged process that takes care of gathering the
> metrics
>
> The result of reading the file descriptor could be either ASCII or
> binary.  IMO the real cost lies in opening and reading a multitude of
> files rather than in the ASCII<->binary conversion.
>
> The format could be one of the following:
>
> * binary:
>
> 4 bytes flags (always zero)
> 4 bytes number of statistics
> 4 bytes offset of the first stat description
> 4 bytes offset of the first stat value
> stat descriptions:
>    - 4 bytes for the type (for now always zero: uint64_t)
>    - 4 bytes for the flags (for now always zero)
>    - length of name
>    - name
> statistics in 64-bit format
>
> * text:
>
> stat1_name uint64 123
> stat2_name uint64 456
> ...
>
> What do you think?
>
> Paolo
>
Paolo Bonzini March 16, 2021, 5:54 p.m. UTC | #14
On 15/03/21 23:31, Jing Zhang wrote:
> We are considering about how to create the file descriptor. It might be risky
> to create an extra fd for every vCPU. It will easily hit the fd limit for the
> process or the system for machines running a ton of small VMs.

You already have a file descriptor for every vCPU, but I agree that 
having twice as many is not very good.

> Looks like creating an extra file descriptor for every VM is a better option.
> And then we can check per vCPU stats through Ioctl of this VM fd by
> passing the vCPU index.

The file descriptor idea is not really infeasible I think (not just 
because the # of file descriptors is "only" doubled, but also because 
most of the time I think you'd only care about per-VM stats).

If you really believe it's not usable for you, you can use two ioctls to 
fill the description and the data respectively (i.e. ioctl(fd, 
KVM_GET_STATS_{DESCRIPTION,VALUES}, pdata)), using the same layout as 
below.  If called with a NULL argument, the ioctls return how much data 
they will fill in.

The (always zero) global flags can be replaced by the value returned by 
KVM_CHECK_EXTENSION.

The number of statistics can be obtained from ioctl(fd, 
KVM_GET_STATS_VALUES, NULL); just divide the returned value by 8.
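
Roughly, in userspace (remember that the ioctl names above are only a
proposal, so this is just a sketch):

#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Hypothetical: query the sizes with a NULL argument, then fetch the
 * immutable descriptions once and the values as often as needed. */
static void get_stats(int fd)
{
	long desc_len = ioctl(fd, KVM_GET_STATS_DESCRIPTION, NULL);
	long val_len  = ioctl(fd, KVM_GET_STATS_VALUES, NULL);

	if (desc_len < 0 || val_len < 0)
		err(1, "stats size query");

	void *desc = malloc(desc_len);		/* descriptions, fetched once */
	uint64_t *vals = malloc(val_len);	/* values, re-fetched periodically */
	size_t num_stats = val_len / sizeof(uint64_t);

	if (!desc || !vals)
		err(1, "malloc");
	if (ioctl(fd, KVM_GET_STATS_DESCRIPTION, desc) < 0 ||
	    ioctl(fd, KVM_GET_STATS_VALUES, vals) < 0)
		err(1, "stats fetch");
	/* num_stats entries in vals[], described by desc */
}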

Paolo

>> 4 bytes flags (always zero)
>> 4 bytes number of statistics
>> 4 bytes offset of the first stat description
>> 4 bytes offset of the first stat value
>> stat descriptions:
>>     - 4 bytes for the type (for now always zero: uint64_t)
>>     - 4 bytes for the flags (for now always zero)
>>     - length of name
>>     - name
>> statistics in 64-bit format

Patch

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 383df23514b9..87dd62516c8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3464,6 +3464,51 @@  static long kvm_vcpu_ioctl(struct file *filp,
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
 		break;
 	}
+	case KVM_STATS_GET_INFO: {
+		struct kvm_stats_info stats_info;
+
+		r = -EFAULT;
+		stats_info.num_stats = VCPU_STAT_COUNT;
+		if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_STATS_GET_NAMES: {
+		struct kvm_stats_names stats_names;
+
+		r = -EFAULT;
+		if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
+			goto out;
+		r = -EINVAL;
+		if (stats_names.size < VCPU_STAT_COUNT * KVM_STATS_NAME_LEN)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(argp + sizeof(stats_names),
+				kvm_vcpu_stat_strings,
+				VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_STATS_GET_DATA: {
+		struct kvm_stats_data stats_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
+			goto out;
+		r = -EINVAL;
+		if (stats_data.size < sizeof(vcpu->stat))
+			goto out;
+
+		r = -EFAULT;
+		argp += sizeof(stats_data);
+		if (copy_to_user(argp, &vcpu->stat, sizeof(vcpu->stat)))
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
@@ -3695,6 +3740,7 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_HALT_POLL:
+	case KVM_CAP_STATS_BINARY_FORM:
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -3825,6 +3871,40 @@  static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 	}
 }
 
+static long kvm_vm_ioctl_stats_get_data(struct kvm *kvm, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct kvm_vcpu *vcpu;
+	struct kvm_stats_data stats_data;
+	u64 *data = NULL, *pdata;
+	int i, j, ret = 0;
+	size_t dsize = (VM_STAT_COUNT + VCPU_STAT_COUNT) * sizeof(*data);
+
+
+	if (copy_from_user(&stats_data, argp, sizeof(stats_data)))
+		return -EFAULT;
+	if (stats_data.size < dsize)
+		return -EINVAL;
+	data = kzalloc(dsize, GFP_KERNEL_ACCOUNT);
+	if (!data)
+		return -ENOMEM;
+
+	for (i = 0; i < VM_STAT_COUNT; i++)
+		*(data + i) = *((ulong *)&kvm->stat + i);
+
+	kvm_for_each_vcpu(j, vcpu, kvm) {
+		pdata = data + VM_STAT_COUNT;
+		for (i = 0; i < VCPU_STAT_COUNT; i++, pdata++)
+			*pdata += *((u64 *)&vcpu->stat + i);
+	}
+
+	if (copy_to_user(argp + sizeof(stats_data), data, dsize))
+		ret = -EFAULT;
+
+	kfree(data);
+	return ret;
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4001,6 +4081,41 @@  static long kvm_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_STATS_GET_INFO: {
+		struct kvm_stats_info stats_info;
+
+		r = -EFAULT;
+		stats_info.num_stats = VM_STAT_COUNT + VCPU_STAT_COUNT;
+		if (copy_to_user(argp, &stats_info, sizeof(stats_info)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_STATS_GET_NAMES: {
+		struct kvm_stats_names stats_names;
+
+		r = -EFAULT;
+		if (copy_from_user(&stats_names, argp, sizeof(stats_names)))
+			goto out;
+		r = -EINVAL;
+		if (stats_names.size <
+			(VM_STAT_COUNT + VCPU_STAT_COUNT) * KVM_STATS_NAME_LEN)
+			goto out;
+		r = -EFAULT;
+		argp += sizeof(stats_names);
+		if (copy_to_user(argp, kvm_vm_stat_strings,
+				VM_STAT_COUNT * KVM_STATS_NAME_LEN))
+			goto out;
+		argp += VM_STAT_COUNT * KVM_STATS_NAME_LEN;
+		if (copy_to_user(argp, kvm_vcpu_stat_strings,
+				VCPU_STAT_COUNT * KVM_STATS_NAME_LEN))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_STATS_GET_DATA:
+		r =  kvm_vm_ioctl_stats_get_data(kvm, arg);
+		break;
 	case KVM_CHECK_EXTENSION:
 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
 		break;