diff mbox

[RFC,v2,47/47] qapi-introspect: Hide type names

Message ID 1435782155-31412-48-git-send-email-armbru@redhat.com
State New
Headers show

Commit Message

Markus Armbruster July 1, 2015, 8:22 p.m. UTC
To eliminate the temptation for clients to look up types by name
(which are not ABI), replace all type names by meaningless strings.

Reduces output of query-schema by 9 out of 80KiB.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
---
 scripts/qapi-introspect.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

Comments

Eric Blake July 24, 2015, 3:44 a.m. UTC | #1
On 07/01/2015 02:22 PM, Markus Armbruster wrote:
> To eliminate the temptation for clients to look up types by name
> (which are not ABI), replace all type names by meaningless strings.
> 
> Reduces output of query-schema by 9 out of 80KiB.

I'm not sure whether I like this or not.  It does make sense from the
perspective of forcing clients to stick to ABI queries, but makes it a
bit harder to navigate things except by automated scripts.

> 
> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> ---
>  scripts/qapi-introspect.py | 27 +++++++++++++++++++++++++--
>  1 file changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/scripts/qapi-introspect.py b/scripts/qapi-introspect.py
> index 961fe88..efb34ff 100644
> --- a/scripts/qapi-introspect.py
> +++ b/scripts/qapi-introspect.py
> @@ -9,13 +9,18 @@
>  # This work is licensed under the terms of the GNU GPL, version 2.
>  # See the COPYING file in the top-level directory.
>  
> +import string
>  from qapi import *
>  
> +def _b32digit(num):
> +    return (string.lowercase + string.digits[2:])[num]

This feels a bit too magic for me to decipher late at night.

> +
>  class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
>      def __init__(self):
>          self.schema = None
>          self.jsons = None
>          self.used_types = None
> +        self.name_map = None
>          self.defn = None
>          self.decl = None
>  
> @@ -23,13 +28,17 @@ class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
>          self.schema = schema
>          self.jsons = []
>          self.used_types = []
> +        self.name_map = {}
>          return QAPISchemaType   # don't visit types for now
>  
>      def visit_end(self):
>          # visit the types that are actually used
> +        jsons = self.jsons
> +        self.jsons = []
>          for typ in self.used_types:
>              typ.visit(self)
>          self.jsons.sort()
> +        jsons.extend(self.jsons)

I'm not sure I follow the use of .extends() here.

>          name = prefix + 'qmp_schema_json'
>          self.decl = mcgen('''
>  extern char %(c_name)s[];
> @@ -40,10 +49,19 @@ char %(c_name)s[] = "["
>      "%(c_jsons)s]";
>  ''',
>                            c_name=c_name(name),
> -                          c_jsons=', "\n    "'.join(self.jsons))
> +                          c_jsons=', "\n    "'.join(jsons))
>          self.schema = None
>          self.jsons = None
>          self.used_types = None
> +        self.name_map = None
> +
> +    def _name(self, name):
> +        if name not in self.name_map:
> +            n = len(self.name_map)
> +            self.name_map[name] = ':' + _b32digit(n / 32 / 32) \
> +                                  + _b32digit(n / 32 % 32) \
> +                                  + _b32digit(n % 32)

Caps our qapi to 32k types (including implicit ones); will that be an issue?

> +        return self.name_map[name]
>  
>      def _use_type(self, typ):
>          # Map the various integer types to plain int
> @@ -55,9 +73,14 @@ char %(c_name)s[] = "["
>          # Add type to work queue if new
>          if typ not in self.used_types:
>              self.used_types.append(typ)
> -        return typ.name
> +        # Clients should examine commands and events, not types.  Hide
> +        # type names to reduce the temptation.  Also saves a few
> +        # characters.
> +        return self._name(typ.name)
>  
>      def _gen_json(self, name, mtype, extra):
> +        if mtype != 'command' and mtype != 'event':
> +            name = self._name(name)
>          self.jsons.append("{ 'name': '%s', 'meta-type': '%s', %s }"
>                            % (name, mtype, extra))
>  
>
Eric Blake July 27, 2015, 4:15 p.m. UTC | #2
On 07/23/2015 09:44 PM, Eric Blake wrote:
> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>> To eliminate the temptation for clients to look up types by name
>> (which are not ABI), replace all type names by meaningless strings.
>>
>> Reduces output of query-schema by 9 out of 80KiB.
> 
> I'm not sure whether I like this or not.  It does make sense from the
> perspective of forcing clients to stick to ABI queries, but makes it a
> bit harder to navigate things except by automated scripts.
> 

>>  
>> +import string
>>  from qapi import *
>>  
>> +def _b32digit(num):
>> +    return (string.lowercase + string.digits[2:])[num]
> 
> This feels a bit too magic for me to decipher late at night.

Looking at this again after a weekend: you are doing a poor-man's base32
encoding that maps a number between 0 and 31 inclusive to
"abc...xyz234567" (skipping 0 and 1 due to some fonts being unclear with
O and l/I).

Question - if we really want to compress things down to an integer,
would it make any more sense to actually use an integer instead of a
string encoding of an unusual base32 alphabet (that is, output an
integer instead of a string)?  Conversely, if we are going to insist on
an encoding, can we use the RFC4648 base32 definition (upper case, not
lower case)?

More thoughts: I'm probably okay with hiding type names (since they
aren't ABI), but can we come up with an output format that is more
conducive for use by the end user?  Considering that we are returning a
JSON array, what if we return the integer offset of the type as recorded
in the returned array.  That is, if we have
 { 'name':'ACPI_DEVICE_OST', 'meta-type':'event', 'data':100 }
in slot [0] of the return array, then
 { 'name': ':aaa', 'meta-type': 'object', 'members': [ { 'name': 'info',
'type': 250 } ] }
in slot [100], then the definition for type ':ae5' in slot [250], and so
forth.  My goal here is not a topologically sorted return array, so much
as a shorthand where using an integer for a type-name means that we can
quickly locate that offset within the JSON array, instead of having to
do a linear search over the entire array for an array member that has
the matching name.  Or if type names are truly unimportant, then omit
names for type elements (by making name optional in the introspection
qapi description), and using ONLY offsets in the returned JSON array for
referring to types.  Of course, if we do this, life gets a lot trickier
for adding filtering down to a subset of the overall schema (unless you
don't mind populating lots of 'null' entries for parts that get filtered
out so that the parts that are displayed are always at the same array
offset, just with less overall output bulk due to the filtering).
Markus Armbruster July 28, 2015, 6:24 p.m. UTC | #3
Eric Blake <eblake@redhat.com> writes:

> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>> To eliminate the temptation for clients to look up types by name
>> (which are not ABI), replace all type names by meaningless strings.
>> 
>> Reduces output of query-schema by 9 out of 80KiB.
>
> I'm not sure whether I like this or not.  It does make sense from the
> perspective of forcing clients to stick to ABI queries, but makes it a
> bit harder to navigate things except by automated scripts.

Yes.  I'm not sure it's a good idea.  If we decide to hide types this
way, then I'd find an option to generate without type hiding useful.

>> Signed-off-by: Markus Armbruster <armbru@redhat.com>
>> ---
>>  scripts/qapi-introspect.py | 27 +++++++++++++++++++++++++--
>>  1 file changed, 25 insertions(+), 2 deletions(-)
>> 
>> diff --git a/scripts/qapi-introspect.py b/scripts/qapi-introspect.py
>> index 961fe88..efb34ff 100644
>> --- a/scripts/qapi-introspect.py
>> +++ b/scripts/qapi-introspect.py
>> @@ -9,13 +9,18 @@
>>  # This work is licensed under the terms of the GNU GPL, version 2.
>>  # See the COPYING file in the top-level directory.
>>  
>> +import string
>>  from qapi import *
>>  
>> +def _b32digit(num):
>> +    return (string.lowercase + string.digits[2:])[num]
>
> This feels a bit too magic for me to decipher late at night.
>
>> +
>>  class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
>>      def __init__(self):
>>          self.schema = None
>>          self.jsons = None
>>          self.used_types = None
>> +        self.name_map = None
>>          self.defn = None
>>          self.decl = None
>>  
>> @@ -23,13 +28,17 @@ class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
>>          self.schema = schema
>>          self.jsons = []
>>          self.used_types = []
>> +        self.name_map = {}
>>          return QAPISchemaType   # don't visit types for now
>>  
>>      def visit_end(self):
>>          # visit the types that are actually used
>> +        jsons = self.jsons
>> +        self.jsons = []
>>          for typ in self.used_types:
>>              typ.visit(self)
>>          self.jsons.sort()
>> +        jsons.extend(self.jsons)
>
> I'm not sure I follow the use of .extends() here.

jsons.extend(self.jsons) appends the elements of array self.jsons to
array jsons.

Before the patch:

At function entry, self.jsons contains introspection information for the
non-types.

The loop visits the types.  This adds introspection information for the
types to self.json.  Sort self.jsons by name.

What the patch adds:

Move the introspection information for the non-types out of the way
before the loop, append the information on types afterwards.  The result
is now in jsons rather than self.jsons (see the next patch hunk).

Why it does that:

With the funny typenames, sorting everything by name results in a mess.
Keeping non-types and types separate is less of a mess.

>>          name = prefix + 'qmp_schema_json'
>>          self.decl = mcgen('''
>>  extern char %(c_name)s[];
>> @@ -40,10 +49,19 @@ char %(c_name)s[] = "["
>>      "%(c_jsons)s]";
>>  ''',
>>                            c_name=c_name(name),
>> -                          c_jsons=', "\n    "'.join(self.jsons))
>> +                          c_jsons=', "\n    "'.join(jsons))
>>          self.schema = None
>>          self.jsons = None
>>          self.used_types = None
>> +        self.name_map = None
>> +
>> +    def _name(self, name):
>> +        if name not in self.name_map:
>> +            n = len(self.name_map)
>> +            self.name_map[name] = ':' + _b32digit(n / 32 / 32) \
>> +                                  + _b32digit(n / 32 % 32) \
>> +                                  + _b32digit(n % 32)
>
> Caps our qapi to 32k types (including implicit ones); will that be an issue?

I doubt it :)

Should we ever hit the limit, we can simply add another base32
character.

>> +        return self.name_map[name]
>>  
>>      def _use_type(self, typ):
>>          # Map the various integer types to plain int
>> @@ -55,9 +73,14 @@ char %(c_name)s[] = "["
>>          # Add type to work queue if new
>>          if typ not in self.used_types:
>>              self.used_types.append(typ)
>> -        return typ.name
>> +        # Clients should examine commands and events, not types.  Hide
>> +        # type names to reduce the temptation.  Also saves a few
>> +        # characters.
>> +        return self._name(typ.name)
>>  
>>      def _gen_json(self, name, mtype, extra):
>> +        if mtype != 'command' and mtype != 'event':
>> +            name = self._name(name)
>>          self.jsons.append("{ 'name': '%s', 'meta-type': '%s', %s }"
>>                            % (name, mtype, extra))
>>  
>>
Markus Armbruster July 28, 2015, 6:39 p.m. UTC | #4
Eric Blake <eblake@redhat.com> writes:

> On 07/23/2015 09:44 PM, Eric Blake wrote:
>> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>>> To eliminate the temptation for clients to look up types by name
>>> (which are not ABI), replace all type names by meaningless strings.
>>>
>>> Reduces output of query-schema by 9 out of 80KiB.
>> 
>> I'm not sure whether I like this or not.  It does make sense from the
>> perspective of forcing clients to stick to ABI queries, but makes it a
>> bit harder to navigate things except by automated scripts.
>> 
>
>>>  
>>> +import string
>>>  from qapi import *
>>>  
>>> +def _b32digit(num):
>>> +    return (string.lowercase + string.digits[2:])[num]
>> 
>> This feels a bit too magic for me to decipher late at night.
>
> Looking at this again after a weekend: you are doing a poor-man's base32
> encoding that maps a number between 0 and 31 inclusive to
> "abc...xyz234567" (skipping 0 and 1 due to some fonts being unclear with
> O and l/I).
>
> Question - if we really want to compress things down to an integer,
> would it make any more sense to actually use an integer instead of a
> string encoding of an unusual base32 alphabet (that is, output an
> integer instead of a string)?  Conversely, if we are going to insist on
> an encoding, can we use the RFC4648 base32 definition (upper case, not
> lower case)?

I don't remember why I did lowercase.  Possibly an accident at the tail
end of a mad hacking spree.  Uppercase is fine with me.

Could do plain integer.  I guess I started down the base32 road to
squeeze out a few more characters, then sabotaged myself by always using
three base32 characters.

> More thoughts: I'm probably okay with hiding type names (since they
> aren't ABI), but can we come up with an output format that is more
> conducive for use by the end user?  Considering that we are returning a
> JSON array, what if we return the integer offset of the type as recorded
> in the returned array.  That is, if we have
>  { 'name':'ACPI_DEVICE_OST', 'meta-type':'event', 'data':100 }
> in slot [0] of the return array, then
>  { 'name': ':aaa', 'meta-type': 'object', 'members': [ { 'name': 'info',
> 'type': 250 } ] }
> in slot [100], then the definition for type ':ae5' in slot [250], and so
> forth.  My goal here is not a topologically sorted return array, so much
> as a shorthand where using an integer for a type-name means that we can
> quickly locate that offset within the JSON array, instead of having to
> do a linear search over the entire array for an array member that has
> the matching name.

In the introspection schema, every 'str' that's really a type name needs
to be replaced by 'int'.

Should we later decide we don't want to hide type names after all, then
backward compatibility will make it very hard to go back.

I wouldn't expect clients to find stuff with a linear search.  Use a
dictionary.  Should be plenty fast enough for processing the schema.

>                     Or if type names are truly unimportant, then omit
> names for type elements (by making name optional in the introspection
> qapi description), and using ONLY offsets in the returned JSON array for
> referring to types.  Of course, if we do this, life gets a lot trickier
> for adding filtering down to a subset of the overall schema (unless you
> don't mind populating lots of 'null' entries for parts that get filtered
> out so that the parts that are displayed are always at the same array
> offset, just with less overall output bulk due to the filtering).

Filtering is a headache I'd prefer to avoid.
Eric Blake July 28, 2015, 9:26 p.m. UTC | #5
On 07/28/2015 12:39 PM, Markus Armbruster wrote:

> 
> Could do plain integer.  I guess I started down the base32 road to
> squeeze out a few more characters, then sabotaged myself by always using
> three base32 characters.
> 

> 
> In the introspection schema, every 'str' that's really a type name needs
> to be replaced by 'int'.

Sadly, our introspection union is based on 'name':'str',
'meta-type':'...' as the base members of the union; and that 'name'
would be one of the places where we'd want an integer; so we probably
have to stick to a string.  That said, using a string-ized integer may
be a bit more legible (for some reason, humans are better at reading
decimal numbers than base32 numbers), and since no valid command or
event starts with an integer, we could use the stringized integer
directly without having to use a colon (in fact, for the first 999 types
encountered, a stringized integer is shorter than the base32 compression
+ the leading colon for namespacing) :)

> 
> Should we later decide we don't want to hide type names after all, then
> backward compatibility will make it very hard to go back.

Indeed - changing 'str' to 'int' in the schema is a much stronger
commitment than leaving things 'str' but passing a stringized integer in
that string.

> 
> I wouldn't expect clients to find stuff with a linear search.  Use a
> dictionary.  Should be plenty fast enough for processing the schema.

Requires more memory in the client to create that hash table on the one
pass through the JSON - but as is often the case in computer science,
asking the client to trade space for time is not too onerous.
(Libvirt's current JSON parser does NOT create a hash table of the
dictionaries that it reads, so libvirt would actually have to do two
passes and/or update its use of libyajl into creating hash tables when
appropriate - but that should not be a driving factor in your design.)

> 
>>                     Or if type names are truly unimportant, then omit
>> names for type elements (by making name optional in the introspection
>> qapi description), and using ONLY offsets in the returned JSON array for
>> referring to types.  Of course, if we do this, life gets a lot trickier
>> for adding filtering down to a subset of the overall schema (unless you
>> don't mind populating lots of 'null' entries for parts that get filtered
>> out so that the parts that are displayed are always at the same array
>> offset, just with less overall output bulk due to the filtering).
> 
> Filtering is a headache I'd prefer to avoid.

Well, since I've had most of the ideas about how filtering could even be
done, I'm perfectly okay with you leaving the guts of filtering for me
(or someone else) to do as a followup series; and even then, I first
have to decide if it would help libvirt to have a filtered list.  What's
important for this series is that we haven't precluded the possibility
of adding filtering later, and I think we've succeeded at that.  And as
for using raw integers to represent offsets into the returned JSON
array, I think that is a bit too brittle; so I'm happy with forcing
clients to create their own dictionary/hash lookup of string type names,
even if the strings are munged to avoid leaking qapi types as non-ABI.
Eric Blake July 28, 2015, 9:32 p.m. UTC | #6
On 07/28/2015 12:24 PM, Markus Armbruster wrote:
> Eric Blake <eblake@redhat.com> writes:
> 
>> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>>> To eliminate the temptation for clients to look up types by name
>>> (which are not ABI), replace all type names by meaningless strings.
>>>
>>> Reduces output of query-schema by 9 out of 80KiB.
>>
>> I'm not sure whether I like this or not.  It does make sense from the
>> perspective of forcing clients to stick to ABI queries, but makes it a
>> bit harder to navigate things except by automated scripts.
> 
> Yes.  I'm not sure it's a good idea.  If we decide to hide types this
> way, then I'd find an option to generate without type hiding useful.

As in, an optional boolean flag to the QMP command that requests whether
to get compact output with hidden names vs. full output with qapi names
exposed (but then, are we storing TWO copies of the introspection
strings)?  Or merely as in the generated qmp-introspect.c file having
strategic comments so that reading _that_ file lets you see the type
names, even if they don't get passed on to the end user?


> What the patch adds:
> 
> Move the introspection information for the non-types out of the way
> before the loop, append the information on types afterwards.  The result
> is now in jsons rather than self.jsons (see the next patch hunk).

I did notice that; pre-patch interleaved commands and types all
according to a global namespace, while post-patch sank all types to the
bottom.  Interestingly enough, if libvirt is going to query what
features a command has, it will first find the command (in the first
half of the returned array), then resolve the types used by that command
until it learns if the member is present (whether new enum value or
added dictionary member) - so there is that slight optimization that if
we guarantee that types are always output last, then libvirt doesn't
have to start generating its own hash table lookup of types until the
first type is seen, after already learning everything it needed from the
earlier command listings.

> 
> Why it does that:
> 
> With the funny typenames, sorting everything by name results in a mess.
> Keeping non-types and types separate is less of a mess.

You do have a point there - as soon as we introduce name aliases, the
aliases are unlikely to sort in the same manner as the original types,
particularly if we generate the aliases based solely on what order we
detected that those particular types were in use.
Eric Blake July 28, 2015, 11:19 p.m. UTC | #7
On 07/01/2015 02:22 PM, Markus Armbruster wrote:
> To eliminate the temptation for clients to look up types by name
> (which are not ABI), replace all type names by meaningless strings.
> 
> Reduces output of query-schema by 9 out of 80KiB.

Among other things, it replaced all instances of 'str' with ':acg', and
all instances of 'int' with ':adu' (at least for the qapi-schema.json
files at the point in qemu.git that I tested on); if you were to tweak
things to NOT rename builtin types (limiting the renaming to just object
and array types), you'd save even more space and make libvirt's life
easier by not masking builtin types.  (Yes, the 'meta-type':'builtin'
entry for 'str' should still be present whether or not we hide the name
'str', but knowing the small set of builtin names up front and only
having to chase objects and arrays seems a bit more efficient)
Markus Armbruster July 29, 2015, 9:24 a.m. UTC | #8
Eric Blake <eblake@redhat.com> writes:

> On 07/28/2015 12:39 PM, Markus Armbruster wrote:
>
>> 
>> Could do plain integer.  I guess I started down the base32 road to
>> squeeze out a few more characters, then sabotaged myself by always using
>> three base32 characters.
>> 
>
>> 
>> In the introspection schema, every 'str' that's really a type name needs
>> to be replaced by 'int'.
>
> Sadly, our introspection union is based on 'name':'str',
> 'meta-type':'...' as the base members of the union; and that 'name'
> would be one of the places where we'd want an integer; so we probably
> have to stick to a string.

We could use an alternate.  Feels like gratuitous complexity to me,
though.

>                             That said, using a string-ized integer may
> be a bit more legible (for some reason, humans are better at reading
> decimal numbers than base32 numbers), and since no valid command or
> event starts with an integer, we could use the stringized integer
> directly without having to use a colon (in fact, for the first 999 types
> encountered, a stringized integer is shorter than the base32 compression
> + the leading colon for namespacing) :)

I'll try and see how that pans out.

>> Should we later decide we don't want to hide type names after all, then
>> backward compatibility will make it very hard to go back.
>
> Indeed - changing 'str' to 'int' in the schema is a much stronger
> commitment than leaving things 'str' but passing a stringized integer in
> that string.
>
>> 
>> I wouldn't expect clients to find stuff with a linear search.  Use a
>> dictionary.  Should be plenty fast enough for processing the schema.
>
> Requires more memory in the client to create that hash table on the one
> pass through the JSON - but as is often the case in computer science,
> asking the client to trade space for time is not too onerous.
> (Libvirt's current JSON parser does NOT create a hash table of the
> dictionaries that it reads, so libvirt would actually have to do two
> passes and/or update its use of libyajl into creating hash tables when
> appropriate - but that should not be a driving factor in your design.)

For completeness: dictionary needn't be implemented as hash table.

>>>                     Or if type names are truly unimportant, then omit
>>> names for type elements (by making name optional in the introspection
>>> qapi description), and using ONLY offsets in the returned JSON array for
>>> referring to types.  Of course, if we do this, life gets a lot trickier
>>> for adding filtering down to a subset of the overall schema (unless you
>>> don't mind populating lots of 'null' entries for parts that get filtered
>>> out so that the parts that are displayed are always at the same array
>>> offset, just with less overall output bulk due to the filtering).
>> 
>> Filtering is a headache I'd prefer to avoid.
>
> Well, since I've had most of the ideas about how filtering could even be
> done, I'm perfectly okay with you leaving the guts of filtering for me
> (or someone else) to do as a followup series; and even then, I first
> have to decide if it would help libvirt to have a filtered list.  What's
> important for this series is that we haven't precluded the possibility
> of adding filtering later, and I think we've succeeded at that.  And as
> for using raw integers to represent offsets into the returned JSON
> array, I think that is a bit too brittle; so I'm happy with forcing
> clients to create their own dictionary/hash lookup of string type names,
> even if the strings are munged to avoid leaking qapi types as non-ABI.

Okay.  Thanks!
Markus Armbruster July 29, 2015, 9:34 a.m. UTC | #9
Eric Blake <eblake@redhat.com> writes:

> On 07/28/2015 12:24 PM, Markus Armbruster wrote:
>> Eric Blake <eblake@redhat.com> writes:
>> 
>>> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>>>> To eliminate the temptation for clients to look up types by name
>>>> (which are not ABI), replace all type names by meaningless strings.
>>>>
>>>> Reduces output of query-schema by 9 out of 80KiB.
>>>
>>> I'm not sure whether I like this or not.  It does make sense from the
>>> perspective of forcing clients to stick to ABI queries, but makes it a
>>> bit harder to navigate things except by automated scripts.
>> 
>> Yes.  I'm not sure it's a good idea.  If we decide to hide types this
>> way, then I'd find an option to generate without type hiding useful.
>
> As in, an optional boolean flag to the QMP command that requests whether
> to get compact output with hidden names vs. full output with qapi names
> exposed (but then, are we storing TWO copies of the introspection
> strings)?  Or merely as in the generated qmp-introspect.c file having
> strategic comments so that reading _that_ file lets you see the type
> names, even if they don't get passed on to the end user?

I was thinking of optionally unhidden type names, for when you messed up
the output, and the stupid hidden type names make it hard to see what
exactly's wrong.  I.e. the option is merely a development aid.

>> What the patch adds:
>> 
>> Move the introspection information for the non-types out of the way
>> before the loop, append the information on types afterwards.  The result
>> is now in jsons rather than self.jsons (see the next patch hunk).
>
> I did notice that; pre-patch interleaved commands and types all
> according to a global namespace, while post-patch sank all types to the
> bottom.  Interestingly enough, if libvirt is going to query what
> features a command has, it will first find the command (in the first
> half of the returned array), then resolve the types used by that command
> until it learns if the member is present (whether new enum value or
> added dictionary member) - so there is that slight optimization that if
> we guarantee that types are always output last, then libvirt doesn't
> have to start generating its own hash table lookup of types until the
> first type is seen, after already learning everything it needed from the
> earlier command listings.

I'm reluctant to complicate the contract with such a promise, though.

I guess I'd simply slurp in the JSON, then do a quick pass over the
resulting data structure to build an index.

>> Why it does that:
>> 
>> With the funny typenames, sorting everything by name results in a mess.
>> Keeping non-types and types separate is less of a mess.
>
> You do have a point there - as soon as we introduce name aliases, the
> aliases are unlikely to sort in the same manner as the original types,

s/are unlikely to/won't/ :)

> particularly if we generate the aliases based solely on what order we
> detected that those particular types were in use.

That's the easy order.  Other orders are possible, but they're extra
work.  Best to keep things simple.
Markus Armbruster July 29, 2015, 9:35 a.m. UTC | #10
Eric Blake <eblake@redhat.com> writes:

> On 07/01/2015 02:22 PM, Markus Armbruster wrote:
>> To eliminate the temptation for clients to look up types by name
>> (which are not ABI), replace all type names by meaningless strings.
>> 
>> Reduces output of query-schema by 9 out of 80KiB.
>
> Among other things, it replaced all instances of 'str' with ':acg', and
> all instances of 'int' with ':adu' (at least for the qapi-schema.json
> files at the point in qemu.git that I tested on); if you were to tweak
> things to NOT rename builtin types (limiting the renaming to just object
> and array types), you'd save even more space and make libvirt's life
> easier by not masking builtin types.  (Yes, the 'meta-type':'builtin'
> entry for 'str' should still be present whether or not we hide the name
> 'str', but knowing the small set of builtin names up front and only
> having to chase objects and arrays seems a bit more efficient)

You're right, hiding builtin types is pointless, they're ABI.
Eric Blake July 29, 2015, 4:03 p.m. UTC | #11
On 07/29/2015 03:34 AM, Markus Armbruster wrote:

>>>> I'm not sure whether I like this or not.  It does make sense from the
>>>> perspective of forcing clients to stick to ABI queries, but makes it a
>>>> bit harder to navigate things except by automated scripts.
>>>
>>> Yes.  I'm not sure it's a good idea.  If we decide to hide types this
>>> way, then I'd find an option to generate without type hiding useful.
>>
>> As in, an optional boolean flag to the QMP command that requests whether
>> to get compact output with hidden names vs. full output with qapi names
>> exposed (but then, are we storing TWO copies of the introspection
>> strings)?  Or merely as in the generated qmp-introspect.c file having
>> strategic comments so that reading _that_ file lets you see the type
>> names, even if they don't get passed on to the end user?
> 
> I was thinking of optionally unhidden type names, for when you messed up
> the output, and the stupid hidden type names make it hard to see what
> exactly's wrong.  I.e. the option is merely a development aid.

Ah, an option to qapi-introspect.py that says to generate the file
differently, but which is not normally enabled and therefore not exposed
to the QMP end user.  That, or unconditional strategic comments, both do
the trick for me.

> 
>>> What the patch adds:
>>>
>>> Move the introspection information for the non-types out of the way
>>> before the loop, append the information on types afterwards.  The result
>>> is now in jsons rather than self.jsons (see the next patch hunk).
>>
>> I did notice that; pre-patch interleaved commands and types all
>> according to a global namespace, while post-patch sank all types to the
>> bottom.  Interestingly enough, if libvirt is going to query what
>> features a command has, it will first find the command (in the first
>> half of the returned array), then resolve the types used by that command
>> until it learns if the member is present (whether new enum value or
>> added dictionary member) - so there is that slight optimization that if
>> we guarantee that types are always output last, then libvirt doesn't
>> have to start generating its own hash table lookup of types until the
>> first type is seen, after already learning everything it needed from the
>> earlier command listings.
> 
> I'm reluctant to complicate the contract with such a promise, though.

Concur. If we aren't going to guarantee sorting, then we also should not
guarantee that all commands come before any types, but that the overall
output is just an entire lookup dictionary where any entry can hold any
metatype.

> 
> I guess I'd simply slurp in the JSON, then do a quick pass over the
> resulting data structure to build an index.

Yep, that's probably what I'll have to code into libvirt.

> 
> That's the easy order.  Other orders are possible, but they're extra
> work.  Best to keep things simple.

Simplest for qemu is no order at all (and the client has to take on the
burden of sorting things as desired).  Not always the smartest (if every
client wants to sort, we've pushed a lot of duplicated work onto each
client instead of doing the work ourselves), but we have right up until
2.5 is released to commit to our decision of whether to sort in qemu or not.
diff mbox

Patch

diff --git a/scripts/qapi-introspect.py b/scripts/qapi-introspect.py
index 961fe88..efb34ff 100644
--- a/scripts/qapi-introspect.py
+++ b/scripts/qapi-introspect.py
@@ -9,13 +9,18 @@ 
 # This work is licensed under the terms of the GNU GPL, version 2.
 # See the COPYING file in the top-level directory.
 
+import string
 from qapi import *
 
+def _b32digit(num):
+    return (string.lowercase + string.digits[2:])[num]
+
 class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
     def __init__(self):
         self.schema = None
         self.jsons = None
         self.used_types = None
+        self.name_map = None
         self.defn = None
         self.decl = None
 
@@ -23,13 +28,17 @@  class QAPISchemaGenIntrospectVisitor(QAPISchemaVisitor):
         self.schema = schema
         self.jsons = []
         self.used_types = []
+        self.name_map = {}
         return QAPISchemaType   # don't visit types for now
 
     def visit_end(self):
         # visit the types that are actually used
+        jsons = self.jsons
+        self.jsons = []
         for typ in self.used_types:
             typ.visit(self)
         self.jsons.sort()
+        jsons.extend(self.jsons)
         name = prefix + 'qmp_schema_json'
         self.decl = mcgen('''
 extern char %(c_name)s[];
@@ -40,10 +49,19 @@  char %(c_name)s[] = "["
     "%(c_jsons)s]";
 ''',
                           c_name=c_name(name),
-                          c_jsons=', "\n    "'.join(self.jsons))
+                          c_jsons=', "\n    "'.join(jsons))
         self.schema = None
         self.jsons = None
         self.used_types = None
+        self.name_map = None
+
+    def _name(self, name):
+        if name not in self.name_map:
+            n = len(self.name_map)
+            self.name_map[name] = ':' + _b32digit(n / 32 / 32) \
+                                  + _b32digit(n / 32 % 32) \
+                                  + _b32digit(n % 32)
+        return self.name_map[name]
 
     def _use_type(self, typ):
         # Map the various integer types to plain int
@@ -55,9 +73,14 @@  char %(c_name)s[] = "["
         # Add type to work queue if new
         if typ not in self.used_types:
             self.used_types.append(typ)
-        return typ.name
+        # Clients should examine commands and events, not types.  Hide
+        # type names to reduce the temptation.  Also saves a few
+        # characters.
+        return self._name(typ.name)
 
     def _gen_json(self, name, mtype, extra):
+        if mtype != 'command' and mtype != 'event':
+            name = self._name(name)
         self.jsons.append("{ 'name': '%s', 'meta-type': '%s', %s }"
                           % (name, mtype, extra))