Message ID | 1475040180-10770-5-git-send-email-dja@axtens.net |
---|---|
State | Accepted |
Headers | show |
On 28 Sep 15:22, Daniel Axtens wrote: > If there is a non-ascii character in a header, parsing fails, > even on Py27. > > This has huge Py2/Py3 complexities. The Py3 email package has tools > to handle this - we just need to use them. Py2, on the other hand, > needs a lot of hand-holding, as explained in the comments. > > Additionally, support headers that claim an encoding, but fail to > decode with that encoding. > > This is handy for mails with malformed headers containing weird > bytes. > > Reported-by: Thomas Monjalon <thomas.monjalon@6wind.com> > Suggested-by: Stephen Finucane <stephenfinucane@hotmail.com> > Signed-off-by: Daniel Axtens <dja@axtens.net> Yes, this is much better. Thanks for taking the feedback on board. Some questions below. Stephen > > --- > > Many thanks to Thomas for his help debugging this, and to Stephen > for insights leading to a much better patch - twice! > > This should probably go to a stable branch too. We'll need to start > some discussion about how to handle bug fixes for people not running > git mainline (like ozlabs.org and kernel.org). > > --- > > v2: - refactor to generalise header handling. Thanks Stephen. > - support headers that claim an encoding but don't decode > This actually leads to slightly simpler patch. > --- > patchwork/parser.py | 93 +++++++++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 79 insertions(+), 14 deletions(-) > > diff --git a/patchwork/parser.py b/patchwork/parser.py > index 7c05479b33bc..fbc36125c2ec 100644 > --- a/patchwork/parser.py > +++ b/patchwork/parser.py > @@ -21,7 +21,7 @@ > > import codecs > import datetime > -from email.header import Header, decode_header > +from email.header import decode_header, make_header > from email.utils import parsedate_tz, mktime_tz > from fnmatch import fnmatch > from functools import reduce > @@ -49,19 +49,81 @@ def normalise_space(value): > return whitespace_re.sub(' ', value).strip() > > > +def sanitise_header(header_contents, header_name=None): > + """Given a header with header_contents, optionally labelled > + header_name, decode it with decode_header, sanitise it to make > + sure it decodes correctly and contains no invalid characters. > + Then encode the result with make_header() > + """ > + > + # We have some Py2/Py3 issues here. > + # > + # Firstly, the email parser (before we get here) > + # Python 3: headers with weird chars are email.header.Header > + # class, others as str > + # Python 2: every header is an str > + # > + # Secondly, the behaviour of decode_header: > + # Python 3: weird headers are labelled as unknown-8bit > + # Python 2: weird headers are not labelled differently > + # > + # Lastly, aking matters worse, in Python2, unknown-8bit doesn't > + # seem to be supported as an input to make_header, so not only do > + # we have to detect dodgy headers, we have to fix them ourselves. > + # > + # We solve this by catching any Unicode errors, and then manually > + # handling any interesting headers. I'm going to move all the above into the docstring, if that's OK by you. > + > + value = decode_header(header_contents) > + try: > + header = make_header(value, > + header_name=header_name, > + continuation_ws='\t') > + except UnicodeDecodeError: > + # At least one of the parts cannot be encoded as ascii. > + # Find out which one and fix it somehow. > + # > + # We get here under Py2 when there's non-7-bit chars in header, > + # or under Py2 or Py3 where decoding with the coding hint fails. > + > + new_value=[] > + for (part, coding) in value: > + # We have random bytes that aren't properly coded. > + # If we had a coding hint, it failed to help. > + if six.PY3: > + # python3 - force coding to unknown-8bit > + new_value += [(part, 'unknown-8bit')] > + else: > + # python2 - no support in make_header for unknown-8bit > + # We should do unknown-8bit coding ourselves. > + # For now, we're just going to replace any dubious > + # chars with ?. > + # > + # TODO: replace it with a proper QP unknown-8bit codec. How could we resolve this TODO in the future? > + new_value += [(part.decode('ascii', errors='replace') > + .encode('ascii', errors='replace'), > + None)] > + > + header = make_header(new_value, > + header_name=header_name, > + continuation_ws='\t') > + > + return header > + > + > def clean_header(header): > """Decode (possibly non-ascii) headers.""" > - def decode(fragment): > - (frag_str, frag_encoding) = fragment > - if frag_encoding: > - return frag_str.decode(frag_encoding) > - elif isinstance(frag_str, six.binary_type): # python 2 > - return frag_str.decode() > - return frag_str > > - fragments = [decode(x) for x in decode_header(header)] > + sane_header = sanitise_header(header) > > - return normalise_space(u' '.join(fragments)) > + # on Py2, we want to do unicode(), on Py3, str(). > + # That gets us the decoded, un-wrapped header. > + if six.PY2: > + header_str = unicode(sane_header) > + else: > + header_str = str(sane_header) nit: this looks like something we could use the 'six.u()' function for? https://pythonhosted.org/six/#six.u > + > + return normalise_space(header_str) > > > def find_project_by_id(list_id): > @@ -154,10 +216,13 @@ def find_date(mail): > > > def find_headers(mail): > - return reduce(operator.__concat__, > - ['%s: %s\n' % (k, Header(v, header_name=k, > - continuation_ws='\t').encode()) > - for (k, v) in list(mail.items())]) > + headers = [(key, sanitise_header(value, header_name=key)) > + for key, value in mail.items()] > + > + strings = [('%s: %s' % (key, header.encode())) How come '.encode' is suitable here, yet we need to call 'unicode'/'str' in the 'clean_header' function? > + for (key, header) in headers] > + > + return '\n'.join(strings) > > > def find_references(mail): > -- > 2.7.4 >
>> +def sanitise_header(header_contents, header_name=None): >> + """Given a header with header_contents, optionally labelled >> + header_name, decode it with decode_header, sanitise it to make >> + sure it decodes correctly and contains no invalid characters. >> + Then encode the result with make_header() >> + """ >> + >> + # We have some Py2/Py3 issues here. >> + # >> + # Firstly, the email parser (before we get here) >> + # Python 3: headers with weird chars are email.header.Header >> + # class, others as str >> + # Python 2: every header is an str >> + # >> + # Secondly, the behaviour of decode_header: >> + # Python 3: weird headers are labelled as unknown-8bit >> + # Python 2: weird headers are not labelled differently >> + # >> + # Lastly, aking matters worse, in Python2, unknown-8bit doesn't >> + # seem to be supported as an input to make_header, so not only do >> + # we have to detect dodgy headers, we have to fix them ourselves. >> + # >> + # We solve this by catching any Unicode errors, and then manually >> + # handling any interesting headers. > > I'm going to move all the above into the docstring, if that's OK by > you. I don't mind, but my understanding was that the docstring described the function, rather than the implementation details, hence why I had it in a comment originally. >> + # python2 - no support in make_header for unknown-8bit >> + # We should do unknown-8bit coding ourselves. >> + # For now, we're just going to replace any dubious >> + # chars with ?. >> + # >> + # TODO: replace it with a proper QP unknown-8bit codec. > > How could we resolve this TODO in the future? > I think we use email.charset. Possibly we can copy some of the Py3 code that implements unknown-8bit. It all looks quite finnicky. Also, any time we get a message with an invalid header and we're relying on the patchwork header display to figure it out is an edge case on top of another edge case, so I didn't think it worth much more time. >> + # on Py2, we want to do unicode(), on Py3, str(). >> + # That gets us the decoded, un-wrapped header. >> + if six.PY2: >> + header_str = unicode(sane_header) >> + else: >> + header_str = str(sane_header) > > nit: this looks like something we could use the 'six.u()' function for? > > https://pythonhosted.org/six/#six.u > I thought this too. Then I looked at the code for django.utils.six: https://github.com/django/django/blob/master/django/utils/six.py#L646 def u(s): return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape") That won't work because we're passing in a header class to u(). The header class, as far as I know, doesn't provide .replace(), and even if it did it's not clear we want to call it. >> + strings = [('%s: %s' % (key, header.encode())) > > How come '.encode' is suitable here, yet we need to call 'unicode'/'str' > in the 'clean_header' function? Because the header class overrides .encode(), __unicode__ and __str__, and they do different things. For headers: - str (py3) and unicode (py2) provides a non-line-wrapped version of the header, in unicode - that is, not in quoted-printable or base64 7-bit form. - encode provides a line-wrapped, 7-bit encoded form of the header. For find_headers, we're creating the headers that are displayed when you click on the 'show' link on the patch display, under Details, and just beneath the message id and state. These are supposed to be fully encoded and line wrapped: they're supposed to look like mail headers. For clean_header, we're looking for the human readable form: decoded From quoted printable or base64, converted from whatever charset they were in, and not wrapped. Hence the code. Admittedly this is not super clear, but I blame the API of the email.Header module. Regards, Daniel
On 29 Sep 09:01, Daniel Axtens wrote: > >> +def sanitise_header(header_contents, header_name=None): > >> + """Given a header with header_contents, optionally labelled > >> + header_name, decode it with decode_header, sanitise it to make > >> + sure it decodes correctly and contains no invalid characters. > >> + Then encode the result with make_header() > >> + """ > >> + > >> + # We have some Py2/Py3 issues here. > >> + # > >> + # Firstly, the email parser (before we get here) > >> + # Python 3: headers with weird chars are email.header.Header > >> + # class, others as str > >> + # Python 2: every header is an str > >> + # > >> + # Secondly, the behaviour of decode_header: > >> + # Python 3: weird headers are labelled as unknown-8bit > >> + # Python 2: weird headers are not labelled differently > >> + # > >> + # Lastly, aking matters worse, in Python2, unknown-8bit doesn't > >> + # seem to be supported as an input to make_header, so not only do > >> + # we have to detect dodgy headers, we have to fix them ourselves. > >> + # > >> + # We solve this by catching any Unicode errors, and then manually > >> + # handling any interesting headers. > > > > I'm going to move all the above into the docstring, if that's OK by > > you. > > I don't mind, but my understanding was that the docstring described the > function, rather than the implementation details, hence why I had it in > a comment originally. You're right. Let's keep this as is. > >> + # python2 - no support in make_header for unknown-8bit > >> + # We should do unknown-8bit coding ourselves. > >> + # For now, we're just going to replace any dubious > >> + # chars with ?. > >> + # > >> + # TODO: replace it with a proper QP unknown-8bit codec. > > > > How could we resolve this TODO in the future? > > > > I think we use email.charset. Possibly we can copy some of the Py3 code > that implements unknown-8bit. It all looks quite finnicky. Also, any > time we get a message with an invalid header and we're relying on the > patchwork header display to figure it out is an edge case on top of > another edge case, so I didn't think it worth much more time. Yeah, we can solve this if it comes up. Just good to see that there is a potential resolution if needed. > >> + # on Py2, we want to do unicode(), on Py3, str(). > >> + # That gets us the decoded, un-wrapped header. > >> + if six.PY2: > >> + header_str = unicode(sane_header) > >> + else: > >> + header_str = str(sane_header) > > > > nit: this looks like something we could use the 'six.u()' function for? > > > > https://pythonhosted.org/six/#six.u > > > I thought this too. Then I looked at the code for django.utils.six: > https://github.com/django/django/blob/master/django/utils/six.py#L646 > > > def u(s): > return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape") > > That won't work because we're passing in a header class to u(). The > header class, as far as I know, doesn't provide .replace(), and even if > it did it's not clear we want to call it. Good spot. Thanks for checking this out. > >> + strings = [('%s: %s' % (key, header.encode())) > > > > How come '.encode' is suitable here, yet we need to call 'unicode'/'str' > > in the 'clean_header' function? > > Because the header class overrides .encode(), __unicode__ and __str__, > and they do different things. > > For headers: > > - str (py3) and unicode (py2) provides a non-line-wrapped version of > the header, in unicode - that is, not in quoted-printable or base64 > 7-bit form. > > - encode provides a line-wrapped, 7-bit encoded form of the header. > > For find_headers, we're creating the headers that are displayed when you > click on the 'show' link on the patch display, under Details, and just > beneath the message id and state. These are supposed to be fully encoded > and line wrapped: they're supposed to look like mail headers. > > For clean_header, we're looking for the human readable form: decoded > From quoted printable or base64, converted from whatever charset they > were in, and not wrapped. > > Hence the code. Admittedly this is not super clear, but I blame the API > of the email.Header module. Ah, I understand you now. I'll stick some of those comments in there. This looks good now so, hence: Reviewed-by: Stephen Finucane <stephen@that.guru> and applied.
> Reviewed-by: Stephen Finucane <stephen@that.guru> > > and applied. Excellent. Please could you help to backport this patch in branch 1.1?
On 08 Oct 16:22, Thomas Monjalon wrote: > > Reviewed-by: Stephen Finucane <stephen@that.guru> > > > > and applied. > > Excellent. Please could you help to backport this patch in branch 1.1? Sure. Done.
2016-10-08 20:33, Stephen Finucane: > On 08 Oct 16:22, Thomas Monjalon wrote: > > > Reviewed-by: Stephen Finucane <stephen@that.guru> > > > > > > and applied. > > > > Excellent. Please could you help to backport this patch in branch 1.1? > > Sure. Done. Perfect, thanks a lot for the version 1.1.2.
diff --git a/patchwork/parser.py b/patchwork/parser.py index 7c05479b33bc..fbc36125c2ec 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -21,7 +21,7 @@ import codecs import datetime -from email.header import Header, decode_header +from email.header import decode_header, make_header from email.utils import parsedate_tz, mktime_tz from fnmatch import fnmatch from functools import reduce @@ -49,19 +49,81 @@ def normalise_space(value): return whitespace_re.sub(' ', value).strip() +def sanitise_header(header_contents, header_name=None): + """Given a header with header_contents, optionally labelled + header_name, decode it with decode_header, sanitise it to make + sure it decodes correctly and contains no invalid characters. + Then encode the result with make_header() + """ + + # We have some Py2/Py3 issues here. + # + # Firstly, the email parser (before we get here) + # Python 3: headers with weird chars are email.header.Header + # class, others as str + # Python 2: every header is an str + # + # Secondly, the behaviour of decode_header: + # Python 3: weird headers are labelled as unknown-8bit + # Python 2: weird headers are not labelled differently + # + # Lastly, aking matters worse, in Python2, unknown-8bit doesn't + # seem to be supported as an input to make_header, so not only do + # we have to detect dodgy headers, we have to fix them ourselves. + # + # We solve this by catching any Unicode errors, and then manually + # handling any interesting headers. + + value = decode_header(header_contents) + try: + header = make_header(value, + header_name=header_name, + continuation_ws='\t') + except UnicodeDecodeError: + # At least one of the parts cannot be encoded as ascii. + # Find out which one and fix it somehow. + # + # We get here under Py2 when there's non-7-bit chars in header, + # or under Py2 or Py3 where decoding with the coding hint fails. + + new_value=[] + for (part, coding) in value: + # We have random bytes that aren't properly coded. + # If we had a coding hint, it failed to help. + if six.PY3: + # python3 - force coding to unknown-8bit + new_value += [(part, 'unknown-8bit')] + else: + # python2 - no support in make_header for unknown-8bit + # We should do unknown-8bit coding ourselves. + # For now, we're just going to replace any dubious + # chars with ?. + # + # TODO: replace it with a proper QP unknown-8bit codec. + new_value += [(part.decode('ascii', errors='replace') + .encode('ascii', errors='replace'), + None)] + + header = make_header(new_value, + header_name=header_name, + continuation_ws='\t') + + return header + + def clean_header(header): """Decode (possibly non-ascii) headers.""" - def decode(fragment): - (frag_str, frag_encoding) = fragment - if frag_encoding: - return frag_str.decode(frag_encoding) - elif isinstance(frag_str, six.binary_type): # python 2 - return frag_str.decode() - return frag_str - fragments = [decode(x) for x in decode_header(header)] + sane_header = sanitise_header(header) - return normalise_space(u' '.join(fragments)) + # on Py2, we want to do unicode(), on Py3, str(). + # That gets us the decoded, un-wrapped header. + if six.PY2: + header_str = unicode(sane_header) + else: + header_str = str(sane_header) + + return normalise_space(header_str) def find_project_by_id(list_id): @@ -154,10 +216,13 @@ def find_date(mail): def find_headers(mail): - return reduce(operator.__concat__, - ['%s: %s\n' % (k, Header(v, header_name=k, - continuation_ws='\t').encode()) - for (k, v) in list(mail.items())]) + headers = [(key, sanitise_header(value, header_name=key)) + for key, value in mail.items()] + + strings = [('%s: %s' % (key, header.encode())) + for (key, header) in headers] + + return '\n'.join(strings) def find_references(mail):
If there is a non-ascii character in a header, parsing fails, even on Py27. This has huge Py2/Py3 complexities. The Py3 email package has tools to handle this - we just need to use them. Py2, on the other hand, needs a lot of hand-holding, as explained in the comments. Additionally, support headers that claim an encoding, but fail to decode with that encoding. This is handy for mails with malformed headers containing weird bytes. Reported-by: Thomas Monjalon <thomas.monjalon@6wind.com> Suggested-by: Stephen Finucane <stephenfinucane@hotmail.com> Signed-off-by: Daniel Axtens <dja@axtens.net> --- Many thanks to Thomas for his help debugging this, and to Stephen for insights leading to a much better patch - twice! This should probably go to a stable branch too. We'll need to start some discussion about how to handle bug fixes for people not running git mainline (like ozlabs.org and kernel.org). --- v2: - refactor to generalise header handling. Thanks Stephen. - support headers that claim an encoding but don't decode This actually leads to slightly simpler patch. --- patchwork/parser.py | 93 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 14 deletions(-)