[v2,3/6] parser: Move mail parsing into another module

Message ID	20161130184247.9913-4-stephen@that.guru
State	Superseded
Headers	show Return-Path: <patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org> From: Stephen Finucane <stephen@that.guru> To: patchwork@lists.ozlabs.org Subject: [PATCH v2 3/6] parser: Move mail parsing into another module Date: Wed, 30 Nov 2016 18:42:44 +0000 Message-Id: <20161130184247.9913-4-stephen@that.guru> In-Reply-To: <20161130184247.9913-1-stephen@that.guru> References: <20161130184247.9913-1-stephen@that.guru> Precedence: list Cc: Tom Rini <trini@konsulko.com>, Paul Jakma <paul@jakma.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org Sender: "Patchwork" <patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>

diff --git a/patchwork/extractor.py b/patchwork/extractor.py new file mode 100644 index 0000000..74fe4ab --- /dev/null +++ b/patchwork/extractor.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python +# +# Patchwork - automated patch tracking system +# Copyright (C) 2008 Jeremy Kerr <jk@ozlabs.org> +# +# This file is part of the Patchwork package. +# +# Patchwork is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# Patchwork is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Patchwork; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +"""Extract comments and (optional) diffs from mbox files.""" + +import codecs +import hashlib +import re + +from django.utils import six + +HUNK_RE = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@') +FILENAME_RE = re.compile(r'^(---|\+\+\+) (\S+)') + + +def parse_patch(content): + """Split a mail's contents into a diff and comment. + + This is a state machine that takes a patch, generally in UNIX mbox + format, and splits it into the component comments and diff. + + Args: + patch: The patch to be split + + Returns: + A tuple containing the diff and comment. Either one or both of + these can be empty. + + Raises: + Exception: The state machine transitioned to an invalid state. + """ + patchbuf = '' + commentbuf = '' + buf = '' + + # state specified the line we just saw, and what to expect next + state = 0 + # 0: text + # 1: suspected patch header (diff, ====, Index:) + # 2: patch header line 1 (---) + # 3: patch header line 2 (+++) + # 4: patch hunk header line (@@ line) + # 5: patch hunk content + # 6: patch meta header (rename from/rename to) + # + # valid transitions: + # 0 -> 1 (diff, ===, Index:) + # 0 -> 2 (---) + # 1 -> 2 (---) + # 2 -> 3 (+++) + # 3 -> 4 (@@ line) + # 4 -> 5 (patch content) + # 5 -> 1 (run out of lines from @@-specifed count) + # 1 -> 6 (rename from / rename to) + # 6 -> 2 (---) + # 6 -> 1 (other text) + # + # Suspected patch header is stored into buf, and appended to + # patchbuf if we find a following hunk. Otherwise, append to + # comment after parsing. + + # line counts while parsing a patch hunk + lc = (0, 0) + hunk = 0 + + for line in content.split('\n'): + line += '\n' + + if state == 0: + if line.startswith('diff ') or line.startswith('===') \ + or line.startswith('Index: '): + state = 1 + buf += line + elif line.startswith('--- '): + state = 2 + buf += line + else: + commentbuf += line + elif state == 1: + buf += line + if line.startswith('--- '): + state = 2 + + if line.startswith(('rename from ', 'rename to ')): + state = 6 + elif state == 2: + if line.startswith('+++ '): + state = 3 + buf += line + elif hunk: + state = 1 + buf += line + else: + state = 0 + commentbuf += buf + line + buf = '' + elif state == 3: + match = HUNK_RE.match(line) + if match: + def fn(x): + if not x: + return 1 + return int(x) + + lc = [fn(x) for x in match.groups()] + + state = 4 + patchbuf += buf + line + buf = '' + elif line.startswith('--- '): + patchbuf += buf + line + buf = '' + state = 2 + elif hunk and line.startswith(r'\ No newline at end of file'): + # If we had a hunk and now we see this, it's part of the patch, + # and we're still expecting another @@ line. + patchbuf += line + elif hunk: + state = 1 + buf += line + else: + state = 0 + commentbuf += buf + line + buf = '' + elif state in [4, 5]: + if line.startswith('-'): + lc[0] -= 1 + elif line.startswith('+'): + lc[1] -= 1 + elif line.startswith(r'\ No newline at end of file'): + # Special case: Not included as part of the hunk's line count + pass + else: + lc[0] -= 1 + lc[1] -= 1 + + patchbuf += line + + if lc[0] <= 0 and lc[1] <= 0: + state = 3 + hunk += 1 + else: + state = 5 + elif state == 6: + if line.startswith(('rename to ', 'rename from ')): + patchbuf += buf + line + buf = '' + elif line.startswith('--- '): + patchbuf += buf + line + buf = '' + state = 2 + else: + buf += line + state = 1 + else: + raise Exception("Unknown state %d! (line '%s')" % (state, line)) + + commentbuf += buf + + if patchbuf == '': + patchbuf = None + + if commentbuf == '': + commentbuf = None + + return patchbuf, commentbuf + + +def clean_content(content): + """Remove cruft from the email message. + + Catch signature (-- ) and list footer (_____) cruft. + """ + sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M) + + content = sig_re.sub('', content) + + return content.strip() + + +def find_content(mail): + """Extract a comment and potential diff from a mail.""" + patchbuf = None + commentbuf = '' + + for part in mail.walk(): + if part.get_content_maintype() != 'text': + continue + + payload = part.get_payload(decode=True) + subtype = part.get_content_subtype() + + if not isinstance(payload, six.text_type): + charset = part.get_content_charset() + + # Check that we have a charset that we understand. Otherwise, + # ignore it and fallback to our standard set. + if charset is not None: + try: + codecs.lookup(charset) + except LookupError: + charset = None + + # If there is no charset or if it is unknown, then try some common + # charsets before we fail. + if charset is None: + try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1'] + else: + try_charsets = [charset] + + for cset in try_charsets: + try: + payload = six.text_type(payload, cset) + break + except UnicodeDecodeError: + payload = None + + # Could not find a valid decoded payload. Fail. + if payload is None: + return None, None + + if subtype in ['x-patch', 'x-diff']: + patchbuf = payload + elif subtype == 'plain': + c = payload + + if not patchbuf: + patchbuf, c = parse_patch(payload) + + if c is not None: + commentbuf += c.strip() + '\n' + + commentbuf = clean_content(commentbuf) + + return patchbuf, commentbuf + + +def hash_diff(diff): + """Generate a hash from a diff.""" + # normalise spaces + diff = diff.replace('\r', '') + diff = diff.strip() + '\n' + + prefixes = ['-', '+', ' '] + hash = hashlib.sha1() + + for line in diff.split('\n'): + if len(line) <= 0: + continue + + hunk_match = HUNK_RE.match(line) + filename_match = FILENAME_RE.match(line) + + if filename_match: + # normalise -p1 top-directories + if filename_match.group(1) == '---': + filename = 'a/' + else: + filename = 'b/' + filename += '/'.join(filename_match.group(2).split('/')[1:]) + + line = filename_match.group(1) + ' ' + filename + elif hunk_match: + # remove line numbers, but leave line counts + def fn(x): + if not x: + return 1 + return int(x) + line_nos = list(map(fn, hunk_match.groups())) + line = '@@ -%d +%d @@' % tuple(line_nos) + elif line[0] in prefixes: + # if we have a +, - or context line, leave as-is + pass + else: + # other lines are ignored + continue + + hash.update((line + '\n').encode('utf-8')) + + return hash diff --git a/patchwork/models.py b/patchwork/models.py index 15a2936..19a5789 100644 --- a/patchwork/models.py +++ b/patchwork/models.py @@ -22,7 +22,6 @@ from __future__ import absolute_import from collections import Counter, OrderedDict import datetime -import hashlib import random import re @@ -34,6 +33,7 @@ from django.db import models from django.utils.encoding import python_2_unicode_compatible from django.utils.functional import cached_property +from patchwork import extractor from patchwork.fields import HashField @@ -366,54 +366,6 @@ class Patch(SeriesMixin, Submission): return counts - @staticmethod - def hash_diff(diff): - """Generate a hash from a diff.""" - hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@') - filename_re = re.compile(r'^(---|\+\+\+) (\S+)') - - # normalise spaces - diff = diff.replace('\r', '') - diff = diff.strip() + '\n' - - prefixes = ['-', '+', ' '] - hash = hashlib.sha1() - - for line in diff.split('\n'): - if len(line) <= 0: - continue - - hunk_match = hunk_re.match(line) - filename_match = filename_re.match(line) - - if filename_match: - # normalise -p1 top-directories - if filename_match.group(1) == '---': - filename = 'a/' - else: - filename = 'b/' - filename += '/'.join(filename_match.group(2).split('/')[1:]) - - line = filename_match.group(1) + ' ' + filename - elif hunk_match: - # remove line numbers, but leave line counts - def fn(x): - if not x: - return 1 - return int(x) - line_nos = list(map(fn, hunk_match.groups())) - line = '@@ -%d +%d @@' % tuple(line_nos) - elif line[0] in prefixes: - # if we have a +, - or context line, leave as-is - pass - else: - # other lines are ignored - continue - - hash.update((line + '\n').encode('utf-8')) - - return hash - def _set_tag(self, tag, count): if count == 0: self.patchtag_set.filter(tag=tag).delete() @@ -441,7 +393,7 @@ class Patch(SeriesMixin, Submission): self.state = get_default_initial_patch_state() if self.hash is None and self.diff is not None: - self.hash = self.hash_diff(self.diff).hexdigest() + self.hash = extractor.hash_diff(self.diff).hexdigest() super(Patch, self).save(**kwargs) diff --git a/patchwork/parser.py b/patchwork/parser.py index 9d1b79e..3c2b89b 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python -# # Patchwork - automated patch tracking system # Copyright (C) 2008 Jeremy Kerr <jk@ozlabs.org> # @@ -19,7 +17,8 @@ # along with Patchwork; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import codecs +"""Parse mbox files and add to Patchwork.""" + import datetime from email.header import decode_header from email.header import make_header @@ -32,6 +31,8 @@ import re from django.contrib.auth.models import User from django.utils import six +from patchwork.extractor import find_content +from patchwork.extractor import FILENAME_RE from patchwork.models import Comment from patchwork.models import CoverLetter from patchwork.models import DelegationRule @@ -46,8 +47,6 @@ from patchwork.models import State from patchwork.models import Submission -_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@') -_filename_re = re.compile(r'^(---|\+\+\+) (\S+)') list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list'] logger = logging.getLogger(__name__) @@ -342,63 +341,6 @@ def parse_version(subject, subject_prefixes): return 1 -def find_content(mail): - """Extract a comment and potential diff from a mail.""" - patchbuf = None - commentbuf = '' - - for part in mail.walk(): - if part.get_content_maintype() != 'text': - continue - - payload = part.get_payload(decode=True) - subtype = part.get_content_subtype() - - if not isinstance(payload, six.text_type): - charset = part.get_content_charset() - - # Check that we have a charset that we understand. Otherwise, - # ignore it and fallback to our standard set. - if charset is not None: - try: - codecs.lookup(charset) - except LookupError: - charset = None - - # If there is no charset or if it is unknown, then try some common - # charsets before we fail. - if charset is None: - try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1'] - else: - try_charsets = [charset] - - for cset in try_charsets: - try: - payload = six.text_type(payload, cset) - break - except UnicodeDecodeError: - payload = None - - # Could not find a valid decoded payload. Fail. - if payload is None: - return None, None - - if subtype in ['x-patch', 'x-diff']: - patchbuf = payload - elif subtype == 'plain': - c = payload - - if not patchbuf: - patchbuf, c = parse_patch(payload) - - if c is not None: - commentbuf += c.strip() + '\n' - - commentbuf = clean_content(commentbuf) - - return patchbuf, commentbuf - - def find_submission_for_comment(project, refs): for ref in refs: # first, check for a direct reply @@ -502,170 +444,6 @@ def subject_check(subject): return comment_re.match(clean_header(subject)) -def clean_content(content): - """Remove cruft from the email message. - - Catch signature (-- ) and list footer (_____) cruft. - """ - sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M) - content = sig_re.sub('', content) - - return content.strip() - - -def parse_patch(content): - """Split a mail's contents into a diff and comment. - - This is a state machine that takes a patch, generally in UNIX mbox - format, and splits it into the component comments and diff. - - Args: - patch: The patch to be split - - Returns: - A tuple containing the diff and comment. Either one or both of - these can be empty. - - Raises: - Exception: The state machine transitioned to an invalid state. - """ - patchbuf = '' - commentbuf = '' - buf = '' - - # state specified the line we just saw, and what to expect next - state = 0 - # 0: text - # 1: suspected patch header (diff, ====, Index:) - # 2: patch header line 1 (---) - # 3: patch header line 2 (+++) - # 4: patch hunk header line (@@ line) - # 5: patch hunk content - # 6: patch meta header (rename from/rename to) - # - # valid transitions: - # 0 -> 1 (diff, ===, Index:) - # 0 -> 2 (---) - # 1 -> 2 (---) - # 2 -> 3 (+++) - # 3 -> 4 (@@ line) - # 4 -> 5 (patch content) - # 5 -> 1 (run out of lines from @@-specifed count) - # 1 -> 6 (rename from / rename to) - # 6 -> 2 (---) - # 6 -> 1 (other text) - # - # Suspected patch header is stored into buf, and appended to - # patchbuf if we find a following hunk. Otherwise, append to - # comment after parsing. - - # line counts while parsing a patch hunk - lc = (0, 0) - hunk = 0 - - for line in content.split('\n'): - line += '\n' - - if state == 0: - if line.startswith('diff ') or line.startswith('===') \ - or line.startswith('Index: '): - state = 1 - buf += line - elif line.startswith('--- '): - state = 2 - buf += line - else: - commentbuf += line - elif state == 1: - buf += line - if line.startswith('--- '): - state = 2 - - if line.startswith(('rename from ', 'rename to ')): - state = 6 - elif state == 2: - if line.startswith('+++ '): - state = 3 - buf += line - elif hunk: - state = 1 - buf += line - else: - state = 0 - commentbuf += buf + line - buf = '' - elif state == 3: - match = _hunk_re.match(line) - if match: - def fn(x): - if not x: - return 1 - return int(x) - - lc = [fn(x) for x in match.groups()] - - state = 4 - patchbuf += buf + line - buf = '' - elif line.startswith('--- '): - patchbuf += buf + line - buf = '' - state = 2 - elif hunk and line.startswith(r'\ No newline at end of file'): - # If we had a hunk and now we see this, it's part of the patch, - # and we're still expecting another @@ line. - patchbuf += line - elif hunk: - state = 1 - buf += line - else: - state = 0 - commentbuf += buf + line - buf = '' - elif state in [4, 5]: - if line.startswith('-'): - lc[0] -= 1 - elif line.startswith('+'): - lc[1] -= 1 - elif line.startswith(r'\ No newline at end of file'): - # Special case: Not included as part of the hunk's line count - pass - else: - lc[0] -= 1 - lc[1] -= 1 - - patchbuf += line - - if lc[0] <= 0 and lc[1] <= 0: - state = 3 - hunk += 1 - else: - state = 5 - elif state == 6: - if line.startswith(('rename to ', 'rename from ')): - patchbuf += buf + line - buf = '' - elif line.startswith('--- '): - patchbuf += buf + line - buf = '' - state = 2 - else: - buf += line - state = 1 - else: - raise Exception("Unknown state %d! (line '%s')" % (state, line)) - - commentbuf += buf - - if patchbuf == '': - patchbuf = None - - if commentbuf == '': - commentbuf = None - - return patchbuf, commentbuf - - def parse_pull_request(content): git_re = re.compile(r'^The following changes since commit.*' + r'^are available in the git repository at:\n' @@ -940,7 +718,7 @@ def find_filenames(diff): if len(line) <= 0: continue - filename_match = _filename_re.match(line) + filename_match = FILENAME_RE.match(line) if not filename_match: continue diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py index e4a379d..00891f9 100644 --- a/patchwork/tests/test_parser.py +++ b/patchwork/tests/test_parser.py @@ -27,13 +27,13 @@ import os from django.test import TestCase from django.utils import six +from patchwork.extractor import find_content from patchwork.models import Comment from patchwork.models import Patch from patchwork.models import Person from patchwork.models import State from patchwork.parser import clean_subject from patchwork.parser import find_author -from patchwork.parser import find_content from patchwork.parser import find_project_by_header from patchwork.parser import find_series from patchwork.parser import parse_mail as _parse_mail

[v2,3/6] parser: Move mail parsing into another module

Commit Message

Patch