[04/11] parsemail: Move parsing to 'parser'

Message ID	1468402860-3409-5-git-send-email-stephen.finucane@intel.com
State	Superseded
Headers	show Return-Path: <patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org> From: Stephen Finucane <stephen.finucane@intel.com> To: patchwork@lists.ozlabs.org Subject: [PATCH 04/11] parsemail: Move parsing to 'parser' Date: Wed, 13 Jul 2016 10:40:53 +0100 Message-Id: <1468402860-3409-5-git-send-email-stephen.finucane@intel.com> In-Reply-To: <1468402860-3409-1-git-send-email-stephen.finucane@intel.com> References: <1468402860-3409-1-git-send-email-stephen.finucane@intel.com> Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org Sender: "Patchwork" <patchwork-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>

diff --git a/patchwork/bin/parsearchive.py b/patchwork/bin/parsearchive.py index 30bca13..8986b22 100755 --- a/patchwork/bin/parsearchive.py +++ b/patchwork/bin/parsearchive.py @@ -29,7 +29,7 @@ import mailbox import django -from patchwork.bin import parsemail +from patchwork.parser import parse_mail from patchwork import models LOGGER = logging.getLogger(__name__) @@ -55,7 +55,7 @@ def parse_mbox(path, list_id): mbox = mailbox.mbox(path) for msg in mbox: try: - obj = parsemail.parse_mail(msg, list_id) + obj = parse_mail(msg, list_id) if obj: results[type(obj)] += 1 else: diff --git a/patchwork/bin/parsemail.py b/patchwork/bin/parsemail.py index 56cd126..abcee04 100755 --- a/patchwork/bin/parsemail.py +++ b/patchwork/bin/parsemail.py @@ -22,29 +22,15 @@ from __future__ import absolute_import import argparse -import codecs -import datetime from email import message_from_file -from email.header import Header, decode_header -from email.utils import parsedate_tz, mktime_tz -from fnmatch import fnmatch -from functools import reduce import logging -import operator -import re import sys import django from django.conf import settings -from django.contrib.auth.models import User from django.utils.log import AdminEmailHandler -from django.utils import six -from django.utils.six.moves import map -from patchwork.models import (Patch, Project, Person, Comment, State, - DelegationRule, Submission, CoverLetter, - get_default_initial_patch_state) -from patchwork.parser import parse_patch, find_filenames +from patchwork.parser import parse_mail LOGGER = logging.getLogger(__name__) @@ -56,513 +42,6 @@ VERBOSITY_LEVELS = { 'critical': logging.CRITICAL } -list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list'] - - -def normalise_space(str): - whitespace_re = re.compile(r'\s+') - return whitespace_re.sub(' ', str).strip() - - -def clean_header(header): - """Decode (possibly non-ascii) headers.""" - def decode(fragment): - (frag_str, frag_encoding) = fragment - if frag_encoding: - 
return frag_str.decode(frag_encoding) - elif isinstance(frag_str, six.binary_type): # python 2 - return frag_str.decode() - return frag_str - - fragments = list(map(decode, decode_header(header))) - - return normalise_space(u' '.join(fragments)) - - -def find_project_by_id(list_id): - """Find a `project` object with given `list_id`.""" - project = None - try: - project = Project.objects.get(listid=list_id) - except Project.DoesNotExist: - pass - return project - - -def find_project_by_header(mail): - project = None - listid_res = [re.compile(r'.*<([^>]+)>.*', re.S), - re.compile(r'^([\S]+)$', re.S)] - - for header in list_id_headers: - if header in mail: - - for listid_re in listid_res: - match = listid_re.match(mail.get(header)) - if match: - break - - if not match: - continue - - listid = match.group(1) - - project = find_project_by_id(listid) - if project: - break - - return project - - -def find_author(mail): - - from_header = clean_header(mail.get('From')) - name, email = (None, None) - - # tuple of (regex, fn) - # - where fn returns a (name, email) tuple from the match groups resulting - # from re.match().groups() - from_res = [ - # for "Firstname Lastname" <example@example.com> style addresses - (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))), - - # for example@example.com (Firstname Lastname) style addresses - (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))), - - # for example at example.com (Firstname Lastname) style addresses - (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'), - (lambda g: (g[2], '@'.join(g[0:2])))), - - # everything else - (re.compile(r'(.*)'), (lambda g: (None, g[0]))), - ] - - for regex, fn in from_res: - match = regex.match(from_header) - if match: - (name, email) = fn(match.groups()) - break - - if email is None: - raise ValueError("Invalid 'From' header") - - email = email.strip() - if name is not None: - name = name.strip() - - try: - person = Person.objects.get(email__iexact=email) - if name: # use 
the latest provided name - person.name = name - except Person.DoesNotExist: - person = Person(name=name, email=email) - - return person - - -def find_date(mail): - t = parsedate_tz(mail.get('Date', '')) - if not t: - return datetime.datetime.utcnow() - return datetime.datetime.utcfromtimestamp(mktime_tz(t)) - - -def find_headers(mail): - return reduce(operator.__concat__, - ['%s: %s\n' % (k, Header(v, header_name=k, - continuation_ws='\t').encode()) - for (k, v) in list(mail.items())]) - - -def find_pull_request(content): - git_re = re.compile(r'^The following changes since commit.*' + - r'^are available in the git repository at:\n' - r'^\s*([\S]+://[^\n]+)$', - re.DOTALL | re.MULTILINE) - match = git_re.search(content) - if match: - return match.group(1) - return None - - -def find_references(mail): - """Construct a list of possible reply message ids.""" - refs = [] - - if 'In-Reply-To' in mail: - refs.append(mail.get('In-Reply-To')) - - if 'References' in mail: - rs = mail.get('References').split() - rs.reverse() - for r in rs: - if r not in refs: - refs.append(r) - - return refs - - -def parse_series_marker(subject_prefixes): - """Extract series markers from subject. - - Extract the markers of multi-patches series, i.e. 'x/n', from the - provided subject series. 
- - Args: - subject_prefixes: List of subject prefixes to extract markers - from - - Returns: - (x, n) if markers found, else (None, None) - """ - - regex = re.compile('^([0-9]+)/([0-9]+)$') - for prefix in subject_prefixes: - m = regex.match(prefix) - if not m: - continue - return (int(m.group(1)), int(m.group(2))) - return (None, None) - - -def find_content(project, mail): - patchbuf = None - commentbuf = '' - - for part in mail.walk(): - if part.get_content_maintype() != 'text': - continue - - payload = part.get_payload(decode=True) - subtype = part.get_content_subtype() - - if not isinstance(payload, six.text_type): - charset = part.get_content_charset() - - # Check that we have a charset that we understand. Otherwise, - # ignore it and fallback to our standard set. - if charset is not None: - try: - codecs.lookup(charset) - except LookupError: - charset = None - - # If there is no charset or if it is unknown, then try some common - # charsets before we fail. - if charset is None: - try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1'] - else: - try_charsets = [charset] - - for cset in try_charsets: - try: - payload = six.text_type(payload, cset) - break - except UnicodeDecodeError: - payload = None - - # Could not find a valid decoded payload. Fail. 
- if payload is None: - return None, None - - if subtype in ['x-patch', 'x-diff']: - patchbuf = payload - elif subtype == 'plain': - c = payload - - if not patchbuf: - patchbuf, c = parse_patch(payload) - - if c is not None: - commentbuf += c.strip() + '\n' - - commentbuf = clean_content(commentbuf) - - return patchbuf, commentbuf - - -def find_submission_for_comment(project, refs): - for ref in refs: - # first, check for a direct reply - try: - submission = Submission.objects.get(project=project, msgid=ref) - return submission - except Submission.DoesNotExist: - pass - - # see if we have comments that refer to a patch - try: - comment = Comment.objects.get(submission__project=project, - msgid=ref) - return comment.submission - except Comment.MultipleObjectsReturned: - # NOTE(stephenfin): This is a artifact of prior lack of support - # for cover letters in Patchwork. Previously all replies to - # patches were saved as comments. However, it's possible that - # someone could have created a new series as a reply to one of the - # comments on the original patch series. For example, - # '2015-November/002096.html' from the Patchwork archives. In this - # case, reparsing the archives will result in creation of a cover - # letter with the same message ID as the existing comment. Follow - # up comments will then apply to both this cover letter and the - # linked patch from the comment previously created. We choose to - # apply the comment to the cover letter. Note that this only - # happens when running 'parsearchive' or similar, so it should not - # affect every day use in any way. 
- comments = Comment.objects.filter(submission__project=project, - msgid=ref) - # The latter item will be the cover letter - return comments.reverse()[0].submission - except Comment.DoesNotExist: - pass - - return None - - -def split_prefixes(prefix): - """Turn a prefix string into a list of prefix tokens.""" - split_re = re.compile(r'[,\s]+') - matches = split_re.split(prefix) - - return [s for s in matches if s != ''] - - -def clean_subject(subject, drop_prefixes=None): - """Clean a Subject: header from an incoming patch. - - Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By - default, only [PATCH] is removed, and we keep any other bracketed - data in the subject. If drop_prefixes is provided, remove those - too, comparing case-insensitively. - - Args: - subject: Subject to be cleaned - drop_prefixes: Additional, case-insensitive prefixes to remove - from the subject - """ - re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I) - prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$') - subject = clean_header(subject) - - if drop_prefixes is None: - drop_prefixes = [] - else: - drop_prefixes = [s.lower() for s in drop_prefixes] - - drop_prefixes.append('patch') - - # remove Re:, Fwd:, etc - subject = re_re.sub(' ', subject) - - subject = normalise_space(subject) - - prefixes = [] - - match = prefix_re.match(subject) - - while match: - prefix_str = match.group(1) - prefixes += [p for p in split_prefixes(prefix_str) - if p.lower() not in drop_prefixes] - - subject = match.group(2) - match = prefix_re.match(subject) - - subject = normalise_space(subject) - - subject = subject.strip() - if prefixes: - subject = '[%s] %s' % (','.join(prefixes), subject) - - return (subject, prefixes) - - -def clean_content(content): - """Remove cruft from the email message. - - Catch signature (-- ) and list footer (_____) cruft. 
- """ - sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M) - content = sig_re.sub('', content) - - return content.strip() - - -def find_state(mail): - """Return the state with the given name or the default.""" - state_name = mail.get('X-Patchwork-State', '').strip() - if state_name: - try: - return State.objects.get(name__iexact=state_name) - except State.DoesNotExist: - pass - return get_default_initial_patch_state() - - -def auto_delegate(project, filenames): - if not filenames: - return None - - rules = list(DelegationRule.objects.filter(project=project)) - - patch_delegate = None - - for filename in filenames: - file_delegate = None - for rule in rules: - if fnmatch(filename, rule.path): - file_delegate = rule.user - break - - if file_delegate is None: - return None - - if patch_delegate is not None and file_delegate != patch_delegate: - return None - - patch_delegate = file_delegate - - return patch_delegate - - -def find_delegate(mail): - """Return the delegate with the given email or None.""" - delegate_email = mail.get('X-Patchwork-Delegate', '').strip() - if delegate_email: - try: - return User.objects.get(email__iexact=delegate_email) - except User.DoesNotExist: - pass - return None - - -def parse_mail(mail, list_id=None): - """Parse a mail and add to the database. - - Args: - mail (`mbox.Mail`): Mail to parse and add. 
- list_id (str): Mailing list ID - - Returns: - None - """ - # some basic sanity checks - if 'From' not in mail: - raise ValueError("Missing 'From' header") - - if 'Subject' not in mail: - raise ValueError("Missing 'Subject' header") - - if 'Message-Id' not in mail: - raise ValueError("Missing 'Message-Id' header") - - hint = mail.get('X-Patchwork-Hint', '').lower() - if hint == 'ignore': - LOGGER.debug("Ignoring email due to 'ignore' hint") - return - - if list_id: - project = find_project_by_id(list_id) - else: - project = find_project_by_header(mail) - - if project is None: - LOGGER.error('Failed to find a project for email') - return - - # parse content - - diff, message = find_content(project, mail) - - if not (diff or message): - return # nothing to work with - - msgid = mail.get('Message-Id').strip() - author = find_author(mail) - name, prefixes = clean_subject(mail.get('Subject'), [project.linkname]) - x, n = parse_series_marker(prefixes) - refs = find_references(mail) - date = find_date(mail) - headers = find_headers(mail) - pull_url = find_pull_request(message) - - # build objects - - if diff or pull_url: # patches or pull requests - # we delay the saving until we know we have a patch. - author.save() - - delegate = find_delegate(mail) - if not delegate and diff: - filenames = find_filenames(diff) - delegate = auto_delegate(project, filenames) - - patch = Patch( - msgid=msgid, - project=project, - name=name, - date=date, - headers=headers, - submitter=author, - content=message, - diff=diff, - pull_url=pull_url, - delegate=delegate, - state=find_state(mail)) - patch.save() - LOGGER.debug('Patch saved') - - return patch - elif x == 0: # (potential) cover letters - # if refs are empty, it's implicitly a cover letter. 
If not, - # however, we need to see if a match already exists and, if - # not, assume that it is indeed a new cover letter - is_cover_letter = False - if not refs == []: - try: - CoverLetter.objects.all().get(name=name) - except CoverLetter.DoesNotExist: # no match => new cover - is_cover_letter = True - else: - is_cover_letter = True - - if is_cover_letter: - author.save() - - cover_letter = CoverLetter( - msgid=msgid, - project=project, - name=name, - date=date, - headers=headers, - submitter=author, - content=message) - cover_letter.save() - LOGGER.debug('Cover letter saved') - - return cover_letter - - # comments - - # we only save comments if we have the parent email - submission = find_submission_for_comment(project, refs) - if not submission: - return - - author.save() - - comment = Comment( - submission=submission, - msgid=msgid, - date=date, - headers=headers, - submitter=author, - content=message) - comment.save() - LOGGER.debug('Comment saved') - - return comment - extra_error_message = ''' == Mail diff --git a/patchwork/parser.py b/patchwork/parser.py index c9c058d..938b965 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -19,13 +19,359 @@ # along with Patchwork; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import codecs +import datetime +from email.header import Header, decode_header +from email.utils import parsedate_tz, mktime_tz +from fnmatch import fnmatch +from functools import reduce +import logging +import operator import re +from django.contrib.auth.models import User +from django.utils import six from django.utils.six.moves import map +from patchwork.models import (Patch, Project, Person, Comment, State, + DelegationRule, Submission, CoverLetter, + get_default_initial_patch_state) -_hunk_re = re.compile('^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@') -_filename_re = re.compile('^(---|\+\+\+) (\S+)') + +_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? 
\@\@') +_filename_re = re.compile(r'^(---|\+\+\+) (\S+)') +list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list'] + +LOGGER = logging.getLogger(__name__) + + +def normalise_space(str): + whitespace_re = re.compile(r'\s+') + return whitespace_re.sub(' ', str).strip() + + +def clean_header(header): + """Decode (possibly non-ascii) headers.""" + def decode(fragment): + (frag_str, frag_encoding) = fragment + if frag_encoding: + return frag_str.decode(frag_encoding) + elif isinstance(frag_str, six.binary_type): # python 2 + return frag_str.decode() + return frag_str + + fragments = list(map(decode, decode_header(header))) + + return normalise_space(u' '.join(fragments)) + + +def find_project_by_id(list_id): + """Find a `project` object with given `list_id`.""" + project = None + try: + project = Project.objects.get(listid=list_id) + except Project.DoesNotExist: + pass + return project + + +def find_project_by_header(mail): + project = None + listid_res = [re.compile(r'.*<([^>]+)>.*', re.S), + re.compile(r'^([\S]+)$', re.S)] + + for header in list_id_headers: + if header in mail: + + for listid_re in listid_res: + match = listid_re.match(mail.get(header)) + if match: + break + + if not match: + continue + + listid = match.group(1) + + project = find_project_by_id(listid) + if project: + break + + return project + + +def find_author(mail): + from_header = clean_header(mail.get('From')) + name, email = (None, None) + + # tuple of (regex, fn) + # - where fn returns a (name, email) tuple from the match groups resulting + # from re.match().groups() + from_res = [ + # for "Firstname Lastname" <example@example.com> style addresses + (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))), + + # for example@example.com (Firstname Lastname) style addresses + (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))), + + # for example at example.com (Firstname Lastname) style addresses + (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'), + (lambda g: (g[2], 
'@'.join(g[0:2])))), + + # everything else + (re.compile(r'(.*)'), (lambda g: (None, g[0]))), + ] + + for regex, fn in from_res: + match = regex.match(from_header) + if match: + (name, email) = fn(match.groups()) + break + + if email is None: + raise ValueError("Invalid 'From' header") + + email = email.strip() + if name is not None: + name = name.strip() + + try: + person = Person.objects.get(email__iexact=email) + if name: # use the latest provided name + person.name = name + except Person.DoesNotExist: + person = Person(name=name, email=email) + + return person + + +def find_date(mail): + t = parsedate_tz(mail.get('Date', '')) + if not t: + return datetime.datetime.utcnow() + return datetime.datetime.utcfromtimestamp(mktime_tz(t)) + + +def find_headers(mail): + return reduce(operator.__concat__, + ['%s: %s\n' % (k, Header(v, header_name=k, + continuation_ws='\t').encode()) + for (k, v) in list(mail.items())]) + + +def find_pull_request(content): + git_re = re.compile(r'^The following changes since commit.*' + + r'^are available in the git repository at:\n' + r'^\s*([\S]+://[^\n]+)$', + re.DOTALL | re.MULTILINE) + match = git_re.search(content) + if match: + return match.group(1) + return None + + +def find_references(mail): + """Construct a list of possible reply message ids.""" + refs = [] + + if 'In-Reply-To' in mail: + refs.append(mail.get('In-Reply-To')) + + if 'References' in mail: + rs = mail.get('References').split() + rs.reverse() + for r in rs: + if r not in refs: + refs.append(r) + + return refs + + +def parse_series_marker(subject_prefixes): + """Extract series markers from subject. + + Extract the markers of multi-patches series, i.e. 'x/n', from the + provided subject series. 
+ + Args: + subject_prefixes: List of subject prefixes to extract markers + from + + Returns: + (x, n) if markers found, else (None, None) + """ + + regex = re.compile('^([0-9]+)/([0-9]+)$') + for prefix in subject_prefixes: + m = regex.match(prefix) + if not m: + continue + return (int(m.group(1)), int(m.group(2))) + return (None, None) + + +def find_content(project, mail): + """Extract a comment and potential diff from a mail.""" + patchbuf = None + commentbuf = '' + + for part in mail.walk(): + if part.get_content_maintype() != 'text': + continue + + payload = part.get_payload(decode=True) + subtype = part.get_content_subtype() + + if not isinstance(payload, six.text_type): + charset = part.get_content_charset() + + # Check that we have a charset that we understand. Otherwise, + # ignore it and fallback to our standard set. + if charset is not None: + try: + codecs.lookup(charset) + except LookupError: + charset = None + + # If there is no charset or if it is unknown, then try some common + # charsets before we fail. + if charset is None: + try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1'] + else: + try_charsets = [charset] + + for cset in try_charsets: + try: + payload = six.text_type(payload, cset) + break + except UnicodeDecodeError: + payload = None + + # Could not find a valid decoded payload. Fail. 
+ if payload is None: + return None, None + + if subtype in ['x-patch', 'x-diff']: + patchbuf = payload + elif subtype == 'plain': + c = payload + + if not patchbuf: + patchbuf, c = parse_patch(payload) + + if c is not None: + commentbuf += c.strip() + '\n' + + commentbuf = clean_content(commentbuf) + + return patchbuf, commentbuf + + +def find_submission_for_comment(project, refs): + for ref in refs: + # first, check for a direct reply + try: + submission = Submission.objects.get(project=project, msgid=ref) + return submission + except Submission.DoesNotExist: + pass + + # see if we have comments that refer to a patch + try: + comment = Comment.objects.get(submission__project=project, + msgid=ref) + return comment.submission + except Comment.MultipleObjectsReturned: + # NOTE(stephenfin): This is a artifact of prior lack of support + # for cover letters in Patchwork. Previously all replies to + # patches were saved as comments. However, it's possible that + # someone could have created a new series as a reply to one of the + # comments on the original patch series. For example, + # '2015-November/002096.html' from the Patchwork archives. In this + # case, reparsing the archives will result in creation of a cover + # letter with the same message ID as the existing comment. Follow + # up comments will then apply to both this cover letter and the + # linked patch from the comment previously created. We choose to + # apply the comment to the cover letter. Note that this only + # happens when running 'parsearchive' or similar, so it should not + # affect every day use in any way. 
+ comments = Comment.objects.filter(submission__project=project, + msgid=ref) + # The latter item will be the cover letter + return comments.reverse()[0].submission + except Comment.DoesNotExist: + pass + + return None + + +def split_prefixes(prefix): + """Turn a prefix string into a list of prefix tokens.""" + split_re = re.compile(r'[,\s]+') + matches = split_re.split(prefix) + + return [s for s in matches if s != ''] + + +def clean_subject(subject, drop_prefixes=None): + """Clean a Subject: header from an incoming patch. + + Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By + default, only [PATCH] is removed, and we keep any other bracketed + data in the subject. If drop_prefixes is provided, remove those + too, comparing case-insensitively. + + Args: + subject: Subject to be cleaned + drop_prefixes: Additional, case-insensitive prefixes to remove + from the subject + """ + re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I) + prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$') + subject = clean_header(subject) + + if drop_prefixes is None: + drop_prefixes = [] + else: + drop_prefixes = [s.lower() for s in drop_prefixes] + + drop_prefixes.append('patch') + + # remove Re:, Fwd:, etc + subject = re_re.sub(' ', subject) + + subject = normalise_space(subject) + + prefixes = [] + + match = prefix_re.match(subject) + + while match: + prefix_str = match.group(1) + prefixes += [p for p in split_prefixes(prefix_str) + if p.lower() not in drop_prefixes] + + subject = match.group(2) + match = prefix_re.match(subject) + + subject = normalise_space(subject) + + subject = subject.strip() + if prefixes: + subject = '[%s] %s' % (','.join(prefixes), subject) + + return (subject, prefixes) + + +def clean_content(content): + """Remove cruft from the email message. + + Catch signature (-- ) and list footer (_____) cruft. 
+ """ + sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M) + content = sig_re.sub('', content) + + return content.strip() def parse_patch(content): @@ -181,6 +527,182 @@ def parse_patch(content): return patchbuf, commentbuf +def find_state(mail): + """Return the state with the given name or the default.""" + state_name = mail.get('X-Patchwork-State', '').strip() + if state_name: + try: + return State.objects.get(name__iexact=state_name) + except State.DoesNotExist: + pass + return get_default_initial_patch_state() + + +def auto_delegate(project, filenames): + if not filenames: + return None + + rules = list(DelegationRule.objects.filter(project=project)) + + patch_delegate = None + + for filename in filenames: + file_delegate = None + for rule in rules: + if fnmatch(filename, rule.path): + file_delegate = rule.user + break + + if file_delegate is None: + return None + + if patch_delegate is not None and file_delegate != patch_delegate: + return None + + patch_delegate = file_delegate + + return patch_delegate + + +def find_delegate(mail): + """Return the delegate with the given email or None.""" + delegate_email = mail.get('X-Patchwork-Delegate', '').strip() + if delegate_email: + try: + return User.objects.get(email__iexact=delegate_email) + except User.DoesNotExist: + pass + return None + + +def parse_mail(mail, list_id=None): + """Parse a mail and add to the database. + + Args: + mail (`mbox.Mail`): Mail to parse and add. 
+ list_id (str): Mailing list ID + + Returns: + None + """ + # some basic sanity checks + if 'From' not in mail: + raise ValueError("Missing 'From' header") + + if 'Subject' not in mail: + raise ValueError("Missing 'Subject' header") + + if 'Message-Id' not in mail: + raise ValueError("Missing 'Message-Id' header") + + hint = mail.get('X-Patchwork-Hint', '').lower() + if hint == 'ignore': + LOGGER.debug("Ignoring email due to 'ignore' hint") + return + + if list_id: + project = find_project_by_id(list_id) + else: + project = find_project_by_header(mail) + + if project is None: + LOGGER.error('Failed to find a project for email') + return + + # parse content + + diff, message = find_content(project, mail) + + if not (diff or message): + return # nothing to work with + + msgid = mail.get('Message-Id').strip() + author = find_author(mail) + name, prefixes = clean_subject(mail.get('Subject'), [project.linkname]) + x, n = parse_series_marker(prefixes) + refs = find_references(mail) + date = find_date(mail) + headers = find_headers(mail) + pull_url = find_pull_request(message) + + # build objects + + if diff or pull_url: # patches or pull requests + # we delay the saving until we know we have a patch. + author.save() + + delegate = find_delegate(mail) + if not delegate and diff: + filenames = find_filenames(diff) + delegate = auto_delegate(project, filenames) + + patch = Patch( + msgid=msgid, + project=project, + name=name, + date=date, + headers=headers, + submitter=author, + content=message, + diff=diff, + pull_url=pull_url, + delegate=delegate, + state=find_state(mail)) + patch.save() + LOGGER.debug('Patch saved') + + return patch + elif x == 0: # (potential) cover letters + # if refs are empty, it's implicitly a cover letter. 
If not, + # however, we need to see if a match already exists and, if + # not, assume that it is indeed a new cover letter + is_cover_letter = False + if not refs == []: + try: + CoverLetter.objects.all().get(name=name) + except CoverLetter.DoesNotExist: # no match => new cover + is_cover_letter = True + else: + is_cover_letter = True + + if is_cover_letter: + author.save() + + cover_letter = CoverLetter( + msgid=msgid, + project=project, + name=name, + date=date, + headers=headers, + submitter=author, + content=message) + cover_letter.save() + LOGGER.debug('Cover letter saved') + + return cover_letter + + # comments + + # we only save comments if we have the parent email + submission = find_submission_for_comment(project, refs) + if not submission: + return + + author.save() + + comment = Comment( + submission=submission, + msgid=msgid, + date=date, + headers=headers, + submitter=author, + content=message) + comment.save() + LOGGER.debug('Comment saved') + + return comment + + def find_filenames(diff): """Find files changes in a given diff.""" # normalise spaces diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py index eca05a0..684a667 100644 --- a/patchwork/tests/test_parser.py +++ b/patchwork/tests/test_parser.py @@ -26,18 +26,19 @@ import os from django.test import TestCase -from patchwork.bin.parsemail import clean_subject -from patchwork.bin.parsemail import find_author -from patchwork.bin.parsemail import find_content -from patchwork.bin.parsemail import find_project_by_header -from patchwork.bin.parsemail import find_pull_request -from patchwork.bin.parsemail import parse_mail as _parse_mail -from patchwork.bin.parsemail import parse_series_marker -from patchwork.bin.parsemail import split_prefixes from patchwork.models import Comment from patchwork.models import Patch from patchwork.models import Person from patchwork.models import State +from patchwork.parser import clean_subject +from patchwork.parser import find_author +from 
patchwork.parser import find_content +from patchwork.parser import find_project_by_header +from patchwork.parser import find_pull_request +from patchwork.parser import parse_mail as _parse_mail +from patchwork.parser import parse_series_marker +from patchwork.parser import split_prefixes +from patchwork.tests.utils import create_email from patchwork.tests.utils import create_project from patchwork.tests.utils import create_state from patchwork.tests.utils import create_user

[04/11] parsemail: Move parsing to 'parser'

Commit Message

Comments

Patch