diff mbox

[04/11] parsemail: Move parsing to 'parser'

Message ID 1468402860-3409-5-git-send-email-stephen.finucane@intel.com
State Superseded
Headers show

Commit Message

Stephen Finucane July 13, 2016, 9:40 a.m. UTC
Separate the parsing of mails from the CLI aspects of parsemail. Since
there is already a 'parser' module, it makes sense to place the parsing
functionality here. This will allow for additional uses of this parsing
functionality in the future.

Signed-off-by: Stephen Finucane <stephen.finucane@intel.com>
 patchwork/bin/parsearchive.py  |    4 +-
 patchwork/bin/parsemail.py     |  523 +---------------------------------------
 patchwork/parser.py            |  526 +++++++++++++++++++++++++++++++++++++++-
 patchwork/tests/test_parser.py |   17 +-
 4 files changed, 536 insertions(+), 534 deletions(-)


Andy Doan July 19, 2016, 9:33 p.m. UTC | #1
On 07/13/2016 04:40 AM, Stephen Finucane wrote:
> Separate the parsing of mails from the CLI aspects of parsemail. Since
> there is already a 'parser' module, it makes sense to place the parsing
> functionality here. This will allow for additional uses of this parsing
> functionality in the future.
> Signed-off-by: Stephen Finucane <stephen.finucane@intel.com>

It's going to break some code my deployments have (we pull mail via IMAP 
and then feed it to the parsemail function), but this is the right thing 
to do.

Reviewed-by: Andy Doan <andy.doan@linaro.org>
diff mbox


diff --git a/patchwork/bin/parsearchive.py b/patchwork/bin/parsearchive.py
index 30bca13..8986b22 100755
--- a/patchwork/bin/parsearchive.py
+++ b/patchwork/bin/parsearchive.py
@@ -29,7 +29,7 @@  import mailbox
 import django
-from patchwork.bin import parsemail
+from patchwork.parser import parse_mail
 from patchwork import models
 LOGGER = logging.getLogger(__name__)
@@ -55,7 +55,7 @@  def parse_mbox(path, list_id):
     mbox = mailbox.mbox(path)
     for msg in mbox:
-            obj = parsemail.parse_mail(msg, list_id)
+            obj = parse_mail(msg, list_id)
             if obj:
                 results[type(obj)] += 1
diff --git a/patchwork/bin/parsemail.py b/patchwork/bin/parsemail.py
index 56cd126..abcee04 100755
--- a/patchwork/bin/parsemail.py
+++ b/patchwork/bin/parsemail.py
@@ -22,29 +22,15 @@ 
 from __future__ import absolute_import
 import argparse
-import codecs
-import datetime
 from email import message_from_file
-from email.header import Header, decode_header
-from email.utils import parsedate_tz, mktime_tz
-from fnmatch import fnmatch
-from functools import reduce
 import logging
-import operator
-import re
 import sys
 import django
 from django.conf import settings
-from django.contrib.auth.models import User
 from django.utils.log import AdminEmailHandler
-from django.utils import six
-from django.utils.six.moves import map
-from patchwork.models import (Patch, Project, Person, Comment, State,
-                              DelegationRule, Submission, CoverLetter,
-                              get_default_initial_patch_state)
-from patchwork.parser import parse_patch, find_filenames
+from patchwork.parser import parse_mail
 LOGGER = logging.getLogger(__name__)
@@ -56,513 +42,6 @@  VERBOSITY_LEVELS = {
     'critical': logging.CRITICAL
-list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
-def normalise_space(str):
-    whitespace_re = re.compile(r'\s+')
-    return whitespace_re.sub(' ', str).strip()
-def clean_header(header):
-    """Decode (possibly non-ascii) headers."""
-    def decode(fragment):
-        (frag_str, frag_encoding) = fragment
-        if frag_encoding:
-            return frag_str.decode(frag_encoding)
-        elif isinstance(frag_str, six.binary_type):  # python 2
-            return frag_str.decode()
-        return frag_str
-    fragments = list(map(decode, decode_header(header)))
-    return normalise_space(u' '.join(fragments))
-def find_project_by_id(list_id):
-    """Find a `project` object with given `list_id`."""
-    project = None
-    try:
-        project = Project.objects.get(listid=list_id)
-    except Project.DoesNotExist:
-        pass
-    return project
-def find_project_by_header(mail):
-    project = None
-    listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
-                  re.compile(r'^([\S]+)$', re.S)]
-    for header in list_id_headers:
-        if header in mail:
-            for listid_re in listid_res:
-                match = listid_re.match(mail.get(header))
-                if match:
-                    break
-            if not match:
-                continue
-            listid = match.group(1)
-            project = find_project_by_id(listid)
-            if project:
-                break
-    return project
-def find_author(mail):
-    from_header = clean_header(mail.get('From'))
-    name, email = (None, None)
-    # tuple of (regex, fn)
-    #  - where fn returns a (name, email) tuple from the match groups resulting
-    #    from re.match().groups()
-    from_res = [
-        # for "Firstname Lastname" <example@example.com> style addresses
-        (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
-        # for example@example.com (Firstname Lastname) style addresses
-        (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
-        # for example at example.com (Firstname Lastname) style addresses
-        (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
-         (lambda g: (g[2], '@'.join(g[0:2])))),
-        # everything else
-        (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
-    ]
-    for regex, fn in from_res:
-        match = regex.match(from_header)
-        if match:
-            (name, email) = fn(match.groups())
-            break
-    if email is None:
-        raise ValueError("Invalid 'From' header")
-    email = email.strip()
-    if name is not None:
-        name = name.strip()
-    try:
-        person = Person.objects.get(email__iexact=email)
-        if name:  # use the latest provided name
-            person.name = name
-    except Person.DoesNotExist:
-        person = Person(name=name, email=email)
-    return person
-def find_date(mail):
-    t = parsedate_tz(mail.get('Date', ''))
-    if not t:
-        return datetime.datetime.utcnow()
-    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
-def find_headers(mail):
-    return reduce(operator.__concat__,
-                  ['%s: %s\n' % (k, Header(v, header_name=k,
-                                           continuation_ws='\t').encode())
-                   for (k, v) in list(mail.items())])
-def find_pull_request(content):
-    git_re = re.compile(r'^The following changes since commit.*' +
-                        r'^are available in the git repository at:\n'
-                        r'^\s*([\S]+://[^\n]+)$',
-                        re.DOTALL | re.MULTILINE)
-    match = git_re.search(content)
-    if match:
-        return match.group(1)
-    return None
-def find_references(mail):
-    """Construct a list of possible reply message ids."""
-    refs = []
-    if 'In-Reply-To' in mail:
-        refs.append(mail.get('In-Reply-To'))
-    if 'References' in mail:
-        rs = mail.get('References').split()
-        rs.reverse()
-        for r in rs:
-            if r not in refs:
-                refs.append(r)
-    return refs
-def parse_series_marker(subject_prefixes):
-    """Extract series markers from subject.
-    Extract the markers of multi-patches series, i.e. 'x/n', from the
-    provided subject series.
-    Args:
-        subject_prefixes: List of subject prefixes to extract markers
-          from
-    Returns:
-        (x, n) if markers found, else (None, None)
-    """
-    regex = re.compile('^([0-9]+)/([0-9]+)$')
-    for prefix in subject_prefixes:
-        m = regex.match(prefix)
-        if not m:
-            continue
-        return (int(m.group(1)), int(m.group(2)))
-    return (None, None)
-def find_content(project, mail):
-    patchbuf = None
-    commentbuf = ''
-    for part in mail.walk():
-        if part.get_content_maintype() != 'text':
-            continue
-        payload = part.get_payload(decode=True)
-        subtype = part.get_content_subtype()
-        if not isinstance(payload, six.text_type):
-            charset = part.get_content_charset()
-            # Check that we have a charset that we understand. Otherwise,
-            # ignore it and fallback to our standard set.
-            if charset is not None:
-                try:
-                    codecs.lookup(charset)
-                except LookupError:
-                    charset = None
-            # If there is no charset or if it is unknown, then try some common
-            # charsets before we fail.
-            if charset is None:
-                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
-            else:
-                try_charsets = [charset]
-            for cset in try_charsets:
-                try:
-                    payload = six.text_type(payload, cset)
-                    break
-                except UnicodeDecodeError:
-                    payload = None
-            # Could not find a valid decoded payload.  Fail.
-            if payload is None:
-                return None, None
-        if subtype in ['x-patch', 'x-diff']:
-            patchbuf = payload
-        elif subtype == 'plain':
-            c = payload
-            if not patchbuf:
-                patchbuf, c = parse_patch(payload)
-            if c is not None:
-                commentbuf += c.strip() + '\n'
-    commentbuf = clean_content(commentbuf)
-    return patchbuf, commentbuf
-def find_submission_for_comment(project, refs):
-    for ref in refs:
-        # first, check for a direct reply
-        try:
-            submission = Submission.objects.get(project=project, msgid=ref)
-            return submission
-        except Submission.DoesNotExist:
-            pass
-        # see if we have comments that refer to a patch
-        try:
-            comment = Comment.objects.get(submission__project=project,
-                                          msgid=ref)
-            return comment.submission
-        except Comment.MultipleObjectsReturned:
-            # NOTE(stephenfin): This is a artifact of prior lack of support
-            # for cover letters in Patchwork. Previously all replies to
-            # patches were saved as comments. However, it's possible that
-            # someone could have created a new series as a reply to one of the
-            # comments on the original patch series. For example,
-            # '2015-November/002096.html' from the Patchwork archives. In this
-            # case, reparsing the archives will result in creation of a cover
-            # letter with the same message ID as the existing comment. Follow
-            # up comments will then apply to both this cover letter and the
-            # linked patch from the comment previously created. We choose to
-            # apply the comment to the cover letter. Note that this only
-            # happens when running 'parsearchive' or similar, so it should not
-            # affect every day use in any way.
-            comments = Comment.objects.filter(submission__project=project,
-                                              msgid=ref)
-            # The latter item will be the cover letter
-            return comments.reverse()[0].submission
-        except Comment.DoesNotExist:
-            pass
-    return None
-def split_prefixes(prefix):
-    """Turn a prefix string into a list of prefix tokens."""
-    split_re = re.compile(r'[,\s]+')
-    matches = split_re.split(prefix)
-    return [s for s in matches if s != '']
-def clean_subject(subject, drop_prefixes=None):
-    """Clean a Subject: header from an incoming patch.
-    Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
-    default, only [PATCH] is removed, and we keep any other bracketed
-    data in the subject. If drop_prefixes is provided, remove those
-    too, comparing case-insensitively.
-    Args:
-        subject: Subject to be cleaned
-        drop_prefixes: Additional, case-insensitive prefixes to remove
-          from the subject
-    """
-    re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
-    prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
-    subject = clean_header(subject)
-    if drop_prefixes is None:
-        drop_prefixes = []
-    else:
-        drop_prefixes = [s.lower() for s in drop_prefixes]
-    drop_prefixes.append('patch')
-    # remove Re:, Fwd:, etc
-    subject = re_re.sub(' ', subject)
-    subject = normalise_space(subject)
-    prefixes = []
-    match = prefix_re.match(subject)
-    while match:
-        prefix_str = match.group(1)
-        prefixes += [p for p in split_prefixes(prefix_str)
-                     if p.lower() not in drop_prefixes]
-        subject = match.group(2)
-        match = prefix_re.match(subject)
-    subject = normalise_space(subject)
-    subject = subject.strip()
-    if prefixes:
-        subject = '[%s] %s' % (','.join(prefixes), subject)
-    return (subject, prefixes)
-def clean_content(content):
-    """Remove cruft from the email message.
-    Catch signature (-- ) and list footer (_____) cruft.
-    """
-    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
-    content = sig_re.sub('', content)
-    return content.strip()
-def find_state(mail):
-    """Return the state with the given name or the default."""
-    state_name = mail.get('X-Patchwork-State', '').strip()
-    if state_name:
-        try:
-            return State.objects.get(name__iexact=state_name)
-        except State.DoesNotExist:
-            pass
-    return get_default_initial_patch_state()
-def auto_delegate(project, filenames):
-    if not filenames:
-        return None
-    rules = list(DelegationRule.objects.filter(project=project))
-    patch_delegate = None
-    for filename in filenames:
-        file_delegate = None
-        for rule in rules:
-            if fnmatch(filename, rule.path):
-                file_delegate = rule.user
-                break
-        if file_delegate is None:
-            return None
-        if patch_delegate is not None and file_delegate != patch_delegate:
-            return None
-        patch_delegate = file_delegate
-    return patch_delegate
-def find_delegate(mail):
-    """Return the delegate with the given email or None."""
-    delegate_email = mail.get('X-Patchwork-Delegate', '').strip()
-    if delegate_email:
-        try:
-            return User.objects.get(email__iexact=delegate_email)
-        except User.DoesNotExist:
-            pass
-    return None
-def parse_mail(mail, list_id=None):
-    """Parse a mail and add to the database.
-    Args:
-        mail (`mbox.Mail`): Mail to parse and add.
-        list_id (str): Mailing list ID
-    Returns:
-        None
-    """
-    # some basic sanity checks
-    if 'From' not in mail:
-        raise ValueError("Missing 'From' header")
-    if 'Subject' not in mail:
-        raise ValueError("Missing 'Subject' header")
-    if 'Message-Id' not in mail:
-        raise ValueError("Missing 'Message-Id' header")
-    hint = mail.get('X-Patchwork-Hint', '').lower()
-    if hint == 'ignore':
-        LOGGER.debug("Ignoring email due to 'ignore' hint")
-        return
-    if list_id:
-        project = find_project_by_id(list_id)
-    else:
-        project = find_project_by_header(mail)
-    if project is None:
-        LOGGER.error('Failed to find a project for email')
-        return
-    # parse content
-    diff, message = find_content(project, mail)
-    if not (diff or message):
-        return  # nothing to work with
-    msgid = mail.get('Message-Id').strip()
-    author = find_author(mail)
-    name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
-    x, n = parse_series_marker(prefixes)
-    refs = find_references(mail)
-    date = find_date(mail)
-    headers = find_headers(mail)
-    pull_url = find_pull_request(message)
-    # build objects
-    if diff or pull_url:  # patches or pull requests
-        # we delay the saving until we know we have a patch.
-        author.save()
-        delegate = find_delegate(mail)
-        if not delegate and diff:
-            filenames = find_filenames(diff)
-            delegate = auto_delegate(project, filenames)
-        patch = Patch(
-            msgid=msgid,
-            project=project,
-            name=name,
-            date=date,
-            headers=headers,
-            submitter=author,
-            content=message,
-            diff=diff,
-            pull_url=pull_url,
-            delegate=delegate,
-            state=find_state(mail))
-        patch.save()
-        LOGGER.debug('Patch saved')
-        return patch
-    elif x == 0:  # (potential) cover letters
-        # if refs are empty, it's implicitly a cover letter. If not,
-        # however, we need to see if a match already exists and, if
-        # not, assume that it is indeed a new cover letter
-        is_cover_letter = False
-        if not refs == []:
-            try:
-                CoverLetter.objects.all().get(name=name)
-            except CoverLetter.DoesNotExist:  # no match => new cover
-                is_cover_letter = True
-        else:
-            is_cover_letter = True
-        if is_cover_letter:
-            author.save()
-            cover_letter = CoverLetter(
-                msgid=msgid,
-                project=project,
-                name=name,
-                date=date,
-                headers=headers,
-                submitter=author,
-                content=message)
-            cover_letter.save()
-            LOGGER.debug('Cover letter saved')
-            return cover_letter
-    # comments
-    # we only save comments if we have the parent email
-    submission = find_submission_for_comment(project, refs)
-    if not submission:
-        return
-    author.save()
-    comment = Comment(
-        submission=submission,
-        msgid=msgid,
-        date=date,
-        headers=headers,
-        submitter=author,
-        content=message)
-    comment.save()
-    LOGGER.debug('Comment saved')
-    return comment
 extra_error_message = '''
 == Mail
diff --git a/patchwork/parser.py b/patchwork/parser.py
index c9c058d..938b965 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -19,13 +19,359 @@ 
 # along with Patchwork; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+import codecs
+import datetime
+from email.header import Header, decode_header
+from email.utils import parsedate_tz, mktime_tz
+from fnmatch import fnmatch
+from functools import reduce
+import logging
+import operator
 import re
+from django.contrib.auth.models import User
+from django.utils import six
 from django.utils.six.moves import map
+from patchwork.models import (Patch, Project, Person, Comment, State,
+                              DelegationRule, Submission, CoverLetter,
+                              get_default_initial_patch_state)
-_hunk_re = re.compile('^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
-_filename_re = re.compile('^(---|\+\+\+) (\S+)')
+_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
+_filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
+list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
+LOGGER = logging.getLogger(__name__)
+def normalise_space(str):
+    whitespace_re = re.compile(r'\s+')
+    return whitespace_re.sub(' ', str).strip()
+def clean_header(header):
+    """Decode (possibly non-ascii) headers."""
+    def decode(fragment):
+        (frag_str, frag_encoding) = fragment
+        if frag_encoding:
+            return frag_str.decode(frag_encoding)
+        elif isinstance(frag_str, six.binary_type):  # python 2
+            return frag_str.decode()
+        return frag_str
+    fragments = list(map(decode, decode_header(header)))
+    return normalise_space(u' '.join(fragments))
+def find_project_by_id(list_id):
+    """Find a `project` object with given `list_id`."""
+    project = None
+    try:
+        project = Project.objects.get(listid=list_id)
+    except Project.DoesNotExist:
+        pass
+    return project
+def find_project_by_header(mail):
+    project = None
+    listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
+                  re.compile(r'^([\S]+)$', re.S)]
+    for header in list_id_headers:
+        if header in mail:
+            for listid_re in listid_res:
+                match = listid_re.match(mail.get(header))
+                if match:
+                    break
+            if not match:
+                continue
+            listid = match.group(1)
+            project = find_project_by_id(listid)
+            if project:
+                break
+    return project
+def find_author(mail):
+    from_header = clean_header(mail.get('From'))
+    name, email = (None, None)
+    # tuple of (regex, fn)
+    #  - where fn returns a (name, email) tuple from the match groups resulting
+    #    from re.match().groups()
+    from_res = [
+        # for "Firstname Lastname" <example@example.com> style addresses
+        (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
+        # for example@example.com (Firstname Lastname) style addresses
+        (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
+        # for example at example.com (Firstname Lastname) style addresses
+        (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
+         (lambda g: (g[2], '@'.join(g[0:2])))),
+        # everything else
+        (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
+    ]
+    for regex, fn in from_res:
+        match = regex.match(from_header)
+        if match:
+            (name, email) = fn(match.groups())
+            break
+    if email is None:
+        raise ValueError("Invalid 'From' header")
+    email = email.strip()
+    if name is not None:
+        name = name.strip()
+    try:
+        person = Person.objects.get(email__iexact=email)
+        if name:  # use the latest provided name
+            person.name = name
+    except Person.DoesNotExist:
+        person = Person(name=name, email=email)
+    return person
+def find_date(mail):
+    t = parsedate_tz(mail.get('Date', ''))
+    if not t:
+        return datetime.datetime.utcnow()
+    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
+def find_headers(mail):
+    return reduce(operator.__concat__,
+                  ['%s: %s\n' % (k, Header(v, header_name=k,
+                                           continuation_ws='\t').encode())
+                   for (k, v) in list(mail.items())])
+def find_pull_request(content):
+    git_re = re.compile(r'^The following changes since commit.*' +
+                        r'^are available in the git repository at:\n'
+                        r'^\s*([\S]+://[^\n]+)$',
+                        re.DOTALL | re.MULTILINE)
+    match = git_re.search(content)
+    if match:
+        return match.group(1)
+    return None
+def find_references(mail):
+    """Construct a list of possible reply message ids."""
+    refs = []
+    if 'In-Reply-To' in mail:
+        refs.append(mail.get('In-Reply-To'))
+    if 'References' in mail:
+        rs = mail.get('References').split()
+        rs.reverse()
+        for r in rs:
+            if r not in refs:
+                refs.append(r)
+    return refs
+def parse_series_marker(subject_prefixes):
+    """Extract series markers from subject.
+    Extract the markers of multi-patches series, i.e. 'x/n', from the
+    provided subject series.
+    Args:
+        subject_prefixes: List of subject prefixes to extract markers
+          from
+    Returns:
+        (x, n) if markers found, else (None, None)
+    """
+    regex = re.compile('^([0-9]+)/([0-9]+)$')
+    for prefix in subject_prefixes:
+        m = regex.match(prefix)
+        if not m:
+            continue
+        return (int(m.group(1)), int(m.group(2)))
+    return (None, None)
+def find_content(project, mail):
+    """Extract a comment and potential diff from a mail."""
+    patchbuf = None
+    commentbuf = ''
+    for part in mail.walk():
+        if part.get_content_maintype() != 'text':
+            continue
+        payload = part.get_payload(decode=True)
+        subtype = part.get_content_subtype()
+        if not isinstance(payload, six.text_type):
+            charset = part.get_content_charset()
+            # Check that we have a charset that we understand. Otherwise,
+            # ignore it and fallback to our standard set.
+            if charset is not None:
+                try:
+                    codecs.lookup(charset)
+                except LookupError:
+                    charset = None
+            # If there is no charset or if it is unknown, then try some common
+            # charsets before we fail.
+            if charset is None:
+                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
+            else:
+                try_charsets = [charset]
+            for cset in try_charsets:
+                try:
+                    payload = six.text_type(payload, cset)
+                    break
+                except UnicodeDecodeError:
+                    payload = None
+            # Could not find a valid decoded payload.  Fail.
+            if payload is None:
+                return None, None
+        if subtype in ['x-patch', 'x-diff']:
+            patchbuf = payload
+        elif subtype == 'plain':
+            c = payload
+            if not patchbuf:
+                patchbuf, c = parse_patch(payload)
+            if c is not None:
+                commentbuf += c.strip() + '\n'
+    commentbuf = clean_content(commentbuf)
+    return patchbuf, commentbuf
+def find_submission_for_comment(project, refs):
+    for ref in refs:
+        # first, check for a direct reply
+        try:
+            submission = Submission.objects.get(project=project, msgid=ref)
+            return submission
+        except Submission.DoesNotExist:
+            pass
+        # see if we have comments that refer to a patch
+        try:
+            comment = Comment.objects.get(submission__project=project,
+                                          msgid=ref)
+            return comment.submission
+        except Comment.MultipleObjectsReturned:
+            # NOTE(stephenfin): This is a artifact of prior lack of support
+            # for cover letters in Patchwork. Previously all replies to
+            # patches were saved as comments. However, it's possible that
+            # someone could have created a new series as a reply to one of the
+            # comments on the original patch series. For example,
+            # '2015-November/002096.html' from the Patchwork archives. In this
+            # case, reparsing the archives will result in creation of a cover
+            # letter with the same message ID as the existing comment. Follow
+            # up comments will then apply to both this cover letter and the
+            # linked patch from the comment previously created. We choose to
+            # apply the comment to the cover letter. Note that this only
+            # happens when running 'parsearchive' or similar, so it should not
+            # affect every day use in any way.
+            comments = Comment.objects.filter(submission__project=project,
+                                              msgid=ref)
+            # The latter item will be the cover letter
+            return comments.reverse()[0].submission
+        except Comment.DoesNotExist:
+            pass
+    return None
+def split_prefixes(prefix):
+    """Turn a prefix string into a list of prefix tokens."""
+    split_re = re.compile(r'[,\s]+')
+    matches = split_re.split(prefix)
+    return [s for s in matches if s != '']
+def clean_subject(subject, drop_prefixes=None):
+    """Clean a Subject: header from an incoming patch.
+    Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
+    default, only [PATCH] is removed, and we keep any other bracketed
+    data in the subject. If drop_prefixes is provided, remove those
+    too, comparing case-insensitively.
+    Args:
+        subject: Subject to be cleaned
+        drop_prefixes: Additional, case-insensitive prefixes to remove
+          from the subject
+    """
+    re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
+    prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
+    subject = clean_header(subject)
+    if drop_prefixes is None:
+        drop_prefixes = []
+    else:
+        drop_prefixes = [s.lower() for s in drop_prefixes]
+    drop_prefixes.append('patch')
+    # remove Re:, Fwd:, etc
+    subject = re_re.sub(' ', subject)
+    subject = normalise_space(subject)
+    prefixes = []
+    match = prefix_re.match(subject)
+    while match:
+        prefix_str = match.group(1)
+        prefixes += [p for p in split_prefixes(prefix_str)
+                     if p.lower() not in drop_prefixes]
+        subject = match.group(2)
+        match = prefix_re.match(subject)
+    subject = normalise_space(subject)
+    subject = subject.strip()
+    if prefixes:
+        subject = '[%s] %s' % (','.join(prefixes), subject)
+    return (subject, prefixes)
+def clean_content(content):
+    """Remove cruft from the email message.
+    Catch signature (-- ) and list footer (_____) cruft.
+    """
+    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
+    content = sig_re.sub('', content)
+    return content.strip()
 def parse_patch(content):
@@ -181,6 +527,182 @@  def parse_patch(content):
     return patchbuf, commentbuf
+def find_state(mail):
+    """Pick the initial state for a patch from its mail headers.
+    The 'X-Patchwork-State' header value is looked up against state
+    names using an ``iexact`` match. If the header is absent, empty or
+    names no known state, the default initial patch state is returned.
+    """
+    requested = mail.get('X-Patchwork-State', '').strip()
+    if not requested:
+        return get_default_initial_patch_state()
+    try:
+        state = State.objects.get(name__iexact=requested)
+    except State.DoesNotExist:
+        # unknown state name: fall back to the default
+        state = get_default_initial_patch_state()
+    return state
+def auto_delegate(project, filenames):
+    """Choose a delegate for a patch from the project's delegation rules.
+    Each filename is resolved to the user of the first rule whose path
+    pattern matches it. A delegate is returned only when every file
+    resolves to the same user.
+    Args:
+        project: Project whose delegation rules are consulted
+        filenames: Paths of the files touched by the patch
+    Returns:
+        The common delegate, or None if there are no filenames, a file
+        matches no rule, or the files resolve to different users.
+    """
+    if not filenames:
+        return None
+    rules = list(DelegationRule.objects.filter(project=project))
+    chosen = None
+    for path in filenames:
+        # first matching rule wins for this file
+        owner = next(
+            (rule.user for rule in rules if fnmatch(path, rule.path)), None)
+        if owner is None or (chosen is not None and owner != chosen):
+            return None
+        chosen = owner
+    return chosen
+def find_delegate(mail):
+    """Look up the user named by the 'X-Patchwork-Delegate' header.
+    Returns:
+        The user whose email matches the header value (``iexact``
+        match), or None if the header is absent, empty or matches no
+        user.
+    """
+    address = mail.get('X-Patchwork-Delegate', '').strip()
+    if not address:
+        return None
+    try:
+        return User.objects.get(email__iexact=address)
+    except User.DoesNotExist:
+        return None
+def parse_mail(mail, list_id=None):
+    """Parse a mail and add it to the database.
+    Depending on its content the mail is stored as a patch (or pull
+    request), a cover letter, or a comment on an existing submission.
+    Args:
+        mail (`mbox.Mail`): Mail to parse and add.
+        list_id (str): Mailing list ID. If not given, the project is
+          looked up from the mail's headers instead.
+    Returns:
+        The saved Patch, CoverLetter or Comment, or None if the mail was
+        ignored, no project or content was found, or it is a comment
+        whose parent submission is unknown.
+    Raises:
+        ValueError: If the 'From', 'Subject' or 'Message-Id' header is
+          missing.
+    """
+    # some basic sanity checks
+    if 'From' not in mail:
+        raise ValueError("Missing 'From' header")
+    if 'Subject' not in mail:
+        raise ValueError("Missing 'Subject' header")
+    if 'Message-Id' not in mail:
+        raise ValueError("Missing 'Message-Id' header")
+    hint = mail.get('X-Patchwork-Hint', '').lower()
+    if hint == 'ignore':
+        LOGGER.debug("Ignoring email due to 'ignore' hint")
+        return
+    if list_id:
+        project = find_project_by_id(list_id)
+    else:
+        project = find_project_by_header(mail)
+    if project is None:
+        LOGGER.error('Failed to find a project for email')
+        return
+    # parse content
+    diff, message = find_content(project, mail)
+    if not (diff or message):
+        return  # nothing to work with
+    msgid = mail.get('Message-Id').strip()
+    author = find_author(mail)
+    name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
+    x, n = parse_series_marker(prefixes)
+    refs = find_references(mail)
+    date = find_date(mail)
+    headers = find_headers(mail)
+    pull_url = find_pull_request(message)
+    # build objects
+    if diff or pull_url:  # patches or pull requests
+        # we delay the saving until we know we have a patch.
+        author.save()
+        # an explicit delegate header takes precedence over the
+        # project's delegation rules
+        delegate = find_delegate(mail)
+        if not delegate and diff:
+            filenames = find_filenames(diff)
+            delegate = auto_delegate(project, filenames)
+        patch = Patch(
+            msgid=msgid,
+            project=project,
+            name=name,
+            date=date,
+            headers=headers,
+            submitter=author,
+            content=message,
+            diff=diff,
+            pull_url=pull_url,
+            delegate=delegate,
+            state=find_state(mail))
+        patch.save()
+        LOGGER.debug('Patch saved')
+        return patch
+    elif x == 0:  # (potential) cover letters
+        # if refs are empty, it's implicitly a cover letter. If not,
+        # however, we need to see if a match already exists and, if
+        # not, assume that it is indeed a new cover letter
+        if refs == []:
+            is_cover_letter = True
+        else:
+            # use exists() rather than get(): more than one cover letter
+            # may share this name, and get() would then raise
+            # MultipleObjectsReturned and abort the parse
+            is_cover_letter = not CoverLetter.objects.filter(
+                name=name).exists()
+        if is_cover_letter:
+            author.save()
+            cover_letter = CoverLetter(
+                msgid=msgid,
+                project=project,
+                name=name,
+                date=date,
+                headers=headers,
+                submitter=author,
+                content=message)
+            cover_letter.save()
+            LOGGER.debug('Cover letter saved')
+            return cover_letter
+    # comments
+    # we only save comments if we have the parent email
+    submission = find_submission_for_comment(project, refs)
+    if not submission:
+        return
+    author.save()
+    comment = Comment(
+        submission=submission,
+        msgid=msgid,
+        date=date,
+        headers=headers,
+        submitter=author,
+        content=message)
+    comment.save()
+    LOGGER.debug('Comment saved')
+    return comment
 def find_filenames(diff):
     """Find files changes in a given diff."""
     # normalise spaces
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index eca05a0..684a667 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -26,18 +26,19 @@  import os
 from django.test import TestCase
-from patchwork.bin.parsemail import clean_subject
-from patchwork.bin.parsemail import find_author
-from patchwork.bin.parsemail import find_content
-from patchwork.bin.parsemail import find_project_by_header
-from patchwork.bin.parsemail import find_pull_request
-from patchwork.bin.parsemail import parse_mail as _parse_mail
-from patchwork.bin.parsemail import parse_series_marker
-from patchwork.bin.parsemail import split_prefixes
 from patchwork.models import Comment
 from patchwork.models import Patch
 from patchwork.models import Person
 from patchwork.models import State
+from patchwork.parser import clean_subject
+from patchwork.parser import find_author
+from patchwork.parser import find_content
+from patchwork.parser import find_project_by_header
+from patchwork.parser import find_pull_request
+from patchwork.parser import parse_mail as _parse_mail
+from patchwork.parser import parse_series_marker
+from patchwork.parser import split_prefixes
+from patchwork.tests.utils import create_email
 from patchwork.tests.utils import create_project
 from patchwork.tests.utils import create_state
 from patchwork.tests.utils import create_user