Patchwork Use email addresses of recipients as fallback to lookup the patch's project

login
register
mail settings
Submitter Guilherme Salgado
Date April 14, 2011, 2:05 p.m.
Message ID <20110414140550.17875.90233.stgit@localhost6.localdomain6>
Download mbox | patch
Permalink /patch/91241/
State Accepted
Headers show

Comments

Guilherme Salgado - April 14, 2011, 2:05 p.m.
This is used only when the List-ID lookup doesn't return any projects *and*
the PATCHWORK_FALLBACK_TO_LISTEMAIL setting is set to True.

Signed-off-by: Guilherme Salgado <guilherme.salgado@linaro.org>
---
 apps/patchwork/bin/parsemail.py     |   30 ++++++++++++++++++++-
 apps/patchwork/tests/patchparser.py |   50 ++++++++++++++++++++++++++++++++---
 apps/settings.py                    |    3 ++
 3 files changed, 78 insertions(+), 5 deletions(-)
Jeremy Kerr - April 19, 2011, 6:21 a.m.
Hi Guilherme,

> This is used only when the List-ID lookup doesn't return any projects *and*
> the PATCHWORK_FALLBACK_TO_LISTEMAIL setting is set to True.

Just one more thing I noticed when applying this:

> +def extract_email_addresses(str):
> +    email_re = re.compile(
> +        r"([_\.0-9a-zA-Z-+=]+@(([0-9a-zA-Z-]{1,}\.)*)[a-zA-Z]{2,})")

Is there not an existing RE we can use here? Would be good not to have
to reinvent the wheel.

If not:

> +    # re.findall() will return a list of tuples because we have multiple
> +    # groups on the regex above, but we're only interested on the outermost
> +    # group (which should contain the whole email address), so we drop the
> +    # second and third groups.

In that case, use the non-grouping format: (?:...).

Also, using \w allows you to reduce the size of the regex.

Cheers,


Jeremy
Guilherme Salgado - April 19, 2011, 11:43 a.m.
Hi Jeremy,

On Tue, 2011-04-19 at 14:21 +0800, Jeremy Kerr wrote:
> Hi Guilherme,
> 
> > This is used only when the List-ID lookup doesn't return any projects *and*
> > the PATCHWORK_FALLBACK_TO_LISTEMAIL setting is set to True.
> 
> Just one more thing I noticed when applying this:
> 
> > +def extract_email_addresses(str):
> > +    email_re = re.compile(
> > +        r"([_\.0-9a-zA-Z-+=]+@(([0-9a-zA-Z-]{1,}\.)*)[a-zA-Z]{2,})")
> 
> Is there not an existing RE we can use here? Would be good not to have
> to reinvent the wheel.

The only one I could find is in django.core.validators but it doesn't
seem to be part of the public API, so it's probably not a good idea to
use it.

email_re = re.compile(
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*"  # dot-atom
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-011\013\014\016-\177])*"' # quoted-string
    r')@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?$', re.IGNORECASE)  # domain

> 
> If not:
> 
> > +    # re.findall() will return a list of tuples because we have multiple
> > +    # groups on the regex above, but we're only interested on the outermost
> > +    # group (which should contain the whole email address), so we drop the
> > +    # second and third groups.
> 
> In that case, use the non-grouping format: (?:...).

Oh, good catch; I didn't know about that. :)

> 
> Also, using \w allows you to reduce the size of the regex.

Indeed, I can use \w on the user part, but not on the hostname part as
\w matches the underscore as well, which doesn't seem to be allowed in
hostnames.

Thanks for the review and suggestions, I'll submit a second version of
this patch in a minute.

Patch

diff --git a/apps/patchwork/bin/parsemail.py b/apps/patchwork/bin/parsemail.py
index c36dae4..2e93e0a 100755
--- a/apps/patchwork/bin/parsemail.py
+++ b/apps/patchwork/bin/parsemail.py
@@ -24,6 +24,7 @@  import re
 import datetime
 import time
 import operator
+import settings
 from email import message_from_file
 try:
     from email.header import Header, decode_header
@@ -57,7 +58,7 @@  def clean_header(header):
 
     return normalise_space(u' '.join(fragments))
 
-def find_project(mail):
+def find_project_by_listid(mail):
     project = None
     listid_res = [re.compile('.*<([^>]+)>.*', re.S),
                   re.compile('^([\S]+)$', re.S)]
@@ -83,6 +84,33 @@  def find_project(mail):
 
     return project
 
+def extract_email_addresses(str):
+    email_re = re.compile(
+        r"([_\.0-9a-zA-Z-+=]+@(([0-9a-zA-Z-]{1,}\.)*)[a-zA-Z]{2,})")
+    # re.findall() will return a list of tuples because we have multiple
+    # groups on the regex above, but we're only interested on the outermost
+    # group (which should contain the whole email address), so we drop the
+    # second and third groups.
+    return [email for email, dummy, dummy2 in email_re.findall(str)]
+
+def find_project_by_list_address(mail):
+    recipients = mail.get('To', '') + ' ' + mail.get('CC', '')
+    email_addresses = extract_email_addresses(recipients)
+    for email_address in email_addresses:
+        try:
+            return Project.objects.get(listemail = email_address)
+        except Project.DoesNotExist:
+            pass
+    print ("Unable to find a project for any of the recipients: %s" %
+           email_addresses)
+    return None
+
+def find_project(mail):
+    project = find_project_by_listid(mail)
+    if project is None and settings.PATCHWORK_FALLBACK_TO_LISTEMAIL:
+        project = find_project_by_list_address(mail)
+    return project
+
 def find_author(mail):
 
     from_header = clean_header(mail.get('From'))
diff --git a/apps/patchwork/tests/patchparser.py b/apps/patchwork/tests/patchparser.py
index 7013e85..d141412 100644
--- a/apps/patchwork/tests/patchparser.py
+++ b/apps/patchwork/tests/patchparser.py
@@ -20,6 +20,7 @@ 
 import unittest
 import os
 from email import message_from_string
+import settings
 from patchwork.models import Project, Person, Patch, Comment
 from patchwork.tests.utils import read_patch, read_mail, create_email, defaults
 
@@ -34,8 +35,9 @@  class PatchTest(unittest.TestCase):
     default_subject = defaults.subject
     project = defaults.project
 
-from patchwork.bin.parsemail import find_content, find_author, find_project, \
-                                    parse_mail
+from patchwork.bin.parsemail import (
+    extract_email_addresses, find_content, find_author, find_project,
+    parse_mail)
 
 class InlinePatchTest(PatchTest):
     patch_filename = '0001-add-line.patch'
@@ -299,18 +301,56 @@  class MultipleProjectPatchCommentTest(MultipleProjectPatchTest):
             # and the one we parsed in setUp()
             self.assertEquals(Comment.objects.filter(patch = patch).count(), 2)
 
-class ListIdHeaderTest(unittest.TestCase):
-    """ Test that we parse List-Id headers from mails correctly """
+class EmailProjectGuessing(unittest.TestCase):
+    """Projects are guessed based on List-Id headers or recipient addresses"""
     def setUp(self):
+        self.orig_fallback_to_listemail = \
+            settings.PATCHWORK_FALLBACK_TO_LISTEMAIL
+        settings.PATCHWORK_FALLBACK_TO_LISTEMAIL = False
         self.project = Project(linkname = 'test-project-1', name = 'Project 1',
                 listid = '1.example.com', listemail='1@example.com')
         self.project.save()
 
+    def testExtractingEmailAddressesFromRecipientsList(self):
+        emails = extract_email_addresses(
+            '"Foo Bar" <foo.bar@example.com>,'
+            '<baz+list@foo.example.com>,'
+            'bar-foo@bar.foo.example.com,'
+            # Notice that this one is not a valid email address.
+            'bar-foo@.com')
+        self.assertEqual(
+            ['foo.bar@example.com',
+             'baz+list@foo.example.com',
+             'bar-foo@bar.foo.example.com'],
+            emails)
+
+    def testDoNotFallbackToEmailAddressWhenNotConfiguredTo(self):
+        self.assertFalse(settings.PATCHWORK_FALLBACK_TO_LISTEMAIL)
+        email = MIMEText('')
+        email['To'] = '"First dev list" <1@example.com>'
+        project = find_project(email)
+        self.assertEquals(None, project)
+
     def testNoListId(self):
         email = MIMEText('')
         project = find_project(email)
         self.assertEquals(project, None)
 
+    def testNoListIdWithListEmailAsRecipient(self):
+        settings.PATCHWORK_FALLBACK_TO_LISTEMAIL = True
+        email = MIMEText('')
+        email['To'] = '"First dev list" <1@example.com>'
+        project = find_project(email)
+        self.assertEquals(self.project, project)
+
+    def testNoListIdWithListEmailAsCC(self):
+        settings.PATCHWORK_FALLBACK_TO_LISTEMAIL = True
+        email = MIMEText('')
+        email['CC'] = ('"First maintainer <maintainer@example.com>, '
+                       '"First dev list" <1@example.com>')
+        project = find_project(email)
+        self.assertEquals(self.project, project)
+
     def testBlankListId(self):
         email = MIMEText('')
         email['List-Id'] = ''
@@ -344,6 +384,8 @@  class ListIdHeaderTest(unittest.TestCase):
         self.assertEquals(project, self.project)
 
     def tearDown(self):
+        settings.PATCHWORK_FALLBACK_TO_LISTEMAIL = \
+            self.orig_fallback_to_listemail
         self.project.delete()
 
 
diff --git a/apps/settings.py b/apps/settings.py
index 401dfd5..f938310 100644
--- a/apps/settings.py
+++ b/apps/settings.py
@@ -118,6 +118,9 @@  INSTALLED_APPS = (
 
 DEFAULT_PATCHES_PER_PAGE = 100
 DEFAULT_FROM_EMAIL = 'Patchwork <patchwork@patchwork.example.com>'
+# If set to True, this will cause the parsemail script to lookup projects
+# by email address when one cannot be found by list ID.
+PATCHWORK_FALLBACK_TO_LISTEMAIL = False
 
 ACCOUNT_ACTIVATION_DAYS = 7