diff mbox series

parser: Unmangle From: headers that have been mangled for DMARC purposes

Message ID 20191010062047.21549-1-ajd@linux.ibm.com
State Superseded
Headers show
Series parser: Unmangle From: headers that have been mangled for DMARC purposes | expand

Commit Message

Andrew Donnellan Oct. 10, 2019, 6:20 a.m. UTC
To avoid triggering spam filters due to failed signature validation, many
mailing lists mangle the From header to change the From address to be the
address of the list, typically where the sender's domain has a strict DMARC
policy enabled.

In this case, we should try to unmangle the From header.

Add support for using the X-Original-Sender or Reply-To headers, as used by
Google Groups and Mailman respectively, to unmangle the From header when
necessary.

Closes: #64 ("Incorrect submitter when using googlegroups")
Reported-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
---
 patchwork/parser.py            | 75 ++++++++++++++++++++++++++++++----
 patchwork/tests/test_parser.py | 68 ++++++++++++++++++++++++++++--
 2 files changed, 130 insertions(+), 13 deletions(-)

Comments

Jonathan Nieder Oct. 10, 2019, 7:41 p.m. UTC | #1
Hi,

Andrew Donnellan wrote:

> To avoid triggering spam filters due to failed signature validation, many
> mailing lists mangle the From header to change the From address to be the
> address of the list, typically where the sender's domain has a strict DMARC
> policy enabled.
>
> In this case, we should try to unmangle the From header.
>
> Add support for using the X-Original-Sender or Reply-To headers, as used by
> Google Groups and Mailman respectively, to unmangle the From header when
> necessary.
>
> Closes: #64 ("Incorrect submitter when using googlegroups")
> Reported-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
> Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
> ---
>  patchwork/parser.py            | 75 ++++++++++++++++++++++++++++++----
>  patchwork/tests/test_parser.py | 68 ++++++++++++++++++++++++++++--
>  2 files changed, 130 insertions(+), 13 deletions(-)

Interesting!  I'm cc-ing the Git mailing list in case "git am" might
wnat to learn the same support.

Thanks,
Jonathan

(patch left unsnipped for reference)
> diff --git a/patchwork/parser.py b/patchwork/parser.py
> index 7dc66bc05a5b..79beac5617b1 100644
> --- a/patchwork/parser.py
> +++ b/patchwork/parser.py
> @@ -321,12 +321,7 @@ def find_series(project, mail, author):
>      return _find_series_by_markers(project, mail, author)
>  
>  
> -def get_or_create_author(mail):
> -    from_header = clean_header(mail.get('From'))
> -
> -    if not from_header:
> -        raise ValueError("Invalid 'From' header")
> -
> +def split_from_header(from_header):
>      name, email = (None, None)
>  
>      # tuple of (regex, fn)
> @@ -355,6 +350,65 @@ def get_or_create_author(mail):
>              (name, email) = fn(match.groups())
>              break
>  
> +    return (name, email)
> +
> +
> +# Unmangle From addresses that have been mangled for DMARC purposes.
> +#
> +# To avoid triggering spam filters due to failed signature validation, many
> +# mailing lists mangle the From header to change the From address to be the
> +# address of the list, typically where the sender's domain has a strict
> +# DMARC policy enabled.
> +#
> +# Unfortunately, there's no standardised way of preserving the original
> +# From address.
> +#
> +# Google Groups adds an X-Original-Sender header. If present, we use that.
> +#
> +# Mailman preserves the original address by adding a Reply-To, except in the
> +# case where the list is set to either reply to list, or reply to a specific
> +# address, in which case the original From is added to Cc instead. These corner
> +# cases are dumb, but we try and handle things as sensibly as possible by
> +# looking for a name in Reply-To/Cc that matches From. It's inexact but should
> +# be good enough for our purposes.
> +def get_original_sender(mail, name, email):
> +    if name and ' via ' in name:
> +        # Mailman uses the format "<name> via <list>"
> +        # Google Groups uses "'<name>' via <list>"
> +        stripped_name = name[:name.rfind(' via ')].strip().strip("'")
> +
> +    original_sender = clean_header(mail.get('X-Original-Sender', ''))
> +    if original_sender:
> +        new_email = split_from_header(original_sender)[1].strip()[:255]
> +        return (stripped_name, new_email)
> +
> +    addrs = []
> +    reply_to_headers = mail.get_all('Reply-To') or []
> +    cc_headers = mail.get_all('Cc') or []
> +    for header in reply_to_headers + cc_headers:
> +        header = clean_header(header)
> +        addrs = header.split(",")
> +        for addr in addrs:
> +            new_name, new_email = split_from_header(addr)
> +            if new_name:
> +                new_name = new_name.strip()[:255]
> +            if new_email:
> +                new_email = new_email.strip()[:255]
> +            if new_name == stripped_name:
> +                return (stripped_name, new_email)
> +
> +    # If we can't figure out the original sender, just keep it as is
> +    return (name, email)
> +
> +
> +def get_or_create_author(mail, project=None):
> +    from_header = clean_header(mail.get('From'))
> +
> +    if not from_header:
> +        raise ValueError("Invalid 'From' header")
> +
> +    name, email = split_from_header(from_header)
> +
>      if not email:
>          raise ValueError("Invalid 'From' header")
>  
> @@ -362,6 +416,9 @@ def get_or_create_author(mail):
>      if name is not None:
>          name = name.strip()[:255]
>  
> +    if project and email.lower() == project.listemail.lower():
> +        name, email = get_original_sender(mail, name, email)
> +
>      # this correctly handles the case where we lose the race to create
>      # the person and another process beats us to it. (If the record
>      # does not exist, g_o_c invokes _create_object_from_params which
> @@ -1004,7 +1061,7 @@ def parse_mail(mail, list_id=None):
>  
>      if not is_comment and (diff or pull_url):  # patches or pull requests
>          # we delay the saving until we know we have a patch.
> -        author = get_or_create_author(mail)
> +        author = get_or_create_author(mail, project)
>  
>          delegate = find_delegate_by_header(mail)
>          if not delegate and diff:
> @@ -1099,7 +1156,7 @@ def parse_mail(mail, list_id=None):
>                  is_cover_letter = True
>  
>          if is_cover_letter:
> -            author = get_or_create_author(mail)
> +            author = get_or_create_author(mail, project)
>  
>              # we don't use 'find_series' here as a cover letter will
>              # always be the first item in a thread, thus the references
> @@ -1159,7 +1216,7 @@ def parse_mail(mail, list_id=None):
>      if not submission:
>          return
>  
> -    author = get_or_create_author(mail)
> +    author = get_or_create_author(mail, project)
>  
>      try:
>          comment = Comment.objects.create(
> diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
> index e5a2fca3044e..85c6e7550f93 100644
> --- a/patchwork/tests/test_parser.py
> +++ b/patchwork/tests/test_parser.py
> @@ -265,11 +265,23 @@ class SenderCorrelationTest(TestCase):
>      """
>  
>      @staticmethod
> -    def _create_email(from_header):
> +    def _create_email(from_header, reply_tos=None, ccs=None,
> +                      x_original_sender=None):
>          mail = 'Message-Id: %s\n' % make_msgid() + \
> -               'From: %s\n' % from_header + \
> -               'Subject: Tests\n\n'\
> -               'test\n'
> +               'From: %s\n' % from_header
> +
> +        if reply_tos:
> +            mail += 'Reply-To: %s\n' % ', '.join(reply_tos)
> +
> +        if ccs:
> +            mail += 'Cc: %s\n' % ', '.join(ccs)
> +
> +        if x_original_sender:
> +            mail += 'X-Original-Sender: %s\n' % x_original_sender
> +
> +        mail += 'Subject: Tests\n\n'\
> +            'test\n'
> +
>          return message_from_string(mail)
>  
>      def test_existing_sender(self):
> @@ -311,6 +323,54 @@ class SenderCorrelationTest(TestCase):
>          self.assertEqual(person_b._state.adding, False)
>          self.assertEqual(person_b.id, person_a.id)
>  
> +    def test_mailman_dmarc_munging(self):
> +        project = create_project()
> +        real_sender = 'Existing Sender <existing@example.com>'
> +        munged_sender = 'Existing Sender via List <{}>'.format(
> +            project.listemail)
> +        other_email = 'Other Person <other@example.com>'
> +
> +        # Unmunged author
> +        mail = self._create_email(real_sender)
> +        person_a = get_or_create_author(mail, project)
> +        person_a.save()
> +
> +        # Single Reply-To
> +        mail = self._create_email(munged_sender, [real_sender])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +        # Single Cc
> +        mail = self._create_email(munged_sender, [], [real_sender])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +        # Multiple Reply-Tos and Ccs
> +        mail = self._create_email(munged_sender, [other_email, real_sender],
> +                                  [other_email, other_email])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +    def test_google_dmarc_munging(self):
> +        project = create_project()
> +        real_sender = 'Existing Sender <existing@example.com>'
> +        munged_sender = "'Existing Sender' via List <{}>".format(
> +            project.listemail)
> +
> +        # Unmunged author
> +        mail = self._create_email(real_sender)
> +        person_a = get_or_create_author(mail, project)
> +        person_a.save()
> +
> +        # X-Original-Sender header
> +        mail = self._create_email(munged_sender, None, None, real_sender)
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
>  
>  class SeriesCorrelationTest(TestCase):
>      """Validate correct behavior of find_series."""
Andrew Donnellan Oct. 10, 2019, 9:13 p.m. UTC | #2
On 11/10/19 6:41 am, Jonathan Nieder wrote:
> Interesting!  I'm cc-ing the Git mailing list in case "git am" might
> wnat to learn the same support.
Argh, that reminds me... this patch only rewrites the name and email 
that is recorded as the Patchwork submitter, it doesn't actually rewrite 
the From: header when you fetch the mbox off Patchwork.

Part of me would really like to keep Patchwork mboxes as close as 
possible to the mbox we ingested, but on the other hand it means the 
mangled address is still going to land in the git repo at the end... so 
I should probably just change it?
Jeff King Oct. 10, 2019, 10:54 p.m. UTC | #3
On Thu, Oct 10, 2019 at 12:41:32PM -0700, Jonathan Nieder wrote:

> > Add support for using the X-Original-Sender or Reply-To headers, as used by
> > Google Groups and Mailman respectively, to unmangle the From header when
> > necessary.
> [...]
> Interesting!  I'm cc-ing the Git mailing list in case "git am" might
> wnat to learn the same support.

Neat. There was discussion on a similar issue recently in:

  https://public-inbox.org/git/305577c2-709a-b632-4056-6582771176ac@redhat.com/

where a possible solution was to get senders to use in-body From
headers even when sending their own patches.

This might provide an alternate solution (or vice versa). I kind of like
this one better in that it doesn't require the sender to do anything
differently (but it may be less robust, as it assumes the receiver
reliably de-mangling).

-Peff
Andrew Donnellan Oct. 10, 2019, 11:01 p.m. UTC | #4
On 11/10/19 9:54 am, Jeff King wrote:
> Neat. There was discussion on a similar issue recently in:
> 
>    https://public-inbox.org/git/305577c2-709a-b632-4056-6582771176ac@redhat.com/
> 
> where a possible solution was to get senders to use in-body From
> headers even when sending their own patches.

I think that's a good idea.

> 
> This might provide an alternate solution (or vice versa). I kind of like
> this one better in that it doesn't require the sender to do anything
> differently (but it may be less robust, as it assumes the receiver
> reliably de-mangling).

Yep, it's less robust - but OTOH there's always a long tail of users 
stuck on old versions of git for whatever reason and having some logic 
to detect DMARC munging may thus still be useful.
Jeff King Oct. 10, 2019, 11:06 p.m. UTC | #5
On Fri, Oct 11, 2019 at 10:01:23AM +1100, Andrew Donnellan wrote:

> > This might provide an alternate solution (or vice versa). I kind of like
> > this one better in that it doesn't require the sender to do anything
> > differently (but it may be less robust, as it assumes the receiver
> > reliably de-mangling).
> 
> Yep, it's less robust - but OTOH there's always a long tail of users stuck
> on old versions of git for whatever reason and having some logic to detect
> DMARC munging may thus still be useful.

I think the two features would work together nicely out of the box: if
somebody has an in-body from, we'd respect that before looking at email
headers anyway. So senders who do the extra work will be covered, and
checking other email headers would just improve the fallback case.

-Peff
Daniel Axtens Oct. 10, 2019, 11:16 p.m. UTC | #6
Andrew Donnellan <ajd@linux.ibm.com> writes:

> On 11/10/19 6:41 am, Jonathan Nieder wrote:
>> Interesting!  I'm cc-ing the Git mailing list in case "git am" might
>> wnat to learn the same support.
> Argh, that reminds me... this patch only rewrites the name and email 
> that is recorded as the Patchwork submitter, it doesn't actually rewrite 
> the From: header when you fetch the mbox off Patchwork.
>
> Part of me would really like to keep Patchwork mboxes as close as 
> possible to the mbox we ingested, but on the other hand it means the 
> mangled address is still going to land in the git repo at the end... so 
> I should probably just change it?

Yes, I think change it. If you're worried, stash the original one in the
headers.

>
> -- 
> Andrew Donnellan              OzLabs, ADL Canberra
> ajd@linux.ibm.com             IBM Australia Limited
>
> _______________________________________________
> Patchwork mailing list
> Patchwork@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/patchwork
Stephen Rothwell Oct. 10, 2019, 11:40 p.m. UTC | #7
On Fri, 11 Oct 2019 10:16:40 +1100 Daniel Axtens <dja@axtens.net> wrote:
>
> Andrew Donnellan <ajd@linux.ibm.com> writes:
> 
> > On 11/10/19 6:41 am, Jonathan Nieder wrote:  
> >> Interesting!  I'm cc-ing the Git mailing list in case "git am" might
> >> wnat to learn the same support.  
> > Argh, that reminds me... this patch only rewrites the name and email 
> > that is recorded as the Patchwork submitter, it doesn't actually rewrite 
> > the From: header when you fetch the mbox off Patchwork.
> >
> > Part of me would really like to keep Patchwork mboxes as close as 
> > possible to the mbox we ingested, but on the other hand it means the 
> > mangled address is still going to land in the git repo at the end... so 
> > I should probably just change it?  
> 
> Yes, I think change it. If you're worried, stash the original one in the
> headers.

Or add a From: line to the start of the body (if one is not already there).
Junio C Hamano Oct. 11, 2019, 4:29 a.m. UTC | #8
Jeff King <peff@peff.net> writes:

> This might provide an alternate solution (or vice versa). I kind of like
> this one better in that it doesn't require the sender to do anything
> differently (but it may be less robust, as it assumes the receiver
> reliably de-mangling).

I share the assessment.  I also feel that relying on Reply-To: would
make the result a lot less reliable (I do not have much problem with
the use of X-Original-Sender, though).
Andrew Donnellan Oct. 11, 2019, 4:36 a.m. UTC | #9
On 11/10/19 3:29 pm, Junio C Hamano wrote:
> Jeff King <peff@peff.net> writes:
> 
>> This might provide an alternate solution (or vice versa). I kind of like
>> this one better in that it doesn't require the sender to do anything
>> differently (but it may be less robust, as it assumes the receiver
>> reliably de-mangling).
> 
> I share the assessment.  I also feel that relying on Reply-To: would
> make the result a lot less reliable (I do not have much problem with
> the use of X-Original-Sender, though).
> 

It would be nice if Mailman could adopt X-Original-Sender too. As it is, 
it adds the original sender to Reply-To, but in some cases (where the 
list is set as reply-to-list, or has a custom reply-to setting) it adds 
to Cc instead. (In the patch that started this thread, I match the name 
from the munged From field against the name in Reply-To/Cc for the case 
where there's multiple Reply-Tos/Ccs.)

For the Patchwork use case, I'm quite okay with accepting the risk of 
using Reply-To, as the alternative is worse, the corner cases are rare, 
and ultimately a maintainer can still fix the odd stuff-up before 
applying the patch.
Andrew Donnellan Oct. 11, 2019, 4:50 a.m. UTC | #10
On 11/10/19 3:36 pm, Andrew Donnellan wrote:
> It would be nice if Mailman could adopt X-Original-Sender too. As it is, 

(which I have gone ahead and reported as 
https://gitlab.com/mailman/mailman/issues/641)
Christian Schoenebeck Oct. 11, 2019, 1:13 p.m. UTC | #11
On Freitag, 11. Oktober 2019 06:50:14 CEST Andrew Donnellan wrote:
> On 11/10/19 3:36 pm, Andrew Donnellan wrote:
> > It would be nice if Mailman could adopt X-Original-Sender too. As it is,
> 
> (which I have gone ahead and reported as
> https://gitlab.com/mailman/mailman/issues/641)

Not stopping you from doing that, since I still think that it'd be helpful if 
mailman added some kind X-Original-Sender header in case the email has to be 
munged for some reason. Just some notes about status & consensus we had:

1. On GNU lists the default mailman settings are now to prevent munging in 
first place (if possible):
https://lists.gnu.org/archive/html/qemu-devel/2019-09/msg00416.html

2. If any list member has the "nodup" mailman option turned on, mailman would 
still munge emails due to that. Ian (on CC) worked on a patch to override that 
individual user setting automatically if necessary:
https://bugs.launchpad.net/mailman/+bug/1845751

3. On git side it was suggested to add some kind of "always_use_in_body_from" 
option:
https://public-inbox.org/git/20190923222415.GA22495@sigill.intra.peff.net/

Unless that git option exists, this little trick proofed as usable workaround 
for git patch submitters suffering from munging:
https://lists.gnu.org/archive/html/qemu-devel/2019-09/msg00932.html

4. MTA's should also address this DKIM issue more accurately. For instance 
Exim is currently by default filling the "dkim h=..." header with "all header 
names listed in RFC4871 will be used, whether or not each header is present in 
the message":
https://www.exim.org/exim-html-current/doc/html/spec_html/ch-dkim_and_spf.html
That "h=" tag in email's dkim header lists all email headers which were 
included by MTA for signing the message. However IMO MTA's should not list any 
"List-*" header name in "dkim h=..." (at least not if not present in message), 
otherwise mailman is forced to munge any of such messages when adding its 
required List-* headers.

BTW section 5.5. (page 38) of that RFC4871 actually sais these headers "SHOULD 
be included in the signature, if they are present in the message being 
signed".

For now you can override this setting, e.g. by using Exim's 
"dkim_sign_headers" setting and providing your own list of header names, but 
from security point of view that's suboptimal, since admins probably leave 
that untouched for years and new security relevant headers might not be 
included for signing at some point in future. So IMO it would make sense to 
add more fine graded MTA DKIM config options like:
"include these headers for dkim signing only if present in message"
and/or
"use default header names except of these".

By taking these things into account, emails of domains with strict DMARC 
policies are no longer munged on gnu lists.

Best regards,
Christian Schoenebeck
Daniel Axtens Oct. 11, 2019, 3:42 p.m. UTC | #12
Hi,

>> Neat. There was discussion on a similar issue recently in:
>> 
>>    https://public-inbox.org/git/305577c2-709a-b632-4056-6582771176ac@redhat.com/
>> 
>> where a possible solution was to get senders to use in-body From
>> headers even when sending their own patches.
>
> I think that's a good idea.
>
>> 
>> This might provide an alternate solution (or vice versa). I kind of like
>> this one better in that it doesn't require the sender to do anything
>> differently (but it may be less robust, as it assumes the receiver
>> reliably de-mangling).
>
> Yep, it's less robust - but OTOH there's always a long tail of users 
> stuck on old versions of git for whatever reason and having some logic 
> to detect DMARC munging may thus still be useful.

I'm not sure this solution is correct.

If I take a patch from Andrew, backport it, and send to the list, Andrew
will be listed in the in-body From. However, he shouldn't be the sender
from the Patchwork point of view: he shouldn't get the patch status
notification emails - I should. We don't want to spam an original author
if their patch is backported to several different releases, or picked up
and resent in someone else's series, etc etc. So unless I've
misunderstood something, we can't rely on the in-body from matching
Patchwork's understanding of the sender.

Regards,
Daniel

>
> -- 
> Andrew Donnellan              OzLabs, ADL Canberra
> ajd@linux.ibm.com             IBM Australia Limited
>
> _______________________________________________
> Patchwork mailing list
> Patchwork@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/patchwork
Jeff King Oct. 11, 2019, 3:51 p.m. UTC | #13
On Sat, Oct 12, 2019 at 02:42:49AM +1100, Daniel Axtens wrote:

> >> where a possible solution was to get senders to use in-body From
> >> headers even when sending their own patches.
> [...]
> I'm not sure this solution is correct.
> 
> If I take a patch from Andrew, backport it, and send to the list, Andrew
> will be listed in the in-body From. However, he shouldn't be the sender
> from the Patchwork point of view: he shouldn't get the patch status
> notification emails - I should. We don't want to spam an original author
> if their patch is backported to several different releases, or picked up
> and resent in someone else's series, etc etc. So unless I've
> misunderstood something, we can't rely on the in-body from matching
> Patchwork's understanding of the sender.

Yeah, it may be that patchwork and git have two different priorities
here. From my perspective, the problem is getting the patch into a git
repo with the right author name. But patchwork may want to make the
distinction between author and sender.

-Peff
Ian Kelling Oct. 11, 2019, 5:36 p.m. UTC | #14
Christian Schoenebeck <qemu_oss@crudebyte.com> writes:

> 4. MTA's should also address this DKIM issue more accurately.

I agree that Exim should be changed as you suggest.

>
> By taking these things into account, emails of domains with strict DMARC 
> policies are no longer munged on gnu lists.

Additional info: Migration of many gnu/nongnu.gnu.org lists is still in
progress for another week or so, then that will be true for most of
them. For a minority of lists, the list administrators have set weird
settings like making all messages have from: rewritten as from this
list, and we are leaving them as is since the list administrators opted
in to that at some point. But if the list deals with patches and not
modifying the headers is useful to the people on the list, I think a
request to change the list settings is likely to be accepted by the list
admin.
Andrew Donnellan Oct. 13, 2019, 11:05 a.m. UTC | #15
On 12/10/19 2:51 am, Jeff King wrote:
> On Sat, Oct 12, 2019 at 02:42:49AM +1100, Daniel Axtens wrote:
> 
>>>> where a possible solution was to get senders to use in-body From
>>>> headers even when sending their own patches.
>> [...]
>> I'm not sure this solution is correct.
>>
>> If I take a patch from Andrew, backport it, and send to the list, Andrew
>> will be listed in the in-body From. However, he shouldn't be the sender
>> from the Patchwork point of view: he shouldn't get the patch status
>> notification emails - I should. We don't want to spam an original author
>> if their patch is backported to several different releases, or picked up
>> and resent in someone else's series, etc etc. So unless I've
>> misunderstood something, we can't rely on the in-body from matching
>> Patchwork's understanding of the sender.
> 
> Yeah, it may be that patchwork and git have two different priorities
> here. From my perspective, the problem is getting the patch into a git
> repo with the right author name. But patchwork may want to make the
> distinction between author and sender.
> 

Yes, I was referring to the git am case, not the Patchwork case.
diff mbox series

Patch

diff --git a/patchwork/parser.py b/patchwork/parser.py
index 7dc66bc05a5b..79beac5617b1 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -321,12 +321,7 @@  def find_series(project, mail, author):
     return _find_series_by_markers(project, mail, author)
 
 
-def get_or_create_author(mail):
-    from_header = clean_header(mail.get('From'))
-
-    if not from_header:
-        raise ValueError("Invalid 'From' header")
-
+def split_from_header(from_header):
     name, email = (None, None)
 
     # tuple of (regex, fn)
@@ -355,6 +350,65 @@  def get_or_create_author(mail):
             (name, email) = fn(match.groups())
             break
 
+    return (name, email)
+
+
+# Unmangle From addresses that have been mangled for DMARC purposes.
+#
+# To avoid triggering spam filters due to failed signature validation, many
+# mailing lists mangle the From header to change the From address to be the
+# address of the list, typically where the sender's domain has a strict
+# DMARC policy enabled.
+#
+# Unfortunately, there's no standardised way of preserving the original
+# From address.
+#
+# Google Groups adds an X-Original-Sender header. If present, we use that.
+#
+# Mailman preserves the original address by adding a Reply-To, except in the
+# case where the list is set to either reply to list, or reply to a specific
+# address, in which case the original From is added to Cc instead. These corner
+# cases are dumb, but we try and handle things as sensibly as possible by
+# looking for a name in Reply-To/Cc that matches From. It's inexact but should
+# be good enough for our purposes.
+def get_original_sender(mail, name, email):
+    if name and ' via ' in name:
+        # Mailman uses the format "<name> via <list>"
+        # Google Groups uses "'<name>' via <list>"
+        stripped_name = name[:name.rfind(' via ')].strip().strip("'")
+
+    original_sender = clean_header(mail.get('X-Original-Sender', ''))
+    if original_sender:
+        new_email = split_from_header(original_sender)[1].strip()[:255]
+        return (stripped_name, new_email)
+
+    addrs = []
+    reply_to_headers = mail.get_all('Reply-To') or []
+    cc_headers = mail.get_all('Cc') or []
+    for header in reply_to_headers + cc_headers:
+        header = clean_header(header)
+        addrs = header.split(",")
+        for addr in addrs:
+            new_name, new_email = split_from_header(addr)
+            if new_name:
+                new_name = new_name.strip()[:255]
+            if new_email:
+                new_email = new_email.strip()[:255]
+            if new_name == stripped_name:
+                return (stripped_name, new_email)
+
+    # If we can't figure out the original sender, just keep it as is
+    return (name, email)
+
+
+def get_or_create_author(mail, project=None):
+    from_header = clean_header(mail.get('From'))
+
+    if not from_header:
+        raise ValueError("Invalid 'From' header")
+
+    name, email = split_from_header(from_header)
+
     if not email:
         raise ValueError("Invalid 'From' header")
 
@@ -362,6 +416,9 @@  def get_or_create_author(mail):
     if name is not None:
         name = name.strip()[:255]
 
+    if project and email.lower() == project.listemail.lower():
+        name, email = get_original_sender(mail, name, email)
+
     # this correctly handles the case where we lose the race to create
     # the person and another process beats us to it. (If the record
     # does not exist, g_o_c invokes _create_object_from_params which
@@ -1004,7 +1061,7 @@  def parse_mail(mail, list_id=None):
 
     if not is_comment and (diff or pull_url):  # patches or pull requests
         # we delay the saving until we know we have a patch.
-        author = get_or_create_author(mail)
+        author = get_or_create_author(mail, project)
 
         delegate = find_delegate_by_header(mail)
         if not delegate and diff:
@@ -1099,7 +1156,7 @@  def parse_mail(mail, list_id=None):
                 is_cover_letter = True
 
         if is_cover_letter:
-            author = get_or_create_author(mail)
+            author = get_or_create_author(mail, project)
 
             # we don't use 'find_series' here as a cover letter will
             # always be the first item in a thread, thus the references
@@ -1159,7 +1216,7 @@  def parse_mail(mail, list_id=None):
     if not submission:
         return
 
-    author = get_or_create_author(mail)
+    author = get_or_create_author(mail, project)
 
     try:
         comment = Comment.objects.create(
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index e5a2fca3044e..85c6e7550f93 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -265,11 +265,23 @@  class SenderCorrelationTest(TestCase):
     """
 
     @staticmethod
-    def _create_email(from_header):
+    def _create_email(from_header, reply_tos=None, ccs=None,
+                      x_original_sender=None):
         mail = 'Message-Id: %s\n' % make_msgid() + \
-               'From: %s\n' % from_header + \
-               'Subject: Tests\n\n'\
-               'test\n'
+               'From: %s\n' % from_header
+
+        if reply_tos:
+            mail += 'Reply-To: %s\n' % ', '.join(reply_tos)
+
+        if ccs:
+            mail += 'Cc: %s\n' % ', '.join(ccs)
+
+        if x_original_sender:
+            mail += 'X-Original-Sender: %s\n' % x_original_sender
+
+        mail += 'Subject: Tests\n\n'\
+            'test\n'
+
         return message_from_string(mail)
 
     def test_existing_sender(self):
@@ -311,6 +323,54 @@  class SenderCorrelationTest(TestCase):
         self.assertEqual(person_b._state.adding, False)
         self.assertEqual(person_b.id, person_a.id)
 
+    def test_mailman_dmarc_munging(self):
+        project = create_project()
+        real_sender = 'Existing Sender <existing@example.com>'
+        munged_sender = 'Existing Sender via List <{}>'.format(
+            project.listemail)
+        other_email = 'Other Person <other@example.com>'
+
+        # Unmunged author
+        mail = self._create_email(real_sender)
+        person_a = get_or_create_author(mail, project)
+        person_a.save()
+
+        # Single Reply-To
+        mail = self._create_email(munged_sender, [real_sender])
+        person_b = get_or_create_author(mail, project)
+        self.assertEqual(person_b._state.adding, False)
+        self.assertEqual(person_b.id, person_a.id)
+
+        # Single Cc
+        mail = self._create_email(munged_sender, [], [real_sender])
+        person_b = get_or_create_author(mail, project)
+        self.assertEqual(person_b._state.adding, False)
+        self.assertEqual(person_b.id, person_a.id)
+
+        # Multiple Reply-Tos and Ccs
+        mail = self._create_email(munged_sender, [other_email, real_sender],
+                                  [other_email, other_email])
+        person_b = get_or_create_author(mail, project)
+        self.assertEqual(person_b._state.adding, False)
+        self.assertEqual(person_b.id, person_a.id)
+
+    def test_google_dmarc_munging(self):
+        project = create_project()
+        real_sender = 'Existing Sender <existing@example.com>'
+        munged_sender = "'Existing Sender' via List <{}>".format(
+            project.listemail)
+
+        # Unmunged author
+        mail = self._create_email(real_sender)
+        person_a = get_or_create_author(mail, project)
+        person_a.save()
+
+        # X-Original-Sender header
+        mail = self._create_email(munged_sender, None, None, real_sender)
+        person_b = get_or_create_author(mail, project)
+        self.assertEqual(person_b._state.adding, False)
+        self.assertEqual(person_b.id, person_a.id)
+
 
 class SeriesCorrelationTest(TestCase):
     """Validate correct behavior of find_series."""