diff mbox series

parser: Ignore CFWS in In-Reply-To, References headers

Message ID 20220506171027.723718-1-stephen@that.guru
State New
Headers show
Series parser: Ignore CFWS in In-Reply-To, References headers | expand

Commit Message

Stephen Finucane May 6, 2022, 5:10 p.m. UTC
RFC2822 states that [1] a comment or folding white space is permitted to
be inserted before or after a msg-id in in the Message-ID, In-Reply-To
or References fields. Allow for this.

[1] https://tools.ietf.org/html/rfc2822#section-3.6.4

Signed-off-by: Stephen Finucane <stephen@that.guru>
Closes: #399
---
 patchwork/parser.py                           | 13 ++-
 patchwork/tests/test_parser.py                | 79 +++++++++++++++++--
 .../notes/issue-399-09d6f17aa54b14b2.yaml     | 11 +++
 3 files changed, 89 insertions(+), 14 deletions(-)
 create mode 100644 releasenotes/notes/issue-399-09d6f17aa54b14b2.yaml

Comments

DJ Delorie May 6, 2022, 7:31 p.m. UTC | #1
Stephen Finucane <stephen@that.guru> writes:
> diff --git patchwork/parser.py patchwork/parser.py
> index e6e1a7fb..17cc2325 100644
> --- patchwork/parser.py
> +++ patchwork/parser.py
> @@ -31,6 +31,7 @@ from patchwork.models import SeriesReference
>  from patchwork.models import State
>  
>  
> +_msgid_re = re.compile(r'<[^>]+>')

Ok; a msgid is <something> and we don't support <> (empty) msgid

>      if 'In-Reply-To' in mail:
>          for in_reply_to in mail.get_all('In-Reply-To'):
> -            r = clean_header(in_reply_to)
> -            if r:
> -                refs.append(r)
> +            ref = _msgid_re.search(clean_header(in_reply_to))
> +            if ref:
> +                refs.append(ref.group(0))

Instead of appending the header as-is, we extract all <*> patterns, OK.
Would have the same effect if the string only had a msgid in it.

>      if 'References' in mail:
>          for references_header in mail.get_all('References'):
> -            h = clean_header(references_header)
> -            if not h:
> -                continue
> -            references = h.split()
> +            references = _msgid_re.findall(clean_header(references_header))
>              references.reverse()
>              for ref in references:
> -                ref = ref.strip()
>                  if ref not in refs:
>                      refs.append(ref)

Likewise OK.

> diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py

The rest looks all OK to me, without needing line-by-line review.

Thanks!
Reviewed-by: DJ Delorie <dj@redhat.com>
diff mbox series

Patch

diff --git patchwork/parser.py patchwork/parser.py
index e6e1a7fb..17cc2325 100644
--- patchwork/parser.py
+++ patchwork/parser.py
@@ -31,6 +31,7 @@  from patchwork.models import SeriesReference
 from patchwork.models import State
 
 
+_msgid_re = re.compile(r'<[^>]+>')
 _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
 _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
 list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
@@ -483,19 +484,15 @@  def find_references(mail):
 
     if 'In-Reply-To' in mail:
         for in_reply_to in mail.get_all('In-Reply-To'):
-            r = clean_header(in_reply_to)
-            if r:
-                refs.append(r)
+            ref = _msgid_re.search(clean_header(in_reply_to))
+            if ref:
+                refs.append(ref.group(0))
 
     if 'References' in mail:
         for references_header in mail.get_all('References'):
-            h = clean_header(references_header)
-            if not h:
-                continue
-            references = h.split()
+            references = _msgid_re.findall(clean_header(references_header))
             references.reverse()
             for ref in references:
-                ref = ref.strip()
                 if ref not in refs:
                     refs.append(ref)
 
diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py
index 12f984bc..f65ad4b1 100644
--- patchwork/tests/test_parser.py
+++ patchwork/tests/test_parser.py
@@ -74,6 +74,7 @@  def _create_email(
     sender=None,
     listid=None,
     in_reply_to=None,
+    references=None,
     headers=None,
 ):
     msg['Message-Id'] = msgid or make_msgid()
@@ -84,6 +85,9 @@  def _create_email(
     if in_reply_to:
         msg['In-Reply-To'] = in_reply_to
 
+    if references:
+        msg['References'] = references
+
     for header in headers or {}:
         msg[header] = headers[header]
 
@@ -97,12 +101,21 @@  def create_email(
     sender=None,
     listid=None,
     in_reply_to=None,
+    references=None,
     headers=None,
 ):
     msg = MIMEText(content, _charset='us-ascii')
 
     return _create_email(
-        msg, msgid, subject, sender, listid, in_reply_to, headers)
+        msg,
+        msgid,
+        subject,
+        sender,
+        listid,
+        in_reply_to,
+        references,
+        headers,
+    )
 
 
 def parse_mail(*args, **kwargs):
@@ -1216,7 +1229,7 @@  class DuplicateMailTest(TestCase):
 
     def test_duplicate_patch(self):
         diff = read_patch('0001-add-line.patch')
-        m = create_email(diff, listid=self.listid, msgid='1@example.com')
+        m = create_email(diff, listid=self.listid, msgid='<1@example.com>')
 
         self._test_duplicate_mail(m)
 
@@ -1224,18 +1237,26 @@  class DuplicateMailTest(TestCase):
 
     def test_duplicate_comment(self):
         diff = read_patch('0001-add-line.patch')
-        m1 = create_email(diff, listid=self.listid, msgid='1@example.com')
+        m1 = create_email(
+            diff,
+            listid=self.listid,
+            msgid='<1@example.com>',
+        )
         _parse_mail(m1)
 
-        m2 = create_email('test', listid=self.listid, msgid='2@example.com',
-                          in_reply_to='1@example.com')
+        m2 = create_email(
+            'test',
+            listid=self.listid,
+            msgid='<2@example.com>',
+            in_reply_to='<1@example.com>',
+        )
         self._test_duplicate_mail(m2)
 
         self.assertEqual(Patch.objects.count(), 1)
         self.assertEqual(PatchComment.objects.count(), 1)
 
     def test_duplicate_coverletter(self):
-        m = create_email('test', listid=self.listid, msgid='1@example.com')
+        m = create_email('test', listid=self.listid, msgid='<1@example.com>')
         del m['Subject']
         m['Subject'] = '[PATCH 0/1] test cover letter'
 
@@ -1244,6 +1265,52 @@  class DuplicateMailTest(TestCase):
         self.assertEqual(Cover.objects.count(), 1)
 
 
+class TestFindReferences(TestCase):
+
+    def test_find_references__header_with_comments(self):
+        """Test that we strip comments from References, In-Reply-To fields."""
+        in_reply_to = (
+            '<4574b99b-edac-d8dc-9141-79c3109d2fcc@huawei.com> (message from\n'
+            ' liqingqing on Thu, 1 Apr 2021 16:51:45 +0800)'
+        )
+        email = create_email('test', in_reply_to=in_reply_to)
+
+        expected = ['<4574b99b-edac-d8dc-9141-79c3109d2fcc@huawei.com>']
+        actual = parser.find_references(email)
+
+        self.assertEqual(expected, actual)
+
+    def test_find_references__duplicate_references(self):
+        """Test that we ignore duplicate message IDs in 'References'."""
+        message_id = '<20130510114450.7104c5d2@nehalam.linuxnetplumber.net>'
+        in_reply_to = \
+            '<525534677.5312512.1368202896189.JavaMail.root@vmware.com>'
+        references = (
+            '<AFCFCEB8EB0E24448E4EB95988BA1E531FA2EA0D@xmb-aln-x05.cisco.com>\n'  # noqa: E501
+            ' <CAE68AUOr7B5a2QvduJhH0kEHPi+sR9X3qfrtumgLxT1BK4VS+Q@mail.gmail.com>\n'  # noqa: E501
+            ' <1676591087.5291867.1368201908283.JavaMail.root@vmware.com>\n'
+            ' <20130510091549.3c064df6@nehalam.linuxnetplumber.net>\n'
+            ' <525534677.5312512.1368202896189.JavaMail.root@vmware.com>'
+        )
+        email = create_email(
+            'test',
+            msgid=message_id,
+            in_reply_to=in_reply_to,
+            references=references,
+        )
+
+        expected = [
+            '<525534677.5312512.1368202896189.JavaMail.root@vmware.com>',
+            '<20130510091549.3c064df6@nehalam.linuxnetplumber.net>',
+            '<1676591087.5291867.1368201908283.JavaMail.root@vmware.com>',
+            '<CAE68AUOr7B5a2QvduJhH0kEHPi+sR9X3qfrtumgLxT1BK4VS+Q@mail.gmail.com>',  # noqa: E501
+            '<AFCFCEB8EB0E24448E4EB95988BA1E531FA2EA0D@xmb-aln-x05.cisco.com>',
+        ]
+        actual = parser.find_references(email)
+
+        self.assertEqual(expected, actual)
+
+
 class TestCommentCorrelation(TestCase):
 
     def test_find_patch_for_comment__no_reply(self):
diff --git releasenotes/notes/issue-399-09d6f17aa54b14b2.yaml releasenotes/notes/issue-399-09d6f17aa54b14b2.yaml
new file mode 100644
index 00000000..e925a606
--- /dev/null
+++ releasenotes/notes/issue-399-09d6f17aa54b14b2.yaml
@@ -0,0 +1,11 @@ 
+---
+fixes:
+  - |
+    Comments and whitespace are now correctly stripped from the ``Message-ID``,
+    ``In-Reply-To``, and ``References`` headers. One side effect of this change
+    is that the parser is now stricter with regards to the format of the
+    ``msg-id`` component of these headers: all identifiers must now be
+    surrounded by angle brackets, e.g. ``<abcdef@example.com>``. This is
+    mandated in the spec and a review of mailing lists archives suggest it is
+    broadly adhered to. Without these markers, there is no way to delimit
+    ``msg-id`` from any surrounding comments and whitespace.