diff mbox series

[2/n] parser: Ignore CFWS in Message-ID header

Message ID 20220506174925.731698-1-stephen@that.guru
State Accepted
Headers show
Series [2/n] parser: Ignore CFWS in Message-ID header | expand

Commit Message

Stephen Finucane May 6, 2022, 5:49 p.m. UTC
We recently started stripping comments and folding white space from the
In-Reply-To and References headers. Do so also for the Message-ID
header.

Signed-off-by: Stephen Finucane <stephen@that.guru>
Related: #399
---
 patchwork/parser.py            | 43 ++++++++++++++++++++++++++--------
 patchwork/tests/test_parser.py | 32 +++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 10 deletions(-)

Comments

Stephen Finucane May 6, 2022, 5:53 p.m. UTC | #1
On Fri, 2022-05-06 at 18:49 +0100, Stephen Finucane wrote:
> We recently started stripping comments and folding white space from the
> In-Reply-To and References headers. Do so also for the Message-ID
> header.
> 
> Signed-off-by: Stephen Finucane <stephen@that.guru>
> Related: #399
> ---
>  patchwork/parser.py            | 43 ++++++++++++++++++++++++++--------
>  patchwork/tests/test_parser.py | 32 +++++++++++++++++++++++++
>  2 files changed, 65 insertions(+), 10 deletions(-)
> 
> diff --git patchwork/parser.py patchwork/parser.py
> index 17cc2325..f219f466 100644
> --- patchwork/parser.py
> +++ patchwork/parser.py
> @@ -236,15 +236,14 @@ def _find_series_by_references(project, mail):
>      name, prefixes = clean_subject(subject, [project.linkname])
>      version = parse_version(name, prefixes)
>  
> -    refs = find_references(mail)
> -    h = clean_header(mail.get('Message-Id'))
> -    if h:
> -        refs = [h] + refs
> +    msg_id = find_message_id(mail)
> +    refs = [msg_id] + find_references(mail)
>  
>      for ref in refs:
>          try:
>              series = SeriesReference.objects.get(
> -                msgid=ref[:255], project=project).series
> +                msgid=ref[:255], project=project,
> +            ).series
>  
>              if series.version != version:
>                  # if the versions don't match, at least make sure these were
> @@ -473,6 +472,34 @@ def find_headers(mail):
>      return '\n'.join(strings)
>  
>  
> +def find_message_id(mail):
> +    """Extract the 'message-id' headers from a given mail and validate it.
> +
> +    The validation here is simply checking that the Message-ID is correctly
> +    formatted per RFC-2822. However, even if it's not we'll attempt to use what
> +    we're given because a patch tracked in Patchwork with janky threading is
> +    better than no patch whatsoever.
> +    """
> +    header = clean_header(mail.get('Message-Id'))
> +    if not header:
> +        raise ValueError("Broken 'Message-Id' header")
> +
> +    msgid = _msgid_re.search(header)
> +    if msgid:
> +        msgid = msgid.group(0)
> +    else:
> +        # This is only info level since the admin likely can't do anything
> +        # about this
> +        logger.info(
> +            "Malformed 'Message-Id' header. The 'msg-id' component should be "
> +            "surrounded by angle brackets. Saving raw header. This may "
> +            "include comments and extra comments."
> +        )

I wonder if I should do this also for the 'In-Reply-To' field? I can't do it
easily for the 'References' field since there's zero way to delineate things if
I do, but that shouldn't matter since as long as we don't drop patches and
things appear roughly in order, we don't really need the 'References' field:
'In-Reply-To' is enough to find *a* parent.

Stephen

> +        msgid = header
> +
> +    return msgid[:255]
> +
> +
>  def find_references(mail):
>      """Construct a list of possible reply message ids.
>  
> @@ -1062,11 +1089,7 @@ def parse_mail(mail, list_id=None):
>  
>      # parse metadata
>  
> -    msgid = clean_header(mail.get('Message-Id'))
> -    if not msgid:
> -        raise ValueError("Broken 'Message-Id' header")
> -    msgid = msgid[:255]
> -
> +    msgid = find_message_id(mail)
>      subject = mail.get('Subject')
>      name, prefixes = clean_subject(subject, [project.linkname])
>      is_comment = subject_check(subject)
> diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py
> index f65ad4b1..980a8afb 100644
> --- patchwork/tests/test_parser.py
> +++ patchwork/tests/test_parser.py
> @@ -1265,6 +1265,38 @@ class DuplicateMailTest(TestCase):
>          self.assertEqual(Cover.objects.count(), 1)
>  
>  
> +class TestFindMessageID(TestCase):
> +
> +    def test_find_message_id__missing_header(self):
> +        email = create_email('test')
> +        del email['Message-Id']
> +        email['Message-Id'] = ''
> +
> +        with self.assertRaises(ValueError) as cm:
> +            parser.find_message_id(email)
> +            self.assertIn("Broken 'Message-Id' header", str(cm.exeception))
> +
> +    def test_find_message_id__header_with_comments(self):
> +        """Test that we strip comments from the Message-ID field."""
> +        message_id = '<xnzgy1de8d.fsf@rhel8.vm> (message ID with a comment)'
> +        email = create_email('test', msgid=message_id)
> +
> +        expected = '<xnzgy1de8d.fsf@rhel8.vm>'
> +        actual = parser.find_message_id(email)
> +
> +        self.assertEqual(expected, actual)
> +
> +    def test_find_message_id__invalid_header_fallback(self):
> +        """Test that we accept badly formatted Message-ID fields."""
> +        message_id = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>'
> +        email = create_email('test', msgid=message_id)
> +
> +        expected = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>'
> +        actual = parser.find_message_id(email)
> +
> +        self.assertEqual(expected, actual)
> +
> +
>  class TestFindReferences(TestCase):
>  
>      def test_find_references__header_with_comments(self):
diff mbox series

Patch

diff --git patchwork/parser.py patchwork/parser.py
index 17cc2325..f219f466 100644
--- patchwork/parser.py
+++ patchwork/parser.py
@@ -236,15 +236,14 @@  def _find_series_by_references(project, mail):
     name, prefixes = clean_subject(subject, [project.linkname])
     version = parse_version(name, prefixes)
 
-    refs = find_references(mail)
-    h = clean_header(mail.get('Message-Id'))
-    if h:
-        refs = [h] + refs
+    msg_id = find_message_id(mail)
+    refs = [msg_id] + find_references(mail)
 
     for ref in refs:
         try:
             series = SeriesReference.objects.get(
-                msgid=ref[:255], project=project).series
+                msgid=ref[:255], project=project,
+            ).series
 
             if series.version != version:
                 # if the versions don't match, at least make sure these were
@@ -473,6 +472,34 @@  def find_headers(mail):
     return '\n'.join(strings)
 
 
+def find_message_id(mail):
+    """Extract the 'message-id' headers from a given mail and validate it.
+
+    The validation here is simply checking that the Message-ID is correctly
+    formatted per RFC-2822. However, even if it's not we'll attempt to use what
+    we're given because a patch tracked in Patchwork with janky threading is
+    better than no patch whatsoever.
+    """
+    header = clean_header(mail.get('Message-Id'))
+    if not header:
+        raise ValueError("Broken 'Message-Id' header")
+
+    msgid = _msgid_re.search(header)
+    if msgid:
+        msgid = msgid.group(0)
+    else:
+        # This is only info level since the admin likely can't do anything
+        # about this
+        logger.info(
+            "Malformed 'Message-Id' header. The 'msg-id' component should be "
+            "surrounded by angle brackets. Saving raw header. This may "
+            "include comments and extra comments."
+        )
+        msgid = header
+
+    return msgid[:255]
+
+
 def find_references(mail):
     """Construct a list of possible reply message ids.
 
@@ -1062,11 +1089,7 @@  def parse_mail(mail, list_id=None):
 
     # parse metadata
 
-    msgid = clean_header(mail.get('Message-Id'))
-    if not msgid:
-        raise ValueError("Broken 'Message-Id' header")
-    msgid = msgid[:255]
-
+    msgid = find_message_id(mail)
     subject = mail.get('Subject')
     name, prefixes = clean_subject(subject, [project.linkname])
     is_comment = subject_check(subject)
diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py
index f65ad4b1..980a8afb 100644
--- patchwork/tests/test_parser.py
+++ patchwork/tests/test_parser.py
@@ -1265,6 +1265,38 @@  class DuplicateMailTest(TestCase):
         self.assertEqual(Cover.objects.count(), 1)
 
 
+class TestFindMessageID(TestCase):
+
+    def test_find_message_id__missing_header(self):
+        email = create_email('test')
+        del email['Message-Id']
+        email['Message-Id'] = ''
+
+        with self.assertRaises(ValueError) as cm:
+            parser.find_message_id(email)
+            self.assertIn("Broken 'Message-Id' header", str(cm.exeception))
+
+    def test_find_message_id__header_with_comments(self):
+        """Test that we strip comments from the Message-ID field."""
+        message_id = '<xnzgy1de8d.fsf@rhel8.vm> (message ID with a comment)'
+        email = create_email('test', msgid=message_id)
+
+        expected = '<xnzgy1de8d.fsf@rhel8.vm>'
+        actual = parser.find_message_id(email)
+
+        self.assertEqual(expected, actual)
+
+    def test_find_message_id__invalid_header_fallback(self):
+        """Test that we accept badly formatted Message-ID fields."""
+        message_id = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>'
+        email = create_email('test', msgid=message_id)
+
+        expected = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>'
+        actual = parser.find_message_id(email)
+
+        self.assertEqual(expected, actual)
+
+
 class TestFindReferences(TestCase):
 
     def test_find_references__header_with_comments(self):