diff mbox series

[v4,2/3] parser: Use a second query to weed out duplicate series

Message ID 20191201171045.97913-2-stephen@that.guru
State Accepted
Headers show
Series [v4,1/3] models: Use database constraints to prevent split Series | expand

Commit Message

Stephen Finucane Dec. 1, 2019, 5:10 p.m. UTC
Annoyingly, not all email clients properly thread emails using the
message ID fields originally specified in RFC 822 [1]. Worse, some MTAs
(cough, outlook.com, cough) actually override what the client
configures, breaking the world in the process. Realising this is an
issue, Patchwork supports threading using arbitrary metadata in addition
to the RFC 822 metadata. Specifically, it uses a combination of
submitter and list-id extracted from the headers along with the series
version and total count metadata extracted from the subject. In addition
to this, we timebox things so that two or more series that match on all
of this metadata but which are sent some time apart from each other
aren't combined by accident. This does leave one edge case - duplicate
series received within the timebox will be combined. We've resigned
ourselves to this fact on the basis that it's extremely unlikely for all
of these things to go wrong at once.

Given all the above, there should be no reason that attempting to find
series by series markers should return more than one series. The
timeboxing will prevent us grouping similar looking series by accident
and the only other reason for this to happen is because we lost a race
and we should try again.

[1] https://tools.ietf.org/html/rfc822

Signed-off-by: Stephen Finucane <stephen@that.guru>
Cc: Daniel Axtens <dja@axtens.net>
---
v4:
- Handle additional corner cases
---
 patchwork/parser.py            | 45 ++++++++++++++++++++++++++++------
 patchwork/tests/test_parser.py |  8 +++---
 2 files changed, 41 insertions(+), 12 deletions(-)
diff mbox series

Patch

diff --git a/patchwork/parser.py b/patchwork/parser.py
index c0084cc0..45dd8db8 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -61,6 +61,11 @@  class DuplicateMailError(Exception):
         self.msgid = msgid
 
 
+class DuplicateSeriesError(Exception):
+
+    pass
+
+
 def normalise_space(value):
     whitespace_re = re.compile(r'\s+')
     return whitespace_re.sub(' ', value).strip()
@@ -256,8 +261,10 @@  def _find_series_by_references(project, mail):
         refs = [h] + refs
     for ref in refs:
         try:
-            return SeriesReference.objects.get(
+            series = SeriesReference.objects.get(
                 msgid=ref[:255], project=project).series
+            # we want to return a queryset like '_find_series_by_markers'
+            return Series.objects.filter(id=series.id)
         except SeriesReference.DoesNotExist:
             continue
 
@@ -294,12 +301,9 @@  def _find_series_by_markers(project, mail, author):
     start_date = date - delta
     end_date = date + delta
 
-    try:
-        return Series.objects.get(
-            submitter=author, project=project, version=version, total=total,
-            date__range=[start_date, end_date])
-    except (Series.DoesNotExist, Series.MultipleObjectsReturned):
-        return
+    return Series.objects.filter(
+        submitter=author, project=project, version=version, total=total,
+        date__range=[start_date, end_date])
 
 
 def find_series(project, mail, author):
@@ -1094,6 +1098,23 @@  def parse_mail(mail, list_id=None):
                     series = None
                     if n:
                         series = find_series(project, mail, author)
+                        if len(series) > 1:
+                            # if this isn't our final attempt, go again
+                            if attempt != 10:
+                                raise DuplicateSeriesError()
+
+                            # if it is and we still haven't been able to save a
+                            # series, it's time to give up and just save yet
+                            # another duplicate - find the best possible match
+                            for series in series.order_by('-date'):
+                                if Patch.objects.filter(
+                                        series=series, number=x).count():
+                                    continue
+                                break
+                            else:
+                                series = None
+                        elif len(series) == 1:
+                            series = series.first()
                     else:
                         x = n = 1
 
@@ -1130,8 +1151,16 @@  def parse_mail(mail, list_id=None):
                             except SeriesReference.DoesNotExist:
                                 SeriesReference.objects.create(
                                     msgid=ref, project=project, series=series)
+
+                        # attempt to pull the series in again, raising an
+                        # exception if we lost the race when creating a series
+                        # and force us to go through this again
+                        if attempt != 10 and find_series(
+                                project, mail, author).count() > 1:
+                            raise DuplicateSeriesError()
+
                     break
-            except IntegrityError:
+            except (IntegrityError, DuplicateSeriesError):
                 # we lost the race so go again
                 logger.warning('Conflict while saving series. This is '
                                'probably because multiple patches belonging '
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index 0edbd87a..4a85fd42 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -400,8 +400,8 @@  class SeriesCorrelationTest(TestCase):
         email = self._create_email(msgid)
         project = create_project()
 
-        self.assertIsNone(find_series(project, email,
-                                      get_or_create_author(email)))
+        self.assertFalse(find_series(project, email,
+                                     get_or_create_author(email)))
 
     def test_first_reply(self):
         msgid_a = make_msgid()
@@ -413,7 +413,7 @@  class SeriesCorrelationTest(TestCase):
 
         series = find_series(ref.series.project, email,
                              get_or_create_author(email))
-        self.assertEqual(series, ref.series)
+        self.assertEqual(series.first(), ref.series)
 
     def test_nested_series(self):
         """Handle a series sent in-reply-to an existing series."""
@@ -443,7 +443,7 @@  class SeriesCorrelationTest(TestCase):
 
         # this should link to the second series - not the first
         self.assertEqual(len(msgids), 4 + 1)  # old series + new cover
-        self.assertEqual(series, ref_v2.series)
+        self.assertEqual(series.first(), ref_v2.series)
 
 
 class SubjectEncodingTest(TestCase):