Message ID | 20180221141716.10908-7-dja@axtens.net |
---|---|
State | Changes Requested |
Headers | show |
Series | Tools and fixes for parallel parsing | expand |
Context | Check | Description |
---|---|---|
dja/snowpatch-0_1_0 | success | master/apply_patch Successfully applied |
dja/snowpatch-snowpatch_job_snowpatch-patchwork | success | Test snowpatch/job/snowpatch-patchwork on branch master |
On 22/02/18 01:17, Daniel Axtens wrote: > find_author looks up a person by email, and if they do not exist, > creates a Person model, which may be saved later if the message > contains something valuable. > > Multiple simultaneous processes can race here: both can do the SELECT, > find there is no Person, and create the model. One will succeed in > saving, the other will get an IntegrityError. > > Reduce the window by making find_author into get_or_create_author, and > plumb that through. (Remove a test that specifically required find_author > to *not* create). > > More importantly, cover the case where we lose the race, by catching the > IntegrityError and fetching the winning Person model. > > Signed-off-by: Daniel Axtens <dja@axtens.net> LGTM apart from one comment below Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> > --- > patchwork/parser.py | 32 ++++++++++++++++++-------------- > patchwork/tests/test_parser.py | 37 +++++++++++++++---------------------- > 2 files changed, 33 insertions(+), 36 deletions(-) > > diff --git a/patchwork/parser.py b/patchwork/parser.py > index cbf88fe4e464..1f3b3dd8901f 100644 > --- a/patchwork/parser.py > +++ b/patchwork/parser.py > @@ -29,6 +29,7 @@ import logging > import re > > from django.contrib.auth.models import User > +from django.db import IntegrityError > from django.utils import six > > from patchwork.models import Comment > @@ -241,7 +242,7 @@ def _find_series_by_references(project, mail): > continue > > > -def _find_series_by_markers(project, mail): > +def _find_series_by_markers(project, mail, author): > """Find a patch's series using series markers and sender. > > Identify suitable series for a patch using a combination of the > @@ -262,7 +263,6 @@ def _find_series_by_markers(project, mail): > still won't help us if someone spams the mailing list with > duplicate series but that's a tricky situation for anyone to parse. > """ > - author = find_author(mail) > > subject = mail.get('Subject') > name, prefixes = clean_subject(subject, [project.linkname]) > @@ -282,7 +282,7 @@ def _find_series_by_markers(project, mail): > return > > > -def find_series(project, mail): > +def find_series(project, mail, author): > """Find a series, if any, for a given patch. > > Args: > @@ -297,10 +297,10 @@ def find_series(project, mail): > if series: > return series > > - return _find_series_by_markers(project, mail) > + return _find_series_by_markers(project, mail, author) > > > -def find_author(mail): > +def get_or_create_author(mail): > from_header = clean_header(mail.get('From')) > > if not from_header: > @@ -342,11 +342,16 @@ def find_author(mail): > name = name.strip()[:255] > > try: > + person = Person.objects.get_or_create(email__iexact=email, > + defaults={'name': name, > + 'email': email})[0] > + except IntegrityError: > + # we lost the race to create the person > person = Person.objects.get(email__iexact=email) get_or_create() should handle the IntegrityError in there for you. > - if name: # use the latest provided name > - person.name = name > - except Person.DoesNotExist: > - person = Person(name=name, email=email) > + > + if name: # use the latest provided name > + person.name = name > + person.save() > > return person > > @@ -947,7 +952,6 @@ def parse_mail(mail, list_id=None): > raise ValueError("Broken 'Message-Id' header") > msgid = msgid[:255] > > - author = find_author(mail) > subject = mail.get('Subject') > name, prefixes = clean_subject(subject, [project.linkname]) > is_comment = subject_check(subject) > @@ -973,7 +977,7 @@ def parse_mail(mail, list_id=None): > > if not is_comment and (diff or pull_url): # patches or pull requests > # we delay the saving until we know we have a patch. > - author.save() > + author = get_or_create_author(mail) > > delegate = find_delegate_by_header(mail) > if not delegate and diff: > @@ -984,7 +988,7 @@ def parse_mail(mail, list_id=None): > # series to match against. > series = None > if n: > - series = find_series(project, mail) > + series = find_series(project, mail, author) > else: > x = n = 1 > > @@ -1061,7 +1065,7 @@ def parse_mail(mail, list_id=None): > is_cover_letter = True > > if is_cover_letter: > - author.save() > + author = get_or_create_author(mail) > > # we don't use 'find_series' here as a cover letter will > # always be the first item in a thread, thus the references > @@ -1109,7 +1113,7 @@ def parse_mail(mail, list_id=None): > if not submission: > return > > - author.save() > + author = get_or_create_author(mail) > > comment = Comment( > submission=submission, > diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py > index 68bcb937b273..6cfe7a484443 100644 > --- a/patchwork/tests/test_parser.py > +++ b/patchwork/tests/test_parser.py > @@ -34,7 +34,7 @@ from patchwork.models import Patch > from patchwork.models import Person > from patchwork.models import State > from patchwork.parser import clean_subject > -from patchwork.parser import find_author > +from patchwork.parser import get_or_create_author > from patchwork.parser import find_patch_content as find_content > from patchwork.parser import find_project_by_header > from patchwork.parser import find_series > @@ -225,7 +225,7 @@ class SenderEncodingTest(TestCase): > > def _test_encoding(self, from_header, sender_name, sender_email): > email = self._create_email(from_header) > - person = find_author(email) > + person = get_or_create_author(email) > person.save() > > # ensure it was parsed correctly > @@ -241,7 +241,7 @@ class SenderEncodingTest(TestCase): > def test_empty(self): > email = self._create_email('') > with self.assertRaises(ValueError): > - find_author(email) > + get_or_create_author(email) > > def test_ascii_encoding(self): > from_header = 'example user <user@example.com>' > @@ -269,7 +269,7 @@ class SenderEncodingTest(TestCase): > > > class SenderCorrelationTest(TestCase): > - """Validate correct behavior of the find_author case. > + """Validate correct behavior of the get_or_create_author case. > > Relies of checking the internal state of a Django model object. > > @@ -284,25 +284,16 @@ class SenderCorrelationTest(TestCase): > 'test\n' > return message_from_string(mail) > > - def test_non_existing_sender(self): > - sender = 'Non-existing Sender <nonexisting@example.com>' > - mail = self._create_email(sender) > - > - # don't create the person - attempt to find immediately > - person = find_author(mail) > - self.assertEqual(person._state.adding, True) > - self.assertEqual(person.id, None) > - > def test_existing_sender(self): > sender = 'Existing Sender <existing@example.com>' > mail = self._create_email(sender) > > # create the person first > - person_a = find_author(mail) > + person_a = get_or_create_author(mail) > person_a.save() > > # then attempt to parse email with the same 'From' line > - person_b = find_author(mail) > + person_b = get_or_create_author(mail) > self.assertEqual(person_b._state.adding, False) > self.assertEqual(person_b.id, person_a.id) > > @@ -311,12 +302,12 @@ class SenderCorrelationTest(TestCase): > mail = self._create_email(sender) > > # create the person first > - person_a = find_author(mail) > + person_a = get_or_create_author(mail) > person_a.save() > > # then attempt to parse email with a new 'From' line > mail = self._create_email('existing@example.com') > - person_b = find_author(mail) > + person_b = get_or_create_author(mail) > self.assertEqual(person_b._state.adding, False) > self.assertEqual(person_b.id, person_a.id) > > @@ -324,11 +315,11 @@ class SenderCorrelationTest(TestCase): > sender = 'Existing Sender <existing@example.com>' > mail = self._create_email(sender) > > - person_a = find_author(mail) > + person_a = get_or_create_author(mail) > person_a.save() > > mail = self._create_email(sender.upper()) > - person_b = find_author(mail) > + person_b = get_or_create_author(mail) > self.assertEqual(person_b._state.adding, False) > self.assertEqual(person_b.id, person_a.id) > > @@ -361,7 +352,8 @@ class SeriesCorrelationTest(TestCase): > email = self._create_email(msgid) > project = create_project() > > - self.assertIsNone(find_series(project, email)) > + self.assertIsNone(find_series(project, email, > + get_or_create_author(email))) > > def test_first_reply(self): > msgid_a = make_msgid() > @@ -371,7 +363,8 @@ class SeriesCorrelationTest(TestCase): > # assume msgid_a was already handled > ref = create_series_reference(msgid=msgid_a) > > - series = find_series(ref.series.project, email) > + series = find_series(ref.series.project, email, > + get_or_create_author(email)) > self.assertEqual(series, ref.series) > > def test_nested_series(self): > @@ -395,7 +388,7 @@ class SeriesCorrelationTest(TestCase): > # ...and the "first patch" of this new series > msgid = make_msgid() > email = self._create_email(msgid, msgids) > - series = find_series(project, email) > + series = find_series(project, email, get_or_create_author(email)) > > # this should link to the second series - not the first > self.assertEqual(len(msgids), 4 + 1) # old series + new cover >
Hi Andrew, >> try: >> + person = Person.objects.get_or_create(email__iexact=email, >> + defaults={'name': name, >> + 'email': email})[0] >> + except IntegrityError: >> + # we lost the race to create the person >> person = Person.objects.get(email__iexact=email) > > get_or_create() should handle the IntegrityError in there for you. Upon inspection, yes it does, although *not* - annoyingly - in get_or_create itself, but in _create_object_from_params. Indeed, if you just read g_o_c, you can easily convince yourself it's not parallel-safe, even though it is. *sigh* Anyway, you are right and I have fixed this up for v2. Regards, Daniel > >> - if name: # use the latest provided name >> - person.name = name >> - except Person.DoesNotExist: >> - person = Person(name=name, email=email) >> + >> + if name: # use the latest provided name >> + person.name = name >> + person.save() >> >> return person >> >> @@ -947,7 +952,6 @@ def parse_mail(mail, list_id=None): >> raise ValueError("Broken 'Message-Id' header") >> msgid = msgid[:255] >> >> - author = find_author(mail) >> subject = mail.get('Subject') >> name, prefixes = clean_subject(subject, [project.linkname]) >> is_comment = subject_check(subject) >> @@ -973,7 +977,7 @@ def parse_mail(mail, list_id=None): >> >> if not is_comment and (diff or pull_url): # patches or pull requests >> # we delay the saving until we know we have a patch. >> - author.save() >> + author = get_or_create_author(mail) >> >> delegate = find_delegate_by_header(mail) >> if not delegate and diff: >> @@ -984,7 +988,7 @@ def parse_mail(mail, list_id=None): >> # series to match against. >> series = None >> if n: >> - series = find_series(project, mail) >> + series = find_series(project, mail, author) >> else: >> x = n = 1 >> >> @@ -1061,7 +1065,7 @@ def parse_mail(mail, list_id=None): >> is_cover_letter = True >> >> if is_cover_letter: >> - author.save() >> + author = get_or_create_author(mail) >> >> # we don't use 'find_series' here as a cover letter will >> # always be the first item in a thread, thus the references >> @@ -1109,7 +1113,7 @@ def parse_mail(mail, list_id=None): >> if not submission: >> return >> >> - author.save() >> + author = get_or_create_author(mail) >> >> comment = Comment( >> submission=submission, >> diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py >> index 68bcb937b273..6cfe7a484443 100644 >> --- a/patchwork/tests/test_parser.py >> +++ b/patchwork/tests/test_parser.py >> @@ -34,7 +34,7 @@ from patchwork.models import Patch >> from patchwork.models import Person >> from patchwork.models import State >> from patchwork.parser import clean_subject >> -from patchwork.parser import find_author >> +from patchwork.parser import get_or_create_author >> from patchwork.parser import find_patch_content as find_content >> from patchwork.parser import find_project_by_header >> from patchwork.parser import find_series >> @@ -225,7 +225,7 @@ class SenderEncodingTest(TestCase): >> >> def _test_encoding(self, from_header, sender_name, sender_email): >> email = self._create_email(from_header) >> - person = find_author(email) >> + person = get_or_create_author(email) >> person.save() >> >> # ensure it was parsed correctly >> @@ -241,7 +241,7 @@ class SenderEncodingTest(TestCase): >> def test_empty(self): >> email = self._create_email('') >> with self.assertRaises(ValueError): >> - find_author(email) >> + get_or_create_author(email) >> >> def test_ascii_encoding(self): >> from_header = 'example user <user@example.com>' >> @@ -269,7 +269,7 @@ class SenderEncodingTest(TestCase): >> >> >> class SenderCorrelationTest(TestCase): >> - """Validate correct behavior of the find_author case. >> + """Validate correct behavior of the get_or_create_author case. >> >> Relies of checking the internal state of a Django model object. >> >> @@ -284,25 +284,16 @@ class SenderCorrelationTest(TestCase): >> 'test\n' >> return message_from_string(mail) >> >> - def test_non_existing_sender(self): >> - sender = 'Non-existing Sender <nonexisting@example.com>' >> - mail = self._create_email(sender) >> - >> - # don't create the person - attempt to find immediately >> - person = find_author(mail) >> - self.assertEqual(person._state.adding, True) >> - self.assertEqual(person.id, None) >> - >> def test_existing_sender(self): >> sender = 'Existing Sender <existing@example.com>' >> mail = self._create_email(sender) >> >> # create the person first >> - person_a = find_author(mail) >> + person_a = get_or_create_author(mail) >> person_a.save() >> >> # then attempt to parse email with the same 'From' line >> - person_b = find_author(mail) >> + person_b = get_or_create_author(mail) >> self.assertEqual(person_b._state.adding, False) >> self.assertEqual(person_b.id, person_a.id) >> >> @@ -311,12 +302,12 @@ class SenderCorrelationTest(TestCase): >> mail = self._create_email(sender) >> >> # create the person first >> - person_a = find_author(mail) >> + person_a = get_or_create_author(mail) >> person_a.save() >> >> # then attempt to parse email with a new 'From' line >> mail = self._create_email('existing@example.com') >> - person_b = find_author(mail) >> + person_b = get_or_create_author(mail) >> self.assertEqual(person_b._state.adding, False) >> self.assertEqual(person_b.id, person_a.id) >> >> @@ -324,11 +315,11 @@ class SenderCorrelationTest(TestCase): >> sender = 'Existing Sender <existing@example.com>' >> mail = self._create_email(sender) >> >> - person_a = find_author(mail) >> + person_a = get_or_create_author(mail) >> person_a.save() >> >> mail = self._create_email(sender.upper()) >> - person_b = find_author(mail) >> + person_b = get_or_create_author(mail) >> self.assertEqual(person_b._state.adding, False) >> self.assertEqual(person_b.id, person_a.id) >> >> @@ -361,7 +352,8 @@ class SeriesCorrelationTest(TestCase): >> email = self._create_email(msgid) >> project = create_project() >> >> - self.assertIsNone(find_series(project, email)) >> + self.assertIsNone(find_series(project, email, >> + get_or_create_author(email))) >> >> def test_first_reply(self): >> msgid_a = make_msgid() >> @@ -371,7 +363,8 @@ class SeriesCorrelationTest(TestCase): >> # assume msgid_a was already handled >> ref = create_series_reference(msgid=msgid_a) >> >> - series = find_series(ref.series.project, email) >> + series = find_series(ref.series.project, email, >> + get_or_create_author(email)) >> self.assertEqual(series, ref.series) >> >> def test_nested_series(self): >> @@ -395,7 +388,7 @@ class SeriesCorrelationTest(TestCase): >> # ...and the "first patch" of this new series >> msgid = make_msgid() >> email = self._create_email(msgid, msgids) >> - series = find_series(project, email) >> + series = find_series(project, email, get_or_create_author(email)) >> >> # this should link to the second series - not the first >> self.assertEqual(len(msgids), 4 + 1) # old series + new cover >> > > -- > Andrew Donnellan OzLabs, ADL Canberra > andrew.donnellan@au1.ibm.com IBM Australia Limited
diff --git a/patchwork/parser.py b/patchwork/parser.py index cbf88fe4e464..1f3b3dd8901f 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -29,6 +29,7 @@ import logging import re from django.contrib.auth.models import User +from django.db import IntegrityError from django.utils import six from patchwork.models import Comment @@ -241,7 +242,7 @@ def _find_series_by_references(project, mail): continue -def _find_series_by_markers(project, mail): +def _find_series_by_markers(project, mail, author): """Find a patch's series using series markers and sender. Identify suitable series for a patch using a combination of the @@ -262,7 +263,6 @@ def _find_series_by_markers(project, mail): still won't help us if someone spams the mailing list with duplicate series but that's a tricky situation for anyone to parse. """ - author = find_author(mail) subject = mail.get('Subject') name, prefixes = clean_subject(subject, [project.linkname]) @@ -282,7 +282,7 @@ def _find_series_by_markers(project, mail): return -def find_series(project, mail): +def find_series(project, mail, author): """Find a series, if any, for a given patch. Args: @@ -297,10 +297,10 @@ def find_series(project, mail): if series: return series - return _find_series_by_markers(project, mail) + return _find_series_by_markers(project, mail, author) -def find_author(mail): +def get_or_create_author(mail): from_header = clean_header(mail.get('From')) if not from_header: @@ -342,11 +342,16 @@ def find_author(mail): name = name.strip()[:255] try: + person = Person.objects.get_or_create(email__iexact=email, + defaults={'name': name, + 'email': email})[0] + except IntegrityError: + # we lost the race to create the person person = Person.objects.get(email__iexact=email) - if name: # use the latest provided name - person.name = name - except Person.DoesNotExist: - person = Person(name=name, email=email) + + if name: # use the latest provided name + person.name = name + person.save() return person @@ -947,7 +952,6 @@ def parse_mail(mail, list_id=None): raise ValueError("Broken 'Message-Id' header") msgid = msgid[:255] - author = find_author(mail) subject = mail.get('Subject') name, prefixes = clean_subject(subject, [project.linkname]) is_comment = subject_check(subject) @@ -973,7 +977,7 @@ def parse_mail(mail, list_id=None): if not is_comment and (diff or pull_url): # patches or pull requests # we delay the saving until we know we have a patch. - author.save() + author = get_or_create_author(mail) delegate = find_delegate_by_header(mail) if not delegate and diff: @@ -984,7 +988,7 @@ def parse_mail(mail, list_id=None): # series to match against. series = None if n: - series = find_series(project, mail) + series = find_series(project, mail, author) else: x = n = 1 @@ -1061,7 +1065,7 @@ def parse_mail(mail, list_id=None): is_cover_letter = True if is_cover_letter: - author.save() + author = get_or_create_author(mail) # we don't use 'find_series' here as a cover letter will # always be the first item in a thread, thus the references @@ -1109,7 +1113,7 @@ def parse_mail(mail, list_id=None): if not submission: return - author.save() + author = get_or_create_author(mail) comment = Comment( submission=submission, diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py index 68bcb937b273..6cfe7a484443 100644 --- a/patchwork/tests/test_parser.py +++ b/patchwork/tests/test_parser.py @@ -34,7 +34,7 @@ from patchwork.models import Patch from patchwork.models import Person from patchwork.models import State from patchwork.parser import clean_subject -from patchwork.parser import find_author +from patchwork.parser import get_or_create_author from patchwork.parser import find_patch_content as find_content from patchwork.parser import find_project_by_header from patchwork.parser import find_series @@ -225,7 +225,7 @@ class SenderEncodingTest(TestCase): def _test_encoding(self, from_header, sender_name, sender_email): email = self._create_email(from_header) - person = find_author(email) + person = get_or_create_author(email) person.save() # ensure it was parsed correctly @@ -241,7 +241,7 @@ class SenderEncodingTest(TestCase): def test_empty(self): email = self._create_email('') with self.assertRaises(ValueError): - find_author(email) + get_or_create_author(email) def test_ascii_encoding(self): from_header = 'example user <user@example.com>' @@ -269,7 +269,7 @@ class SenderEncodingTest(TestCase): class SenderCorrelationTest(TestCase): - """Validate correct behavior of the find_author case. + """Validate correct behavior of the get_or_create_author case. Relies of checking the internal state of a Django model object. @@ -284,25 +284,16 @@ class SenderCorrelationTest(TestCase): 'test\n' return message_from_string(mail) - def test_non_existing_sender(self): - sender = 'Non-existing Sender <nonexisting@example.com>' - mail = self._create_email(sender) - - # don't create the person - attempt to find immediately - person = find_author(mail) - self.assertEqual(person._state.adding, True) - self.assertEqual(person.id, None) - def test_existing_sender(self): sender = 'Existing Sender <existing@example.com>' mail = self._create_email(sender) # create the person first - person_a = find_author(mail) + person_a = get_or_create_author(mail) person_a.save() # then attempt to parse email with the same 'From' line - person_b = find_author(mail) + person_b = get_or_create_author(mail) self.assertEqual(person_b._state.adding, False) self.assertEqual(person_b.id, person_a.id) @@ -311,12 +302,12 @@ class SenderCorrelationTest(TestCase): mail = self._create_email(sender) # create the person first - person_a = find_author(mail) + person_a = get_or_create_author(mail) person_a.save() # then attempt to parse email with a new 'From' line mail = self._create_email('existing@example.com') - person_b = find_author(mail) + person_b = get_or_create_author(mail) self.assertEqual(person_b._state.adding, False) self.assertEqual(person_b.id, person_a.id) @@ -324,11 +315,11 @@ class SenderCorrelationTest(TestCase): sender = 'Existing Sender <existing@example.com>' mail = self._create_email(sender) - person_a = find_author(mail) + person_a = get_or_create_author(mail) person_a.save() mail = self._create_email(sender.upper()) - person_b = find_author(mail) + person_b = get_or_create_author(mail) self.assertEqual(person_b._state.adding, False) self.assertEqual(person_b.id, person_a.id) @@ -361,7 +352,8 @@ class SeriesCorrelationTest(TestCase): email = self._create_email(msgid) project = create_project() - self.assertIsNone(find_series(project, email)) + self.assertIsNone(find_series(project, email, + get_or_create_author(email))) def test_first_reply(self): msgid_a = make_msgid() @@ -371,7 +363,8 @@ class SeriesCorrelationTest(TestCase): # assume msgid_a was already handled ref = create_series_reference(msgid=msgid_a) - series = find_series(ref.series.project, email) + series = find_series(ref.series.project, email, + get_or_create_author(email)) self.assertEqual(series, ref.series) def test_nested_series(self): @@ -395,7 +388,7 @@ class SeriesCorrelationTest(TestCase): # ...and the "first patch" of this new series msgid = make_msgid() email = self._create_email(msgid, msgids) - series = find_series(project, email) + series = find_series(project, email, get_or_create_author(email)) # this should link to the second series - not the first self.assertEqual(len(msgids), 4 + 1) # old series + new cover
find_author looks up a person by email, and if they do not exist, creates a Person model, which may be saved later if the message contains something valuable. Multiple simultaneous processes can race here: both can do the SELECT, find there is no Person, and create the model. One will succeed in saving, the other will get an IntegrityError. Reduce the window by making find_author into get_or_create_author, and plumb that through. (Remove a test that specifically required find_author to *not* create). More importantly, cover the case where we lose the race, by catching the IntegrityError and fetching the winning Person model. Signed-off-by: Daniel Axtens <dja@axtens.net> --- patchwork/parser.py | 32 ++++++++++++++++++-------------- patchwork/tests/test_parser.py | 37 +++++++++++++++---------------------- 2 files changed, 33 insertions(+), 36 deletions(-)