diff mbox series

[RESEND,1/2] Rework tagging infrastructure

Message ID 20180709161022.32622-2-vkabatov@redhat.com
State Superseded
Headers show
Series Rework tagging infrastructure | expand

Commit Message

Veronika Kabatova July 9, 2018, 4:10 p.m. UTC
From: Veronika Kabatova <vkabatov@redhat.com>

Solve #113 and #57 GitHub issues, fix up returning tags in the API,
keep track of tag origin to be able to add tags to comments in the API.

Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
tags for each patch in series, and use `series` attribute of
SubmissionTag as a notion of a tag which is related to each patch in the
series (because it comes from cover letter or it's comments)

Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>
---
 patchwork/api/comment.py        |  18 ++++-
 patchwork/api/cover.py          |  19 ++++-
 patchwork/api/filters.py        |  68 +++++++++++++++-
 patchwork/api/patch.py          |  18 ++++-
 patchwork/models.py             | 173 ++++++++++++++++------------------------
 patchwork/templatetags/patch.py |   3 +-
 patchwork/views/__init__.py     |   3 -
 patchwork/views/utils.py        |  10 ++-
 8 files changed, 191 insertions(+), 121 deletions(-)

Comments

Stephen Finucane Sept. 11, 2018, 5:55 p.m. UTC | #1
On Mon, 2018-07-09 at 18:10 +0200, vkabatov@redhat.com wrote:
> From: Veronika Kabatova <vkabatov@redhat.com>
> 
> Solve #113 and #57 GitHub issues, fix up returning tags in the API,
> keep track of tag origin to be able to add tags to comments in the API.
> 
> Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
> tags for each patch in series, and use `series` attribute of
> SubmissionTag as a notion of a tag which is related to each patch in the
> series (because it comes from cover letter or it's comments)
> 
> Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>

To start, apologies if I'm repeating myself on any of the comments
below: it's been a long time since I first reviewed this (sorry!).
Also, for the next revision of this, this can definitely be broken up
on purpose :) Given that tags aren't currently shown in the API, I'd
move the API changes into a separate patch. Similarly, docs (though not
release notes) aren't needed and can be broken up and moved into a
separate patch. Now to the review...

> ---
>  patchwork/api/comment.py        |  18 ++++-
>  patchwork/api/cover.py          |  19 ++++-
>  patchwork/api/filters.py        |  68 +++++++++++++++-
>  patchwork/api/patch.py          |  18 ++++-
>  patchwork/models.py             | 173 ++++++++++++++++------------------------
>  patchwork/templatetags/patch.py |   3 +-
>  patchwork/views/__init__.py     |   3 -
>  patchwork/views/utils.py        |  10 ++-
>  8 files changed, 191 insertions(+), 121 deletions(-)
> 
> diff --git a/patchwork/api/comment.py b/patchwork/api/comment.py
> index 5a5adb1..e03207e 100644
> --- a/patchwork/api/comment.py
> +++ b/patchwork/api/comment.py
> @@ -26,6 +26,7 @@ from patchwork.api.base import BaseHyperlinkedModelSerializer
>  from patchwork.api.base import PatchworkPermission
>  from patchwork.api.embedded import PersonSerializer
>  from patchwork.models import Comment
> +from patchwork.models import SubmissionTag
>  
>  
>  class CommentListSerializer(BaseHyperlinkedModelSerializer):
> @@ -34,6 +35,7 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
>      subject = SerializerMethodField()
>      headers = SerializerMethodField()
>      submitter = PersonSerializer(read_only=True)
> +    tags = SerializerMethodField()
>  
>      def get_web_url(self, instance):
>          request = self.context.get('request')
> @@ -43,6 +45,20 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
>          return email.parser.Parser().parsestr(comment.headers,
>                                                True).get('Subject', '')
>  
> +    def get_tags(self, instance):
> +        if not instance.submission.project.use_tags:
> +            return {}
> +
> +        tags = SubmissionTag.objects.filter(
> +            comment=instance
> +        ).values_list('tag__name', 'value')
> +
> +        result = {}
> +        for name, value in tags:
> +            result.setdefault(name, []).append(value)
> +
> +        return result

This smells of something that should be placed in models.py. Is there
any reason we _can't_ do it there, e.g. via a property?

> +
>      def get_headers(self, comment):
>          headers = {}
>  
> @@ -60,7 +76,7 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
>      class Meta:
>          model = Comment
>          fields = ('id', 'web_url', 'msgid', 'date', 'subject', 'submitter',
> -                  'content', 'headers')
> +                  'content', 'headers', 'tags')
>          read_only_fields = fields
>          versioned_fields = {
>              '1.1': ('web_url', ),

You definitely need changes to 'get_queryset' here. I can help you work
on these but tl;dr: look at what we do for 'check_set' in
'patchwork/api/patch.py'.

> diff --git a/patchwork/api/cover.py b/patchwork/api/cover.py
> index b497fd8..b17b9f7 100644
> --- a/patchwork/api/cover.py
> +++ b/patchwork/api/cover.py
> @@ -30,6 +30,7 @@ from patchwork.api.embedded import PersonSerializer
>  from patchwork.api.embedded import ProjectSerializer
>  from patchwork.api.embedded import SeriesSerializer
>  from patchwork.models import CoverLetter
> +from patchwork.models import SubmissionTag
>  
>  
>  class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> @@ -40,6 +41,7 @@ class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
>      mbox = SerializerMethodField()
>      series = SeriesSerializer(many=True, read_only=True)
>      comments = SerializerMethodField()
> +    tags = SerializerMethodField()
>  
>      def get_web_url(self, instance):
>          request = self.context.get('request')
> @@ -53,13 +55,26 @@ class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
>          return self.context.get('request').build_absolute_uri(
>              reverse('api-cover-comment-list', kwargs={'pk': cover.id}))
>  
> +    def get_tags(self, instance):
> +        if not instance.project.use_tags:
> +            return {}
> +
> +        tags = SubmissionTag.objects.filter(
> +            submission=instance).values_list('tag__name', 'value').distinct()
> +
> +        result = {}
> +        for name, value in tags:
> +            result.setdefault(name, []).append(value)
> +
> +        return result
> +

As above, this should be a property on the model, I imagine.

>      class Meta:
>          model = CoverLetter
>          fields = ('id', 'url', 'web_url', 'project', 'msgid', 'date', 'name',
> -                  'submitter', 'mbox', 'series', 'comments')
> +                  'submitter', 'mbox', 'series', 'comments', 'tags')
>          read_only_fields = fields
>          versioned_fields = {
> -            '1.1': ('web_url', 'mbox', 'comments'),
> +            '1.1': ('web_url', 'mbox', 'comments', 'tags'),

This should be '1.2' now.

>          }
>          extra_kwargs = {
>              'url': {'view_name': 'api-cover-detail'},

Same comments RE: 'get_queryset'.

> diff --git a/patchwork/api/filters.py b/patchwork/api/filters.py
> index 73353d9..e213181 100644
> --- a/patchwork/api/filters.py
> +++ b/patchwork/api/filters.py
> @@ -21,9 +21,11 @@ from django.contrib.auth.models import User
>  from django.core.exceptions import ValidationError
>  from django.db.models import Q
>  from django_filters.rest_framework import FilterSet
> +from django_filters import Filter
>  from django_filters import IsoDateTimeFilter
>  from django_filters import ModelMultipleChoiceFilter
>  from django.forms import ModelMultipleChoiceField as BaseMultipleChoiceField
> +from django.forms.widgets import HiddenInput
>  from django.forms.widgets import MultipleHiddenInput
>  
>  from patchwork.models import Bundle
> @@ -35,6 +37,7 @@ from patchwork.models import Person
>  from patchwork.models import Project
>  from patchwork.models import Series
>  from patchwork.models import State
> +from patchwork.models import SubmissionTag
>  
>  
>  # custom fields, filters
> @@ -136,6 +139,60 @@ class StateFilter(ModelMultipleChoiceFilter):
>      field_class = StateChoiceField
>  
>  
> +class TagNameFilter(Filter):
> +
> +    def filter(self, qs, tag_name):
> +        submissions_series = SubmissionTag.objects.filter(
> +            tag__name__iexact=tag_name
> +        ).values_list('submission__id', 'series')
> +
> +        submission_list = []
> +        series_list = []
> +        for submission, series in submissions_series:
> +            submission_list.append(submission)
> +            series_list.append(series)
> +
> +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +
> +class TagValueFilter(Filter):
> +
> +    def filter(self, qs, tag_value):
> +        submissions_series = SubmissionTag.objects.filter(
> +            value__icontains=tag_value
> +        ).values_list('submission__id', 'series')
> +
> +        submission_list = []
> +        series_list = []
> +        for submission, series in submissions_series:
> +            submission_list.append(submission)
> +            series_list.append(series)
> +
> +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +
> +class TagFilter(Filter):
> +
> +    def filter(self, qs, query):
> +        try:
> +            tag_name, tag_value = query.split(',', 1)
> +        except ValueError:
> +            raise ValidationError('Query in format `tag=name,value` expected!')
> +
> +        submissions_series = SubmissionTag.objects.filter(
> +            tag__name__iexact=tag_name,
> +            value__icontains=tag_value
> +        ).values_list('submission__id', 'series')
> +
> +        submission_list = []
> +        series_list = []
> +        for submission, series in submissions_series:
> +            submission_list.append(submission)
> +            series_list.append(series)
> +
> +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +

The code for this _looks_ fine, but I do wonder if it's necessary?
Would it be possible to support basic wildcarding in the filter
instead? Something like this:

  ?tag=*: stephen@that.guru

(but with HTTP encoding). We could also make this wildcarded by
default, e.g.

  ?tag=Reviewed-by:

Would return all results for this tag.

>  class UserChoiceField(ModelMultipleChoiceField):
>  
>      alternate_lookup = 'username__iexact'
> @@ -173,10 +230,14 @@ class CoverLetterFilterSet(TimestampMixin, FilterSet):
>      series = BaseFilter(queryset=Project.objects.all(),
>                          widget=MultipleHiddenInput)
>      submitter = PersonFilter(queryset=Person.objects.all())
> +    tagname = TagNameFilter(label='Tag name')
> +    tagvalue = TagValueFilter(widget=HiddenInput)
> +    tag = TagFilter(widget=HiddenInput)
>  
>      class Meta:
>          model = CoverLetter
> -        fields = ('project', 'series', 'submitter')
> +        fields = ('project', 'series', 'submitter', 'tagname', 'tagvalue',
> +                  'tag')
>  
>  
>  class PatchFilterSet(TimestampMixin, FilterSet):
> @@ -189,11 +250,14 @@ class PatchFilterSet(TimestampMixin, FilterSet):
>      submitter = PersonFilter(queryset=Person.objects.all())
>      delegate = UserFilter(queryset=User.objects.all())
>      state = StateFilter(queryset=State.objects.all())
> +    tagname = TagNameFilter(label='Tag name')
> +    tagvalue = TagValueFilter(widget=HiddenInput)
> +    tag = TagFilter(widget=HiddenInput)
>  
>      class Meta:
>          model = Patch
>          fields = ('project', 'series', 'submitter', 'delegate',
> -                  'state', 'archived')
> +                  'state', 'archived', 'tagname', 'tagvalue', 'tag')
>  
>  
>  class CheckFilterSet(TimestampMixin, FilterSet):
> diff --git a/patchwork/api/patch.py b/patchwork/api/patch.py
> index 9d890eb..cf97c40 100644
> --- a/patchwork/api/patch.py
> +++ b/patchwork/api/patch.py
> @@ -19,6 +19,7 @@
>  
>  import email.parser
>  
> +from django.db.models import Q
>  from django.utils.translation import ugettext_lazy as _
>  from rest_framework.generics import ListAPIView
>  from rest_framework.generics import RetrieveUpdateAPIView
> @@ -35,6 +36,7 @@ from patchwork.api.embedded import SeriesSerializer
>  from patchwork.api.embedded import UserSerializer
>  from patchwork.models import Patch
>  from patchwork.models import State
> +from patchwork.models import SubmissionTag
>  from patchwork.parser import clean_subject
>  
>  
> @@ -109,9 +111,18 @@ class PatchListSerializer(BaseHyperlinkedModelSerializer):
>              reverse('api-check-list', kwargs={'patch_id': instance.id}))
>  
>      def get_tags(self, instance):
> -        # TODO(stephenfin): Make tags performant, possibly by reworking the
> -        # model
> -        return {}
> +        if not instance.project.use_tags:
> +            return {}
> +
> +        tags = SubmissionTag.objects.filter(
> +            Q(submission=instance) | Q(series__in=instance.series.all())
> +        ).values_list('tag__name', 'value').distinct()
> +
> +        result = {}
> +        for name, value in tags:
> +            result.setdefault(name, []).append(value)
> +
> +        return result
>  
>      class Meta:
>          model = Patch
> @@ -160,6 +171,7 @@ class PatchDetailSerializer(PatchListSerializer):
>              'headers', 'content', 'diff', 'prefixes')
>          versioned_fields = PatchListSerializer.Meta.versioned_fields
>          extra_kwargs = PatchListSerializer.Meta.extra_kwargs
> +        versioned_fields = PatchListSerializer.Meta.versioned_fields
>  
>  
>  class PatchList(ListAPIView):

This is missing the entry in the versioned_fields setting. Also, same
comment around 'get_tags'.

> diff --git a/patchwork/models.py b/patchwork/models.py
> index 6268f5b..ba53278 100644
> --- a/patchwork/models.py
> +++ b/patchwork/models.py
> @@ -20,8 +20,6 @@
>  
>  from __future__ import absolute_import
>  
> -from collections import Counter
> -from collections import OrderedDict
>  import datetime
>  import random
>  import re
> @@ -31,6 +29,7 @@ from django.conf import settings
>  from django.contrib.auth.models import User
>  from django.core.exceptions import ValidationError
>  from django.db import models
> +from django.db.models import Q
>  from django.utils.encoding import python_2_unicode_compatible
>  from django.utils.functional import cached_property
>  
> @@ -252,10 +251,6 @@ class Tag(models.Model):
>                                        ' tag\'s count in the patch list view',
>                                        default=True)
>  
> -    @property
> -    def attr_name(self):
> -        return 'tag_%d_count' % self.id
> -
>      def __str__(self):
>          return self.name
>  
> @@ -263,64 +258,21 @@ class Tag(models.Model):
>          ordering = ['abbrev']
>  
>  
> -class PatchTag(models.Model):
> -    patch = models.ForeignKey('Patch', on_delete=models.CASCADE)
> +class SubmissionTag(models.Model):
> +    submission = models.ForeignKey('Submission', on_delete=models.CASCADE)
>      tag = models.ForeignKey('Tag', on_delete=models.CASCADE)
> -    count = models.IntegerField(default=1)
> +    value = models.CharField(max_length=255)
> +    comment = models.ForeignKey('Comment', null=True, on_delete=models.CASCADE)
> +    series = models.ForeignKey('Series', null=True, on_delete=models.CASCADE)
>  
>      class Meta:
> -        unique_together = [('patch', 'tag')]
> +        unique_together = [('submission', 'tag', 'value', 'comment')]
>  
>  
>  def get_default_initial_patch_state():
>      return State.objects.get(ordering=0)
>  
>  
> -class PatchQuerySet(models.query.QuerySet):
> -
> -    def with_tag_counts(self, project=None):
> -        if project and not project.use_tags:
> -            return self
> -
> -        # We need the project's use_tags field loaded for Project.tags().
> -        # Using prefetch_related means we'll share the one instance of
> -        # Project, and share the project.tags cache between all patch.project
> -        # references.
> -        qs = self.prefetch_related('project')
> -        select = OrderedDict()
> -        select_params = []
> -
> -        # All projects have the same tags, so we're good to go here
> -        if project:
> -            tags = project.tags
> -        else:
> -            tags = Tag.objects.all()
> -
> -        for tag in tags:
> -            select[tag.attr_name] = (
> -                "coalesce("
> -                "(SELECT count FROM patchwork_patchtag"
> -                " WHERE patchwork_patchtag.patch_id="
> -                "patchwork_patch.submission_ptr_id"
> -                " AND patchwork_patchtag.tag_id=%s), 0)")
> -            select_params.append(tag.id)
> -
> -        return qs.extra(select=select, select_params=select_params)
> -
> -
> -class PatchManager(models.Manager):
> -    use_for_related_fields = True
> -    # NOTE(stephenfin): This is necessary to silence a warning with Django >=
> -    # 1.10. Remove when 1.10 is the minimum supported version.
> -    silence_use_for_related_fields_deprecation = True
> -
> -    def get_queryset(self):
> -        return PatchQuerySet(self.model, using=self.db)
> -
> -    def with_tag_counts(self, project):
> -        return self.get_queryset().with_tag_counts(project)
> -
> -
>  class EmailMixin(models.Model):
>      """Mixin for models with an email-origin."""
>      # email metadata
> @@ -334,17 +286,16 @@ class EmailMixin(models.Model):
>      submitter = models.ForeignKey(Person, on_delete=models.CASCADE)
>      content = models.TextField(null=True, blank=True)
>  
> -    response_re = re.compile(
> -        r'^(Tested|Reviewed|Acked|Signed-off|Nacked|Reported)-by:.*$',
> -        re.M | re.I)
> +    def _extract_tags(self, tags):
> +        found_tags = {}
>  
> -    @property
> -    def patch_responses(self):
>          if not self.content:
> -            return ''
> +            return found_tags
>  
> -        return ''.join([match.group(0) + '\n' for match in
> -                        self.response_re.finditer(self.content)])
> +        for tag in tags:
> +            regex = re.compile(tag.pattern + r'\s*(.*)', re.M | re.I | re.U)
> +            found_tags[tag] = regex.findall(self.content)
> +        return found_tags

I wonder if we can move this to 'parser.py' now? It doesn't sound like
something we'll be doing at runtime so we could move it out of here
now.

>  
>      def save(self, *args, **kwargs):
>          # Modifying a submission via admin interface changes '\n' newlines in
> @@ -377,6 +328,40 @@ class Submission(FilenameMixin, EmailMixin, models.Model):
>      # submission metadata
>  
>      name = models.CharField(max_length=255)
> +    related_tags = models.ManyToManyField(Tag, through=SubmissionTag)

Just 'tags'?

> +
> +    def add_tags(self, tag, values, comment=None):
> +        if hasattr(self, 'patch'):
> +            series = None
> +        else:
> +            series = self.coverletter.series.first()
> +
> +        current_objs = SubmissionTag.objects.filter(submission=self,
> +                                                    comment=comment,
> +                                                    tag=tag,
> +                                                    series=series)
> +
> +        if not values:
> +            current_objs.delete()
> +            return
> +
> +        # In case the origin is modified, delete tags that were removed
> +        current_objs.exclude(value__in=values).delete()
> +
> +        values_to_add = set(values) - set(current_objs.values_list('value',
> +                                                                   flat=True))
> +        SubmissionTag.objects.bulk_create([SubmissionTag(
> +            submission=self,
> +            tag=tag,
> +            value=value,
> +            comment=comment,
> +            series=series
> +        ) for value in values_to_add])
> +
> +    def refresh_tags(self):
> +        submission_tags = self._extract_tags(Tag.objects.all())
> +        for tag in submission_tags:
> +            self.add_tags(tag, submission_tags[tag])

Similar comments around moving these functions out of here and into
'parser.py'. I'd _really_ like to have all this stuff in 'parser.py' or
'retag.py'.

>  
>      # patchwork metadata
>  
> @@ -426,7 +411,6 @@ class Patch(SeriesMixin, Submission):
>      diff = models.TextField(null=True, blank=True)
>      commit_ref = models.CharField(max_length=255, null=True, blank=True)
>      pull_url = models.CharField(max_length=255, null=True, blank=True)
> -    tags = models.ManyToManyField(Tag, through=PatchTag)
>  
>      # patchwork metadata
>  
> @@ -440,40 +424,6 @@ class Patch(SeriesMixin, Submission):
>      # patches in a project without needing to do a JOIN.
>      patch_project = models.ForeignKey(Project, on_delete=models.CASCADE)
>  
> -    objects = PatchManager()
> -
> -    @staticmethod
> -    def extract_tags(content, tags):
> -        counts = Counter()
> -
> -        for tag in tags:
> -            regex = re.compile(tag.pattern, re.MULTILINE | re.IGNORECASE)
> -            counts[tag] = len(regex.findall(content))
> -
> -        return counts
> -
> -    def _set_tag(self, tag, count):
> -        if count == 0:
> -            self.patchtag_set.filter(tag=tag).delete()
> -            return
> -        patchtag, _ = PatchTag.objects.get_or_create(patch=self, tag=tag)
> -        if patchtag.count != count:
> -            patchtag.count = count
> -            patchtag.save()
> -
> -    def refresh_tag_counts(self):
> -        tags = self.project.tags
> -        counter = Counter()
> -
> -        if self.content:
> -            counter += self.extract_tags(self.content, tags)
> -
> -        for comment in self.comments.all():
> -            counter = counter + self.extract_tags(comment.content, tags)
> -
> -        for tag in tags:
> -            self._set_tag(tag, counter[tag])
> -
>      def save(self, *args, **kwargs):
>          if not hasattr(self, 'state') or not self.state:
>              self.state = get_default_initial_patch_state()
> @@ -482,8 +432,7 @@ class Patch(SeriesMixin, Submission):
>              self.hash = hash_diff(self.diff)
>  
>          super(Patch, self).save(**kwargs)
> -
> -        self.refresh_tag_counts()
> +        self.refresh_tags()
>  
>      def is_editable(self, user):
>          if not is_authenticated(user):
> @@ -495,6 +444,18 @@ class Patch(SeriesMixin, Submission):
>          return self.project.is_editable(user)
>  
>      @property
> +    def all_tags(self):
> +        related_tags = SubmissionTag.objects.filter(
> +            Q(submission=self) | Q(series__in=self.series.all())
> +        ).values_list('tag__name', 'value').distinct()
> +
> +        patch_tags = {}
> +        for tag in self.project.tags:
> +            patch_tags[tag] = [related_tag[1] for related_tag in related_tags
> +                               if related_tag[0] == tag.name]

Hmm, I don't think you want to be doing a list comprehension here as
Python is way slower than the DB. You could do:

  related_tags = related_tags.filter(name__in=self.project.tags)

...or similar.

> +        return patch_tags

Yeah, we should probably be using this in the API. Not sure about
performance though. This could need some work.

> +
> +    @property
>      def combined_check_state(self):
>          """Return the combined state for all checks.
>  
> @@ -611,13 +572,12 @@ class Comment(EmailMixin, models.Model):
>  
>      def save(self, *args, **kwargs):
>          super(Comment, self).save(*args, **kwargs)
> -        if hasattr(self.submission, 'patch'):
> -            self.submission.patch.refresh_tag_counts()
> +        self.refresh_tags()

I'd _personally_ rather we did this in 'parser.py'. There's no reason
to calculate this stuff everytime we change anything about the patch
(e.g. updating the state).

> -    def delete(self, *args, **kwargs):
> -        super(Comment, self).delete(*args, **kwargs)
> -        if hasattr(self.submission, 'patch'):
> -            self.submission.patch.refresh_tag_counts()
> +    def refresh_tags(self):
> +        comment_tags = self._extract_tags(Tag.objects.all())
> +        for tag in comment_tags:
> +            self.submission.add_tags(tag, comment_tags[tag], comment=self)
>  
>      def is_editable(self, user):
>          return False
> @@ -716,6 +676,7 @@ class Series(FilenameMixin, models.Model):
>                  self.name = self._format_name(cover)
>  
>          self.save()
> +        cover.refresh_tags()
>  
>      def add_patch(self, patch, number):
>          """Add a patch to the series."""
> diff --git a/patchwork/templatetags/patch.py b/patchwork/templatetags/patch.py
> index 4350e09..f226dc0 100644
> --- a/patchwork/templatetags/patch.py
> +++ b/patchwork/templatetags/patch.py
> @@ -34,8 +34,9 @@ register = template.Library()
>  def patch_tags(patch):
>      counts = []
>      titles = []
> +    all_tags = patch.all_tags
>      for tag in [t for t in patch.project.tags if t.show_column]:
> -        count = getattr(patch, tag.attr_name)
> +        count = len(all_tags[tag])
>          titles.append('%d %s' % (count, tag.name))
>          if count == 0:
>              counts.append("-")
> diff --git a/patchwork/views/__init__.py b/patchwork/views/__init__.py
> index f8d23a3..8c41df6 100644
> --- a/patchwork/views/__init__.py
> +++ b/patchwork/views/__init__.py
> @@ -272,9 +272,6 @@ def generic_list(request, project, view, view_args=None, filter_settings=None,
>      if patches is None:
>          patches = Patch.objects.filter(patch_project=project)
>  
> -    # annotate with tag counts
> -    patches = patches.with_tag_counts(project)
> -
>      patches = context['filters'].apply(patches)
>      if not editable_order:
>          patches = order.apply(patches)
> diff --git a/patchwork/views/utils.py b/patchwork/views/utils.py
> index 2357ab8..f1f78c8 100644
> --- a/patchwork/views/utils.py
> +++ b/patchwork/views/utils.py
> @@ -27,11 +27,13 @@ import email.utils
>  import re
>  
>  from django.conf import settings
> +from django.db.models import Q
>  from django.http import Http404
>  from django.utils import six
>  
>  from patchwork.models import Comment
>  from patchwork.models import Patch
> +from patchwork.models import SubmissionTag
>  from patchwork.models import Series
>  
>  if settings.ENABLE_REST_API:
> @@ -75,9 +77,11 @@ def _submission_to_mbox(submission):
>      else:
>          postscript = ''
>  
> -    # TODO(stephenfin): Make this use the tags infrastructure
> -    for comment in Comment.objects.filter(submission=submission):
> -        body += comment.patch_responses
> +    for (tagname, value) in SubmissionTag.objects.filter(
> +            Q(submission=submission)
> +            | Q(series__in=submission.series.all())).values_list(
> +                'tag__name', 'value').distinct():
> +        body += '%s: %s\n' % (tagname, value)
>  
>      if postscript:
>          body += '---\n' + postscript + '\n'
Veronika Kabatova Sept. 12, 2018, 8:57 a.m. UTC | #2
----- Original Message -----
> From: "Stephen Finucane" <stephen@that.guru>
> To: vkabatov@redhat.com, patchwork@lists.ozlabs.org
> Sent: Tuesday, September 11, 2018 7:55:46 PM
> Subject: Re: [PATCH RESEND 1/2] Rework tagging infrastructure
> 
> On Mon, 2018-07-09 at 18:10 +0200, vkabatov@redhat.com wrote:
> > From: Veronika Kabatova <vkabatov@redhat.com>
> > 
> > Solve #113 and #57 GitHub issues, fix up returning tags in the API,
> > keep track of tag origin to be able to add tags to comments in the API.
> > 
> > Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
> > tags for each patch in series, and use `series` attribute of
> > SubmissionTag as a notion of a tag which is related to each patch in the
> > series (because it comes from cover letter or it's comments)
> > 
> > Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>
> 
> To start, apologies if I'm repeating myself on any of the comments
> below: it's been a long time since I first reviewed this (sorry!).
> Also, for the next revision of this, this can definitely be broken up
> on purpose :) Given that tags aren't currently shown in the API, I'd
> move the API changes into a separate patch. Similarly, docs (though not
> release notes) aren't needed and can be broken up and moved into a
> separate patch. Now to the review...
> 

Sounds good!

/me goes on to comment on everything

> > ---
> >  patchwork/api/comment.py        |  18 ++++-
> >  patchwork/api/cover.py          |  19 ++++-
> >  patchwork/api/filters.py        |  68 +++++++++++++++-
> >  patchwork/api/patch.py          |  18 ++++-
> >  patchwork/models.py             | 173
> >  ++++++++++++++++------------------------
> >  patchwork/templatetags/patch.py |   3 +-
> >  patchwork/views/__init__.py     |   3 -
> >  patchwork/views/utils.py        |  10 ++-
> >  8 files changed, 191 insertions(+), 121 deletions(-)
> > 
> > diff --git a/patchwork/api/comment.py b/patchwork/api/comment.py
> > index 5a5adb1..e03207e 100644
> > --- a/patchwork/api/comment.py
> > +++ b/patchwork/api/comment.py
> > @@ -26,6 +26,7 @@ from patchwork.api.base import
> > BaseHyperlinkedModelSerializer
> >  from patchwork.api.base import PatchworkPermission
> >  from patchwork.api.embedded import PersonSerializer
> >  from patchwork.models import Comment
> > +from patchwork.models import SubmissionTag
> >  
> >  
> >  class CommentListSerializer(BaseHyperlinkedModelSerializer):
> > @@ -34,6 +35,7 @@ class
> > CommentListSerializer(BaseHyperlinkedModelSerializer):
> >      subject = SerializerMethodField()
> >      headers = SerializerMethodField()
> >      submitter = PersonSerializer(read_only=True)
> > +    tags = SerializerMethodField()
> >  
> >      def get_web_url(self, instance):
> >          request = self.context.get('request')
> > @@ -43,6 +45,20 @@ class
> > CommentListSerializer(BaseHyperlinkedModelSerializer):
> >          return email.parser.Parser().parsestr(comment.headers,
> >                                                True).get('Subject', '')
> >  
> > +    def get_tags(self, instance):
> > +        if not instance.submission.project.use_tags:
> > +            return {}
> > +
> > +        tags = SubmissionTag.objects.filter(
> > +            comment=instance
> > +        ).values_list('tag__name', 'value')
> > +
> > +        result = {}
> > +        for name, value in tags:
> > +            result.setdefault(name, []).append(value)
> > +
> > +        return result
> 
> This smells of something that should be placed in models.py. Is there
> any reason we _can't_ do it there, e.g. via a property?
> 

I guess we could (it's been a while since I thought about this code too,
but right now I can't think of any reason why not).

> > +
> >      def get_headers(self, comment):
> >          headers = {}
> >  
> > @@ -60,7 +76,7 @@ class
> > CommentListSerializer(BaseHyperlinkedModelSerializer):
> >      class Meta:
> >          model = Comment
> >          fields = ('id', 'web_url', 'msgid', 'date', 'subject',
> >          'submitter',
> > -                  'content', 'headers')
> > +                  'content', 'headers', 'tags')
> >          read_only_fields = fields
> >          versioned_fields = {
> >              '1.1': ('web_url', ),
> 
> You definitely need changes to 'get_queryset' here. I can help you work
> on these but tl;dr: look at what we do for 'check_set' in
> 'patchwork/api/patch.py'.

Did I mention I'm not a DB master? Any help with performance is
appreciated :)

> 
> > diff --git a/patchwork/api/cover.py b/patchwork/api/cover.py
> > index b497fd8..b17b9f7 100644
> > --- a/patchwork/api/cover.py
> > +++ b/patchwork/api/cover.py
> > @@ -30,6 +30,7 @@ from patchwork.api.embedded import PersonSerializer
> >  from patchwork.api.embedded import ProjectSerializer
> >  from patchwork.api.embedded import SeriesSerializer
> >  from patchwork.models import CoverLetter
> > +from patchwork.models import SubmissionTag
> >  
> >  
> >  class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> > @@ -40,6 +41,7 @@ class
> > CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> >      mbox = SerializerMethodField()
> >      series = SeriesSerializer(many=True, read_only=True)
> >      comments = SerializerMethodField()
> > +    tags = SerializerMethodField()
> >  
> >      def get_web_url(self, instance):
> >          request = self.context.get('request')
> > @@ -53,13 +55,26 @@ class
> > CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> >          return self.context.get('request').build_absolute_uri(
> >              reverse('api-cover-comment-list', kwargs={'pk': cover.id}))
> >  
> > +    def get_tags(self, instance):
> > +        if not instance.project.use_tags:
> > +            return {}
> > +
> > +        tags = SubmissionTag.objects.filter(
> > +            submission=instance).values_list('tag__name',
> > 'value').distinct()
> > +
> > +        result = {}
> > +        for name, value in tags:
> > +            result.setdefault(name, []).append(value)
> > +
> > +        return result
> > +
> 
> As above, this should be a property on the model, I imagine.
> 
> >      class Meta:
> >          model = CoverLetter
> >          fields = ('id', 'url', 'web_url', 'project', 'msgid', 'date',
> >          'name',
> > -                  'submitter', 'mbox', 'series', 'comments')
> > +                  'submitter', 'mbox', 'series', 'comments', 'tags')
> >          read_only_fields = fields
> >          versioned_fields = {
> > -            '1.1': ('web_url', 'mbox', 'comments'),
> > +            '1.1': ('web_url', 'mbox', 'comments', 'tags'),
> 
> This should be '1.2' now.
> 

Noted. It was still 1.1 when this patch was originally sent, I think.

> >          }
> >          extra_kwargs = {
> >              'url': {'view_name': 'api-cover-detail'},
> 
> Same comments RE: 'get_queryset'.
> 
> > diff --git a/patchwork/api/filters.py b/patchwork/api/filters.py
> > index 73353d9..e213181 100644
> > --- a/patchwork/api/filters.py
> > +++ b/patchwork/api/filters.py
> > @@ -21,9 +21,11 @@ from django.contrib.auth.models import User
> >  from django.core.exceptions import ValidationError
> >  from django.db.models import Q
> >  from django_filters.rest_framework import FilterSet
> > +from django_filters import Filter
> >  from django_filters import IsoDateTimeFilter
> >  from django_filters import ModelMultipleChoiceFilter
> >  from django.forms import ModelMultipleChoiceField as
> >  BaseMultipleChoiceField
> > +from django.forms.widgets import HiddenInput
> >  from django.forms.widgets import MultipleHiddenInput
> >  
> >  from patchwork.models import Bundle
> > @@ -35,6 +37,7 @@ from patchwork.models import Person
> >  from patchwork.models import Project
> >  from patchwork.models import Series
> >  from patchwork.models import State
> > +from patchwork.models import SubmissionTag
> >  
> >  
> >  # custom fields, filters
> > @@ -136,6 +139,60 @@ class StateFilter(ModelMultipleChoiceFilter):
> >      field_class = StateChoiceField
> >  
> >  
> > +class TagNameFilter(Filter):
> > +
> > +    def filter(self, qs, tag_name):
> > +        submissions_series = SubmissionTag.objects.filter(
> > +            tag__name__iexact=tag_name
> > +        ).values_list('submission__id', 'series')
> > +
> > +        submission_list = []
> > +        series_list = []
> > +        for submission, series in submissions_series:
> > +            submission_list.append(submission)
> > +            series_list.append(series)
> > +
> > +        return qs.filter(Q(id__in=submission_list) |
> > Q(series__in=series_list))
> > +
> > +
> > +class TagValueFilter(Filter):
> > +
> > +    def filter(self, qs, tag_value):
> > +        submissions_series = SubmissionTag.objects.filter(
> > +            value__icontains=tag_value
> > +        ).values_list('submission__id', 'series')
> > +
> > +        submission_list = []
> > +        series_list = []
> > +        for submission, series in submissions_series:
> > +            submission_list.append(submission)
> > +            series_list.append(series)
> > +
> > +        return qs.filter(Q(id__in=submission_list) |
> > Q(series__in=series_list))
> > +
> > +
> > +class TagFilter(Filter):
> > +
> > +    def filter(self, qs, query):
> > +        try:
> > +            tag_name, tag_value = query.split(',', 1)
> > +        except ValueError:
> > +            raise ValidationError('Query in format `tag=name,value`
> > expected!')
> > +
> > +        submissions_series = SubmissionTag.objects.filter(
> > +            tag__name__iexact=tag_name,
> > +            value__icontains=tag_value
> > +        ).values_list('submission__id', 'series')
> > +
> > +        submission_list = []
> > +        series_list = []
> > +        for submission, series in submissions_series:
> > +            submission_list.append(submission)
> > +            series_list.append(series)
> > +
> > +        return qs.filter(Q(id__in=submission_list) |
> > Q(series__in=series_list))
> > +
> > +
> 
> The code for this _looks_ fine, but I do wonder if it's necessary?
> Would it be possible to support basic wildcarding in the filter
> instead? Something like this:
> 
>   ?tag=*: stephen@that.guru
> 
> (but with HTTP encoding). We could also make this wildcarded by
> default, e.g.
> 
>   ?tag=Reviewed-by:
> 
> Would return all results for this tag.
> 

That's an interesting idea. I had some serious problems getting the
filtering to work. One of the reasons would have been the chaos with the M:N
patch-series relationship which would be cleaned up now (yay!). But even
then, using a single filter for both parts was giving me OR relation between
the tag and the value instead of AND.

I'm not sure now (it was a while ago) but I think Django using OR in the
method I was using was the culprit, and splitting the filters into 3
different ones was the only way that occurred to me to work around it that
had the desired functionality.

In any case, I agree that your idea makes sense. I'll take a look at it, but
if you have any pointers on how to make it work because of the above, I'd
really appreciate it.

> >  class UserChoiceField(ModelMultipleChoiceField):
> >  
> >      alternate_lookup = 'username__iexact'
> > @@ -173,10 +230,14 @@ class CoverLetterFilterSet(TimestampMixin,
> > FilterSet):
> >      series = BaseFilter(queryset=Project.objects.all(),
> >                          widget=MultipleHiddenInput)
> >      submitter = PersonFilter(queryset=Person.objects.all())
> > +    tagname = TagNameFilter(label='Tag name')
> > +    tagvalue = TagValueFilter(widget=HiddenInput)
> > +    tag = TagFilter(widget=HiddenInput)
> >  
> >      class Meta:
> >          model = CoverLetter
> > -        fields = ('project', 'series', 'submitter')
> > +        fields = ('project', 'series', 'submitter', 'tagname', 'tagvalue',
> > +                  'tag')
> >  
> >  
> >  class PatchFilterSet(TimestampMixin, FilterSet):
> > @@ -189,11 +250,14 @@ class PatchFilterSet(TimestampMixin, FilterSet):
> >      submitter = PersonFilter(queryset=Person.objects.all())
> >      delegate = UserFilter(queryset=User.objects.all())
> >      state = StateFilter(queryset=State.objects.all())
> > +    tagname = TagNameFilter(label='Tag name')
> > +    tagvalue = TagValueFilter(widget=HiddenInput)
> > +    tag = TagFilter(widget=HiddenInput)
> >  
> >      class Meta:
> >          model = Patch
> >          fields = ('project', 'series', 'submitter', 'delegate',
> > -                  'state', 'archived')
> > +                  'state', 'archived', 'tagname', 'tagvalue', 'tag')
> >  
> >  
> >  class CheckFilterSet(TimestampMixin, FilterSet):
> > diff --git a/patchwork/api/patch.py b/patchwork/api/patch.py
> > index 9d890eb..cf97c40 100644
> > --- a/patchwork/api/patch.py
> > +++ b/patchwork/api/patch.py
> > @@ -19,6 +19,7 @@
> >  
> >  import email.parser
> >  
> > +from django.db.models import Q
> >  from django.utils.translation import ugettext_lazy as _
> >  from rest_framework.generics import ListAPIView
> >  from rest_framework.generics import RetrieveUpdateAPIView
> > @@ -35,6 +36,7 @@ from patchwork.api.embedded import SeriesSerializer
> >  from patchwork.api.embedded import UserSerializer
> >  from patchwork.models import Patch
> >  from patchwork.models import State
> > +from patchwork.models import SubmissionTag
> >  from patchwork.parser import clean_subject
> >  
> >  
> > @@ -109,9 +111,18 @@ class
> > PatchListSerializer(BaseHyperlinkedModelSerializer):
> >              reverse('api-check-list', kwargs={'patch_id': instance.id}))
> >  
> >      def get_tags(self, instance):
> > -        # TODO(stephenfin): Make tags performant, possibly by reworking
> > the
> > -        # model
> > -        return {}
> > +        if not instance.project.use_tags:
> > +            return {}
> > +
> > +        tags = SubmissionTag.objects.filter(
> > +            Q(submission=instance) | Q(series__in=instance.series.all())
> > +        ).values_list('tag__name', 'value').distinct()
> > +
> > +        result = {}
> > +        for name, value in tags:
> > +            result.setdefault(name, []).append(value)
> > +
> > +        return result
> >  
> >      class Meta:
> >          model = Patch
> > @@ -160,6 +171,7 @@ class PatchDetailSerializer(PatchListSerializer):
> >              'headers', 'content', 'diff', 'prefixes')
> >          versioned_fields = PatchListSerializer.Meta.versioned_fields
> >          extra_kwargs = PatchListSerializer.Meta.extra_kwargs
> > +        versioned_fields = PatchListSerializer.Meta.versioned_fields
> >  
> >  
> >  class PatchList(ListAPIView):
> 
> This is missing the entry in the versioned_fields setting. Also, same
> comment around 'get_tags'.
> 

Good catch! I originally intended for the tags to be visible only in the
detailed view and missed to add it.

> > diff --git a/patchwork/models.py b/patchwork/models.py
> > index 6268f5b..ba53278 100644
> > --- a/patchwork/models.py
> > +++ b/patchwork/models.py
> > @@ -20,8 +20,6 @@
> >  
> >  from __future__ import absolute_import
> >  
> > -from collections import Counter
> > -from collections import OrderedDict
> >  import datetime
> >  import random
> >  import re
> > @@ -31,6 +29,7 @@ from django.conf import settings
> >  from django.contrib.auth.models import User
> >  from django.core.exceptions import ValidationError
> >  from django.db import models
> > +from django.db.models import Q
> >  from django.utils.encoding import python_2_unicode_compatible
> >  from django.utils.functional import cached_property
> >  
> > @@ -252,10 +251,6 @@ class Tag(models.Model):
> >                                        ' tag\'s count in the patch list
> >                                        view',
> >                                        default=True)
> >  
> > -    @property
> > -    def attr_name(self):
> > -        return 'tag_%d_count' % self.id
> > -
> >      def __str__(self):
> >          return self.name
> >  
> > @@ -263,64 +258,21 @@ class Tag(models.Model):
> >          ordering = ['abbrev']
> >  
> >  
> > -class PatchTag(models.Model):
> > -    patch = models.ForeignKey('Patch', on_delete=models.CASCADE)
> > +class SubmissionTag(models.Model):
> > +    submission = models.ForeignKey('Submission', on_delete=models.CASCADE)
> >      tag = models.ForeignKey('Tag', on_delete=models.CASCADE)
> > -    count = models.IntegerField(default=1)
> > +    value = models.CharField(max_length=255)
> > +    comment = models.ForeignKey('Comment', null=True,
> > on_delete=models.CASCADE)
> > +    series = models.ForeignKey('Series', null=True,
> > on_delete=models.CASCADE)
> >  
> >      class Meta:
> > -        unique_together = [('patch', 'tag')]
> > +        unique_together = [('submission', 'tag', 'value', 'comment')]
> >  
> >  
> >  def get_default_initial_patch_state():
> >      return State.objects.get(ordering=0)
> >  
> >  
> > -class PatchQuerySet(models.query.QuerySet):
> > -
> > -    def with_tag_counts(self, project=None):
> > -        if project and not project.use_tags:
> > -            return self
> > -
> > -        # We need the project's use_tags field loaded for Project.tags().
> > -        # Using prefetch_related means we'll share the one instance of
> > -        # Project, and share the project.tags cache between all
> > patch.project
> > -        # references.
> > -        qs = self.prefetch_related('project')
> > -        select = OrderedDict()
> > -        select_params = []
> > -
> > -        # All projects have the same tags, so we're good to go here
> > -        if project:
> > -            tags = project.tags
> > -        else:
> > -            tags = Tag.objects.all()
> > -
> > -        for tag in tags:
> > -            select[tag.attr_name] = (
> > -                "coalesce("
> > -                "(SELECT count FROM patchwork_patchtag"
> > -                " WHERE patchwork_patchtag.patch_id="
> > -                "patchwork_patch.submission_ptr_id"
> > -                " AND patchwork_patchtag.tag_id=%s), 0)")
> > -            select_params.append(tag.id)
> > -
> > -        return qs.extra(select=select, select_params=select_params)
> > -
> > -
> > -class PatchManager(models.Manager):
> > -    use_for_related_fields = True
> > -    # NOTE(stephenfin): This is necessary to silence a warning with Django
> > >=
> > -    # 1.10. Remove when 1.10 is the minimum supported version.
> > -    silence_use_for_related_fields_deprecation = True
> > -
> > -    def get_queryset(self):
> > -        return PatchQuerySet(self.model, using=self.db)
> > -
> > -    def with_tag_counts(self, project):
> > -        return self.get_queryset().with_tag_counts(project)
> > -
> > -
> >  class EmailMixin(models.Model):
> >      """Mixin for models with an email-origin."""
> >      # email metadata
> > @@ -334,17 +286,16 @@ class EmailMixin(models.Model):
> >      submitter = models.ForeignKey(Person, on_delete=models.CASCADE)
> >      content = models.TextField(null=True, blank=True)
> >  
> > -    response_re = re.compile(
> > -        r'^(Tested|Reviewed|Acked|Signed-off|Nacked|Reported)-by:.*$',
> > -        re.M | re.I)
> > +    def _extract_tags(self, tags):
> > +        found_tags = {}
> >  
> > -    @property
> > -    def patch_responses(self):
> >          if not self.content:
> > -            return ''
> > +            return found_tags
> >  
> > -        return ''.join([match.group(0) + '\n' for match in
> > -                        self.response_re.finditer(self.content)])
> > +        for tag in tags:
> > +            regex = re.compile(tag.pattern + r'\s*(.*)', re.M | re.I |
> > re.U)
> > +            found_tags[tag] = regex.findall(self.content)
> > +        return found_tags
> 
> I wonder if we can move this to 'parser.py' now? It doesn't sound like
> something we'll be doing at runtime so we could move it out of here
> now.
> 

I vaguely remember discussing this with someone before, but don't know whom.
What if the submission is edited via the admin interface, and the tags,
being extracted only on the entry creation, will not match the real values in
the database anymore? While it's not the intended workflow, it is possible to
do it, and I think consistency between the data is important.

If there's an easy way to detect if the content of the submission was changed
and not trigger the recounts otherwise, I'm all for it. Unfortunately, the
only ways I found were to keep a hidden copy of the field or to fetch the
data from the DB before saving, and neither of them sounds cool. Or, at least
to trigger only on the changes coming from the admin interface, which would
reduce the trigger count a tons already (but still not move it to parser).

> >  
> >      def save(self, *args, **kwargs):
> >          # Modifying a submission via admin interface changes '\n' newlines
> >          in
> > @@ -377,6 +328,40 @@ class Submission(FilenameMixin, EmailMixin,
> > models.Model):
> >      # submission metadata
> >  
> >      name = models.CharField(max_length=255)
> > +    related_tags = models.ManyToManyField(Tag, through=SubmissionTag)
> 
> Just 'tags'?

Sure.

> 
> > +
> > +    def add_tags(self, tag, values, comment=None):
> > +        if hasattr(self, 'patch'):
> > +            series = None
> > +        else:
> > +            series = self.coverletter.series.first()
> > +
> > +        current_objs = SubmissionTag.objects.filter(submission=self,
> > +                                                    comment=comment,
> > +                                                    tag=tag,
> > +                                                    series=series)
> > +
> > +        if not values:
> > +            current_objs.delete()
> > +            return
> > +
> > +        # In case the origin is modified, delete tags that were removed
> > +        current_objs.exclude(value__in=values).delete()
> > +
> > +        values_to_add = set(values) -
> > set(current_objs.values_list('value',
> > +
> > flat=True))
> > +        SubmissionTag.objects.bulk_create([SubmissionTag(
> > +            submission=self,
> > +            tag=tag,
> > +            value=value,
> > +            comment=comment,
> > +            series=series
> > +        ) for value in values_to_add])
> > +
> > +    def refresh_tags(self):
> > +        submission_tags = self._extract_tags(Tag.objects.all())
> > +        for tag in submission_tags:
> > +            self.add_tags(tag, submission_tags[tag])
> 
> Similar comments around moving these functions out of here and into
> 'parser.py'. I'd _really_ like to have all this stuff in 'parser.py' or
> 'retag.py'.
> 

See the reasoning above.

> >  
> >      # patchwork metadata
> >  
> > @@ -426,7 +411,6 @@ class Patch(SeriesMixin, Submission):
> >      diff = models.TextField(null=True, blank=True)
> >      commit_ref = models.CharField(max_length=255, null=True, blank=True)
> >      pull_url = models.CharField(max_length=255, null=True, blank=True)
> > -    tags = models.ManyToManyField(Tag, through=PatchTag)
> >  
> >      # patchwork metadata
> >  
> > @@ -440,40 +424,6 @@ class Patch(SeriesMixin, Submission):
> >      # patches in a project without needing to do a JOIN.
> >      patch_project = models.ForeignKey(Project, on_delete=models.CASCADE)
> >  
> > -    objects = PatchManager()
> > -
> > -    @staticmethod
> > -    def extract_tags(content, tags):
> > -        counts = Counter()
> > -
> > -        for tag in tags:
> > -            regex = re.compile(tag.pattern, re.MULTILINE | re.IGNORECASE)
> > -            counts[tag] = len(regex.findall(content))
> > -
> > -        return counts
> > -
> > -    def _set_tag(self, tag, count):
> > -        if count == 0:
> > -            self.patchtag_set.filter(tag=tag).delete()
> > -            return
> > -        patchtag, _ = PatchTag.objects.get_or_create(patch=self, tag=tag)
> > -        if patchtag.count != count:
> > -            patchtag.count = count
> > -            patchtag.save()
> > -
> > -    def refresh_tag_counts(self):
> > -        tags = self.project.tags
> > -        counter = Counter()
> > -
> > -        if self.content:
> > -            counter += self.extract_tags(self.content, tags)
> > -
> > -        for comment in self.comments.all():
> > -            counter = counter + self.extract_tags(comment.content, tags)
> > -
> > -        for tag in tags:
> > -            self._set_tag(tag, counter[tag])
> > -
> >      def save(self, *args, **kwargs):
> >          if not hasattr(self, 'state') or not self.state:
> >              self.state = get_default_initial_patch_state()
> > @@ -482,8 +432,7 @@ class Patch(SeriesMixin, Submission):
> >              self.hash = hash_diff(self.diff)
> >  
> >          super(Patch, self).save(**kwargs)
> > -
> > -        self.refresh_tag_counts()
> > +        self.refresh_tags()
> >  
> >      def is_editable(self, user):
> >          if not is_authenticated(user):
> > @@ -495,6 +444,18 @@ class Patch(SeriesMixin, Submission):
> >          return self.project.is_editable(user)
> >  
> >      @property
> > +    def all_tags(self):
> > +        related_tags = SubmissionTag.objects.filter(
> > +            Q(submission=self) | Q(series__in=self.series.all())
> > +        ).values_list('tag__name', 'value').distinct()
> > +
> > +        patch_tags = {}
> > +        for tag in self.project.tags:
> > +            patch_tags[tag] = [related_tag[1] for related_tag in
> > related_tags
> > +                               if related_tag[0] == tag.name]
> 
> Hmm, I don't think you want to be doing a list comprehension here as
> Python is way slower than the DB. You could do:
> 
>   related_tags = related_tags.filter(name__in=self.project.tags)
> 
> ...or similar.
> 

Good idea!

> > +        return patch_tags
> 
> Yeah, we should probably be using this in the API. Not sure about
> performance though. This could need some work.
> 

As mentioned before, any comments about how to make the performance
better are welcome :)

I can make this property to submission class and implement a similar
one for the comments.

> > +
> > +    @property
> >      def combined_check_state(self):
> >          """Return the combined state for all checks.
> >  
> > @@ -611,13 +572,12 @@ class Comment(EmailMixin, models.Model):
> >  
> >      def save(self, *args, **kwargs):
> >          super(Comment, self).save(*args, **kwargs)
> > -        if hasattr(self.submission, 'patch'):
> > -            self.submission.patch.refresh_tag_counts()
> > +        self.refresh_tags()
> 
> I'd _personally_ rather we did this in 'parser.py'. There's no reason
> to calculate this stuff everytime we change anything about the patch
> (e.g. updating the state).
> 
> > -    def delete(self, *args, **kwargs):
> > -        super(Comment, self).delete(*args, **kwargs)
> > -        if hasattr(self.submission, 'patch'):
> > -            self.submission.patch.refresh_tag_counts()
> > +    def refresh_tags(self):
> > +        comment_tags = self._extract_tags(Tag.objects.all())
> > +        for tag in comment_tags:
> > +            self.submission.add_tags(tag, comment_tags[tag], comment=self)
> >  
> >      def is_editable(self, user):
> >          return False
> > @@ -716,6 +676,7 @@ class Series(FilenameMixin, models.Model):
> >                  self.name = self._format_name(cover)
> >  
> >          self.save()
> > +        cover.refresh_tags()
> >  
> >      def add_patch(self, patch, number):
> >          """Add a patch to the series."""
> > diff --git a/patchwork/templatetags/patch.py
> > b/patchwork/templatetags/patch.py
> > index 4350e09..f226dc0 100644
> > --- a/patchwork/templatetags/patch.py
> > +++ b/patchwork/templatetags/patch.py
> > @@ -34,8 +34,9 @@ register = template.Library()
> >  def patch_tags(patch):
> >      counts = []
> >      titles = []
> > +    all_tags = patch.all_tags
> >      for tag in [t for t in patch.project.tags if t.show_column]:
> > -        count = getattr(patch, tag.attr_name)
> > +        count = len(all_tags[tag])
> >          titles.append('%d %s' % (count, tag.name))
> >          if count == 0:
> >              counts.append("-")
> > diff --git a/patchwork/views/__init__.py b/patchwork/views/__init__.py
> > index f8d23a3..8c41df6 100644
> > --- a/patchwork/views/__init__.py
> > +++ b/patchwork/views/__init__.py
> > @@ -272,9 +272,6 @@ def generic_list(request, project, view,
> > view_args=None, filter_settings=None,
> >      if patches is None:
> >          patches = Patch.objects.filter(patch_project=project)
> >  
> > -    # annotate with tag counts
> > -    patches = patches.with_tag_counts(project)
> > -
> >      patches = context['filters'].apply(patches)
> >      if not editable_order:
> >          patches = order.apply(patches)
> > diff --git a/patchwork/views/utils.py b/patchwork/views/utils.py
> > index 2357ab8..f1f78c8 100644
> > --- a/patchwork/views/utils.py
> > +++ b/patchwork/views/utils.py
> > @@ -27,11 +27,13 @@ import email.utils
> >  import re
> >  
> >  from django.conf import settings
> > +from django.db.models import Q
> >  from django.http import Http404
> >  from django.utils import six
> >  
> >  from patchwork.models import Comment
> >  from patchwork.models import Patch
> > +from patchwork.models import SubmissionTag
> >  from patchwork.models import Series
> >  
> >  if settings.ENABLE_REST_API:
> > @@ -75,9 +77,11 @@ def _submission_to_mbox(submission):
> >      else:
> >          postscript = ''
> >  
> > -    # TODO(stephenfin): Make this use the tags infrastructure
> > -    for comment in Comment.objects.filter(submission=submission):
> > -        body += comment.patch_responses
> > +    for (tagname, value) in SubmissionTag.objects.filter(
> > +            Q(submission=submission)
> > +            | Q(series__in=submission.series.all())).values_list(
> > +                'tag__name', 'value').distinct():
> > +        body += '%s: %s\n' % (tagname, value)
> >  
> >      if postscript:
> >          body += '---\n' + postscript + '\n'
> 
> 
>
Stephen Finucane Sept. 19, 2018, 9:38 p.m. UTC | #3
On Wed, 2018-09-12 at 04:57 -0400, Veronika Kabatova wrote:
> 
> ----- Original Message -----
> > From: "Stephen Finucane" <stephen@that.guru>
> > To: vkabatov@redhat.com, patchwork@lists.ozlabs.org
> > Sent: Tuesday, September 11, 2018 7:55:46 PM
> > Subject: Re: [PATCH RESEND 1/2] Rework tagging infrastructure
> > 
> > On Mon, 2018-07-09 at 18:10 +0200, vkabatov@redhat.com wrote:
> > > From: Veronika Kabatova <vkabatov@redhat.com>
> > > 
> > > Solve #113 and #57 GitHub issues, fix up returning tags in the API,
> > > keep track of tag origin to be able to add tags to comments in the API.
> > > 
> > > Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
> > > tags for each patch in series, and use `series` attribute of
> > > SubmissionTag as a notion of a tag which is related to each patch in the
> > > series (because it comes from cover letter or it's comments)
> > > 
> > > Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>
> > 
> > To start, apologies if I'm repeating myself on any of the comments
> > below: it's been a long time since I first reviewed this (sorry!).
> > Also, for the next revision of this, this can definitely be broken up
> > on purpose :) Given that tags aren't currently shown in the API, I'd
> > move the API changes into a separate patch. Similarly, docs (though not
> > release notes) aren't needed and can be broken up and moved into a
> > separate patch. Now to the review...
> > 
> 
> Sounds good!
> 
> /me goes on to comment on everything

I've been playing with the new revision all evening and, unfortunately,
have been hitting some pretty dire performance issues. I've outlined
what I think to be the cause of the issues below, along with some steps
that might help resolve these issues. Also, many of the comments I have
on the new revision are follow ups to comments from the earlier
revision (this one) so I'm replying here to avoid repeating myself :)

From what I can see, there are two issues here:

   1. The main issue we still have here is that fetching and combining
      tags for a patch is really expensive. This is because we need to
      fetch tags for the patch, its comments, the patch's series and this
      series' comments. We do this in 'Patch.all_tags'.
   2. Related to (1), because Tags are mapped to Submission, we suddenly
      need to join on the Submission table again, something which Daniel
      fixed in commit c97dad1cd.

I realized I've been pushing you in this direction but I didn't forsee
either of these issues upfront. However, I do have an option that might
resolve this.

Currently, SubmissionTag has three foreign keys:

 * submission
 * series, NULLABLE
 * comment, NULLABLE

As noted above, this means getting all tags for a given patch means
fetching tags for the patch, its comments, the patch's series and this
series' comments. From what I can tell, we can minimize the cost of
only the first of these (using prefetch_related). The rest involve a
query _per patch_ in the list which is really, really expensive.

I propose changing SubmissionTag to SeriesTag. This will have three
foreign keys:

 * series
 * patch, NULLABLE
 * comment, NULLABLE

These would be set like so:

 * When you receive a cover letter, you set just the series field
 * When you receive a patch, you set the series and patch fields
 * When you receive a cover letter comment, you set the series and
   comment fields
 * When you receive a patch comment, you set all fields

Once we do this, we should be able to something like the below to get
all tags for a given patch:

   SELECT * FROM patchwork_patch p
   INNER JOIN patchwork_seriestag t ON p.submission_ptr_id = t.patch_id
   UNION
   SELECT * FROM patchwork_patch p
   INNER JOIN patchwork_seriestag t ON p.series_id = t.series_id AND t.patch_id IS NULL;

Apologies for it being in SQL - I still haven't figured out how to do
this in Django's ORM. However, in brief that means:

   Join the two tables where the tag belongs to either that patch or
   the patches series.

If we wanted to get tags for a cover letter, we could then do this:

   SELECT * patchwork_coverletter c
   INNER JOIN patchwork_seriestag t ON c.series_id = c.series_id AND c.patch_id IS NULL;

I _think_ that would work, but the question is figuring out how to
convert that to something the ORM will like. I'll work on this myself
over the coming days, but this is where I got this evening.

More comments below.

> > > ---
> > >  patchwork/api/comment.py        |  18 ++++-
> > >  patchwork/api/cover.py          |  19 ++++-
> > >  patchwork/api/filters.py        |  68 +++++++++++++++-
> > >  patchwork/api/patch.py          |  18 ++++-
> > >  patchwork/models.py             | 173
> > >  ++++++++++++++++------------------------
> > >  patchwork/templatetags/patch.py |   3 +-
> > >  patchwork/views/__init__.py     |   3 -
> > >  patchwork/views/utils.py        |  10 ++-
> > >  8 files changed, 191 insertions(+), 121 deletions(-)
> > > 
> > > diff --git a/patchwork/api/comment.py b/patchwork/api/comment.py
> > > index 5a5adb1..e03207e 100644
> > > --- a/patchwork/api/comment.py
> > > +++ b/patchwork/api/comment.py
> > > @@ -26,6 +26,7 @@ from patchwork.api.base import
> > > BaseHyperlinkedModelSerializer
> > >  from patchwork.api.base import PatchworkPermission
> > >  from patchwork.api.embedded import PersonSerializer
> > >  from patchwork.models import Comment
> > > +from patchwork.models import SubmissionTag
> > >  
> > >  
> > >  class CommentListSerializer(BaseHyperlinkedModelSerializer):
> > > @@ -34,6 +35,7 @@ class
> > > CommentListSerializer(BaseHyperlinkedModelSerializer):
> > >      subject = SerializerMethodField()
> > >      headers = SerializerMethodField()
> > >      submitter = PersonSerializer(read_only=True)
> > > +    tags = SerializerMethodField()
> > >  
> > >      def get_web_url(self, instance):
> > >          request = self.context.get('request')
> > > @@ -43,6 +45,20 @@ class
> > > CommentListSerializer(BaseHyperlinkedModelSerializer):
> > >          return email.parser.Parser().parsestr(comment.headers,
> > >                                                True).get('Subject', '')
> > >  
> > > +    def get_tags(self, instance):
> > > +        if not instance.submission.project.use_tags:
> > > +            return {}
> > > +
> > > +        tags = SubmissionTag.objects.filter(
> > > +            comment=instance
> > > +        ).values_list('tag__name', 'value')
> > > +
> > > +        result = {}
> > > +        for name, value in tags:
> > > +            result.setdefault(name, []).append(value)
> > > +
> > > +        return result
> > 
> > This smells of something that should be placed in models.py. Is there
> > any reason we _can't_ do it there, e.g. via a property?
> > 
> 
> I guess we could (it's been a while since I thought about this code too,
> but right now I can't think of any reason why not).
> 
> > > +
> > >      def get_headers(self, comment):
> > >          headers = {}
> > >  
> > > @@ -60,7 +76,7 @@ class
> > > CommentListSerializer(BaseHyperlinkedModelSerializer):
> > >      class Meta:
> > >          model = Comment
> > >          fields = ('id', 'web_url', 'msgid', 'date', 'subject',
> > >          'submitter',
> > > -                  'content', 'headers')
> > > +                  'content', 'headers', 'tags')
> > >          read_only_fields = fields
> > >          versioned_fields = {
> > >              '1.1': ('web_url', ),
> > 
> > You definitely need changes to 'get_queryset' here. I can help you work
> > on these but tl;dr: look at what we do for 'check_set' in
> > 'patchwork/api/patch.py'.
> 
> Did I mention I'm not a DB master? Any help with performance is
> appreciated :)
> 
> > 
> > > diff --git a/patchwork/api/cover.py b/patchwork/api/cover.py
> > > index b497fd8..b17b9f7 100644
> > > --- a/patchwork/api/cover.py
> > > +++ b/patchwork/api/cover.py
> > > @@ -30,6 +30,7 @@ from patchwork.api.embedded import PersonSerializer
> > >  from patchwork.api.embedded import ProjectSerializer
> > >  from patchwork.api.embedded import SeriesSerializer
> > >  from patchwork.models import CoverLetter
> > > +from patchwork.models import SubmissionTag
> > >  
> > >  
> > >  class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> > > @@ -40,6 +41,7 @@ class
> > > CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> > >      mbox = SerializerMethodField()
> > >      series = SeriesSerializer(many=True, read_only=True)
> > >      comments = SerializerMethodField()
> > > +    tags = SerializerMethodField()
> > >  
> > >      def get_web_url(self, instance):
> > >          request = self.context.get('request')
> > > @@ -53,13 +55,26 @@ class
> > > CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> > >          return self.context.get('request').build_absolute_uri(
> > >              reverse('api-cover-comment-list', kwargs={'pk': cover.id}))
> > >  
> > > +    def get_tags(self, instance):
> > > +        if not instance.project.use_tags:
> > > +            return {}
> > > +
> > > +        tags = SubmissionTag.objects.filter(
> > > +            submission=instance).values_list('tag__name',
> > > 'value').distinct()
> > > +
> > > +        result = {}
> > > +        for name, value in tags:
> > > +            result.setdefault(name, []).append(value)
> > > +
> > > +        return result
> > > +
> > 
> > As above, this should be a property on the model, I imagine.
> > 
> > >      class Meta:
> > >          model = CoverLetter
> > >          fields = ('id', 'url', 'web_url', 'project', 'msgid', 'date',
> > >          'name',
> > > -                  'submitter', 'mbox', 'series', 'comments')
> > > +                  'submitter', 'mbox', 'series', 'comments', 'tags')
> > >          read_only_fields = fields
> > >          versioned_fields = {
> > > -            '1.1': ('web_url', 'mbox', 'comments'),
> > > +            '1.1': ('web_url', 'mbox', 'comments', 'tags'),
> > 
> > This should be '1.2' now.
> > 
> 
> Noted. It was still 1.1 when this patch was originally sent, I think.
> 
> > >          }
> > >          extra_kwargs = {
> > >              'url': {'view_name': 'api-cover-detail'},
> > 
> > Same comments RE: 'get_queryset'.
> > 
> > > diff --git a/patchwork/api/filters.py b/patchwork/api/filters.py
> > > index 73353d9..e213181 100644
> > > --- a/patchwork/api/filters.py
> > > +++ b/patchwork/api/filters.py
> > > @@ -21,9 +21,11 @@ from django.contrib.auth.models import User
> > >  from django.core.exceptions import ValidationError
> > >  from django.db.models import Q
> > >  from django_filters.rest_framework import FilterSet
> > > +from django_filters import Filter
> > >  from django_filters import IsoDateTimeFilter
> > >  from django_filters import ModelMultipleChoiceFilter
> > >  from django.forms import ModelMultipleChoiceField as BaseMultipleChoiceField
> > > +from django.forms.widgets import HiddenInput
> > >  from django.forms.widgets import MultipleHiddenInput
> > >  
> > >  from patchwork.models import Bundle
> > > @@ -35,6 +37,7 @@ from patchwork.models import Person
> > >  from patchwork.models import Project
> > >  from patchwork.models import Series
> > >  from patchwork.models import State
> > > +from patchwork.models import SubmissionTag
> > >  
> > >  
> > >  # custom fields, filters
> > > @@ -136,6 +139,60 @@ class StateFilter(ModelMultipleChoiceFilter):
> > >      field_class = StateChoiceField
> > >  
> > >  
> > > +class TagNameFilter(Filter):
> > > +
> > > +    def filter(self, qs, tag_name):
> > > +        submissions_series = SubmissionTag.objects.filter(
> > > +            tag__name__iexact=tag_name
> > > +        ).values_list('submission__id', 'series')
> > > +
> > > +        submission_list = []
> > > +        series_list = []
> > > +        for submission, series in submissions_series:
> > > +            submission_list.append(submission)
> > > +            series_list.append(series)
> > > +
> > > +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> > > +
> > > +
> > > +class TagValueFilter(Filter):
> > > +
> > > +    def filter(self, qs, tag_value):
> > > +        submissions_series = SubmissionTag.objects.filter(
> > > +            value__icontains=tag_value
> > > +        ).values_list('submission__id', 'series')
> > > +
> > > +        submission_list = []
> > > +        series_list = []
> > > +        for submission, series in submissions_series:
> > > +            submission_list.append(submission)
> > > +            series_list.append(series)
> > > +
> > > +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> > > +
> > > +
> > > +class TagFilter(Filter):
> > > +
> > > +    def filter(self, qs, query):
> > > +        try:
> > > +            tag_name, tag_value = query.split(',', 1)
> > > +        except ValueError:
> > > +            raise ValidationError('Query in format `tag=name,value` expected!')
> > > +
> > > +        submissions_series = SubmissionTag.objects.filter(
> > > +            tag__name__iexact=tag_name,
> > > +            value__icontains=tag_value
> > > +        ).values_list('submission__id', 'series')
> > > +
> > > +        submission_list = []
> > > +        series_list = []
> > > +        for submission, series in submissions_series:
> > > +            submission_list.append(submission)
> > > +            series_list.append(series)
> > > +
> > > +        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> > > +
> > > +
> > 
> > The code for this _looks_ fine, but I do wonder if it's necessary?
> > Would it be possible to support basic wildcarding in the filter
> > instead? Something like this:
> > 
> >   ?tag=*: stephen@that.guru
> > 
> > (but with HTTP encoding). We could also make this wildcarded by
> > default, e.g.
> > 
> >   ?tag=Reviewed-by:
> > 
> > Would return all results for this tag.
> > 
> 
> That's an interesting idea. I had some serious problems getting the
> filtering to work. One of the reasons would have been the chaos with the M:N
> patch-series relationship which would be cleaned up now (yay!). But even
> then, using a single filter for both parts was giving me OR relation between
> the tag and the value instead of AND.
> 
> I'm not sure now (it was a while ago) but I think Django using OR in the
> method I was using was the culprit, and splitting the filters into 3
> different ones was the only way that occurred to me to work around it that
> had the desired functionality.
> 
> In any case, I agree that your idea makes sense. I'll take a look at it, but
> if you have any pointers on how to make it work because of the above, I'd
> really appreciate it.
> 
> > >  class UserChoiceField(ModelMultipleChoiceField):
> > >  
> > >      alternate_lookup = 'username__iexact'
> > > @@ -173,10 +230,14 @@ class CoverLetterFilterSet(TimestampMixin,
> > > FilterSet):
> > >      series = BaseFilter(queryset=Project.objects.all(),
> > >                          widget=MultipleHiddenInput)
> > >      submitter = PersonFilter(queryset=Person.objects.all())
> > > +    tagname = TagNameFilter(label='Tag name')
> > > +    tagvalue = TagValueFilter(widget=HiddenInput)
> > > +    tag = TagFilter(widget=HiddenInput)
> > >  
> > >      class Meta:
> > >          model = CoverLetter
> > > -        fields = ('project', 'series', 'submitter')
> > > +        fields = ('project', 'series', 'submitter', 'tagname', 'tagvalue',
> > > +                  'tag')
> > >  
> > >  
> > >  class PatchFilterSet(TimestampMixin, FilterSet):
> > > @@ -189,11 +250,14 @@ class PatchFilterSet(TimestampMixin, FilterSet):
> > >      submitter = PersonFilter(queryset=Person.objects.all())
> > >      delegate = UserFilter(queryset=User.objects.all())
> > >      state = StateFilter(queryset=State.objects.all())
> > > +    tagname = TagNameFilter(label='Tag name')
> > > +    tagvalue = TagValueFilter(widget=HiddenInput)
> > > +    tag = TagFilter(widget=HiddenInput)
> > >  
> > >      class Meta:
> > >          model = Patch
> > >          fields = ('project', 'series', 'submitter', 'delegate',
> > > -                  'state', 'archived')
> > > +                  'state', 'archived', 'tagname', 'tagvalue', 'tag')
> > >  
> > >  
> > >  class CheckFilterSet(TimestampMixin, FilterSet):
> > > diff --git a/patchwork/api/patch.py b/patchwork/api/patch.py
> > > index 9d890eb..cf97c40 100644
> > > --- a/patchwork/api/patch.py
> > > +++ b/patchwork/api/patch.py
> > > @@ -19,6 +19,7 @@
> > >  
> > >  import email.parser
> > >  
> > > +from django.db.models import Q
> > >  from django.utils.translation import ugettext_lazy as _
> > >  from rest_framework.generics import ListAPIView
> > >  from rest_framework.generics import RetrieveUpdateAPIView
> > > @@ -35,6 +36,7 @@ from patchwork.api.embedded import SeriesSerializer
> > >  from patchwork.api.embedded import UserSerializer
> > >  from patchwork.models import Patch
> > >  from patchwork.models import State
> > > +from patchwork.models import SubmissionTag
> > >  from patchwork.parser import clean_subject
> > >  
> > >  
> > > @@ -109,9 +111,18 @@ class PatchListSerializer(BaseHyperlinkedModelSerializer):
> > >              reverse('api-check-list', kwargs={'patch_id': instance.id}))
> > >  
> > >      def get_tags(self, instance):
> > > -        # TODO(stephenfin): Make tags performant, possibly by reworking the
> > > -        # model
> > > -        return {}
> > > +        if not instance.project.use_tags:
> > > +            return {}
> > > +
> > > +        tags = SubmissionTag.objects.filter(
> > > +            Q(submission=instance) | Q(series__in=instance.series.all())
> > > +        ).values_list('tag__name', 'value').distinct()
> > > +
> > > +        result = {}
> > > +        for name, value in tags:
> > > +            result.setdefault(name, []).append(value)
> > > +
> > > +        return result
> > >  
> > >      class Meta:
> > >          model = Patch
> > > @@ -160,6 +171,7 @@ class PatchDetailSerializer(PatchListSerializer):
> > >              'headers', 'content', 'diff', 'prefixes')
> > >          versioned_fields = PatchListSerializer.Meta.versioned_fields
> > >          extra_kwargs = PatchListSerializer.Meta.extra_kwargs
> > > +        versioned_fields = PatchListSerializer.Meta.versioned_fields
> > >  
> > >  
> > >  class PatchList(ListAPIView):
> > 
> > This is missing the entry in the versioned_fields setting. Also, same
> > comment around 'get_tags'.
> > 
> 
> Good catch! I originally intended for the tags to be visible only in the
> detailed view and missed to add it.
> 
> > > diff --git a/patchwork/models.py b/patchwork/models.py
> > > index 6268f5b..ba53278 100644
> > > --- a/patchwork/models.py
> > > +++ b/patchwork/models.py
> > > @@ -20,8 +20,6 @@
> > >  
> > >  from __future__ import absolute_import
> > >  
> > > -from collections import Counter
> > > -from collections import OrderedDict
> > >  import datetime
> > >  import random
> > >  import re
> > > @@ -31,6 +29,7 @@ from django.conf import settings
> > >  from django.contrib.auth.models import User
> > >  from django.core.exceptions import ValidationError
> > >  from django.db import models
> > > +from django.db.models import Q
> > >  from django.utils.encoding import python_2_unicode_compatible
> > >  from django.utils.functional import cached_property
> > >  
> > > @@ -252,10 +251,6 @@ class Tag(models.Model):
> > >                                        ' tag\'s count in the patch list
> > >                                        view',
> > >                                        default=True)
> > >  
> > > -    @property
> > > -    def attr_name(self):
> > > -        return 'tag_%d_count' % self.id
> > > -
> > >      def __str__(self):
> > >          return self.name
> > >  
> > > @@ -263,64 +258,21 @@ class Tag(models.Model):
> > >          ordering = ['abbrev']
> > >  
> > >  
> > > -class PatchTag(models.Model):
> > > -    patch = models.ForeignKey('Patch', on_delete=models.CASCADE)
> > > +class SubmissionTag(models.Model):
> > > +    submission = models.ForeignKey('Submission', on_delete=models.CASCADE)
> > >      tag = models.ForeignKey('Tag', on_delete=models.CASCADE)
> > > -    count = models.IntegerField(default=1)
> > > +    value = models.CharField(max_length=255)
> > > +    comment = models.ForeignKey('Comment', null=True, on_delete=models.CASCADE)
> > > +    series = models.ForeignKey('Series', null=True, on_delete=models.CASCADE)
> > >  
> > >      class Meta:
> > > -        unique_together = [('patch', 'tag')]
> > > +        unique_together = [('submission', 'tag', 'value', 'comment')]
> > >  
> > >  
> > >  def get_default_initial_patch_state():
> > >      return State.objects.get(ordering=0)
> > >  
> > >  
> > > -class PatchQuerySet(models.query.QuerySet):
> > > -
> > > -    def with_tag_counts(self, project=None):
> > > -        if project and not project.use_tags:
> > > -            return self
> > > -
> > > -        # We need the project's use_tags field loaded for Project.tags().
> > > -        # Using prefetch_related means we'll share the one instance of
> > > -        # Project, and share the project.tags cache between all patch.project
> > > -        # references.
> > > -        qs = self.prefetch_related('project')
> > > -        select = OrderedDict()
> > > -        select_params = []
> > > -
> > > -        # All projects have the same tags, so we're good to go here
> > > -        if project:
> > > -            tags = project.tags
> > > -        else:
> > > -            tags = Tag.objects.all()
> > > -
> > > -        for tag in tags:
> > > -            select[tag.attr_name] = (
> > > -                "coalesce("
> > > -                "(SELECT count FROM patchwork_patchtag"
> > > -                " WHERE patchwork_patchtag.patch_id="
> > > -                "patchwork_patch.submission_ptr_id"
> > > -                " AND patchwork_patchtag.tag_id=%s), 0)")
> > > -            select_params.append(tag.id)
> > > -
> > > -        return qs.extra(select=select, select_params=select_params)
> > > -
> > > -
> > > -class PatchManager(models.Manager):
> > > -    use_for_related_fields = True
> > > -    # NOTE(stephenfin): This is necessary to silence a warning with Django >=
> > > -    # 1.10. Remove when 1.10 is the minimum supported version.
> > > -    silence_use_for_related_fields_deprecation = True
> > > -
> > > -    def get_queryset(self):
> > > -        return PatchQuerySet(self.model, using=self.db)
> > > -
> > > -    def with_tag_counts(self, project):
> > > -        return self.get_queryset().with_tag_counts(project)
> > > -
> > > -
> > >  class EmailMixin(models.Model):
> > >      """Mixin for models with an email-origin."""
> > >      # email metadata
> > > @@ -334,17 +286,16 @@ class EmailMixin(models.Model):
> > >      submitter = models.ForeignKey(Person, on_delete=models.CASCADE)
> > >      content = models.TextField(null=True, blank=True)
> > >  
> > > -    response_re = re.compile(
> > > -        r'^(Tested|Reviewed|Acked|Signed-off|Nacked|Reported)-by:.*$',
> > > -        re.M | re.I)
> > > +    def _extract_tags(self, tags):
> > > +        found_tags = {}
> > >  
> > > -    @property
> > > -    def patch_responses(self):
> > >          if not self.content:
> > > -            return ''
> > > +            return found_tags
> > >  
> > > -        return ''.join([match.group(0) + '\n' for match in
> > > -                        self.response_re.finditer(self.content)])
> > > +        for tag in tags:
> > > +            regex = re.compile(tag.pattern + r'\s*(.*)', re.M | re.I | re.U)
> > > +            found_tags[tag] = regex.findall(self.content)
> > > +        return found_tags
> > 
> > I wonder if we can move this to 'parser.py' now? It doesn't sound like
> > something we'll be doing at runtime so we could move it out of here
> > now.
> > 
> 
> I vaguely remember discussing this with someone before, but don't know whom.
> What if the submission is edited via the admin interface, and the tags,
> being extracted only on the entry creation, will not match the real values in
> the database anymore? While it's not the intended workflow, it is possible to
> do it, and I think consistency between the data is important.
> 
> If there's an easy way to detect if the content of the submission was changed
> and not trigger the recounts otherwise, I'm all for it. Unfortunately, the
> only ways I found were to keep a hidden copy of the field or to fetch the
> data from the DB before saving, and neither of them sounds cool. Or, at least
> to trigger only on the changes coming from the admin interface, which would
> reduce the trigger count a tons already (but still not move it to parser).

This is a very good point BUT I think you're overthinking this :) Is
there a good reason the 'content' field and 'diff' fields of a patch
should be writable? Personally, I don't think they should. I realize
this is a unrelated change but I'd just make those fields immutable and
avoid this whole thing.

(Note: I do realize people could still edit this through SQL or by
making the fields writeable in the admin again, but once you do that,
all guarantees are out the window).

> > >  
> > >      def save(self, *args, **kwargs):
> > >          # Modifying a submission via admin interface changes '\n' newlines
> > >          in
> > > @@ -377,6 +328,40 @@ class Submission(FilenameMixin, EmailMixin,
> > > models.Model):
> > >      # submission metadata
> > >  
> > >      name = models.CharField(max_length=255)
> > > +    related_tags = models.ManyToManyField(Tag, through=SubmissionTag)
> > 
> > Just 'tags'?
> 
> Sure.
> 
> > 
> > > +
> > > +    def add_tags(self, tag, values, comment=None):
> > > +        if hasattr(self, 'patch'):
> > > +            series = None
> > > +        else:
> > > +            series = self.coverletter.series.first()
> > > +
> > > +        current_objs = SubmissionTag.objects.filter(submission=self,
> > > +                                                    comment=comment,
> > > +                                                    tag=tag,
> > > +                                                    series=series)
> > > +
> > > +        if not values:
> > > +            current_objs.delete()
> > > +            return
> > > +
> > > +        # In case the origin is modified, delete tags that were removed
> > > +        current_objs.exclude(value__in=values).delete()
> > > +
> > > +        values_to_add = set(values) - set(current_objs.values_list('value',
> > > +                                                                   flat=True))
> > > +        SubmissionTag.objects.bulk_create([SubmissionTag(
> > > +            submission=self,
> > > +            tag=tag,
> > > +            value=value,
> > > +            comment=comment,
> > > +            series=series
> > > +        ) for value in values_to_add])
> > > +
> > > +    def refresh_tags(self):
> > > +        submission_tags = self._extract_tags(Tag.objects.all())
> > > +        for tag in submission_tags:
> > > +            self.add_tags(tag, submission_tags[tag])
> > 
> > Similar comments around moving these functions out of here and into
> > 'parser.py'. I'd _really_ like to have all this stuff in 'parser.py' or
> > 'retag.py'.
> > 
> 
> See the reasoning above.
> 
> > >  
> > >      # patchwork metadata
> > >  
> > > @@ -426,7 +411,6 @@ class Patch(SeriesMixin, Submission):
> > >      diff = models.TextField(null=True, blank=True)
> > >      commit_ref = models.CharField(max_length=255, null=True, blank=True)
> > >      pull_url = models.CharField(max_length=255, null=True, blank=True)
> > > -    tags = models.ManyToManyField(Tag, through=PatchTag)
> > >  
> > >      # patchwork metadata
> > >  
> > > @@ -440,40 +424,6 @@ class Patch(SeriesMixin, Submission):
> > >      # patches in a project without needing to do a JOIN.
> > >      patch_project = models.ForeignKey(Project, on_delete=models.CASCADE)
> > >  
> > > -    objects = PatchManager()
> > > -
> > > -    @staticmethod
> > > -    def extract_tags(content, tags):
> > > -        counts = Counter()
> > > -
> > > -        for tag in tags:
> > > -            regex = re.compile(tag.pattern, re.MULTILINE | re.IGNORECASE)
> > > -            counts[tag] = len(regex.findall(content))
> > > -
> > > -        return counts
> > > -
> > > -    def _set_tag(self, tag, count):
> > > -        if count == 0:
> > > -            self.patchtag_set.filter(tag=tag).delete()
> > > -            return
> > > -        patchtag, _ = PatchTag.objects.get_or_create(patch=self, tag=tag)
> > > -        if patchtag.count != count:
> > > -            patchtag.count = count
> > > -            patchtag.save()
> > > -
> > > -    def refresh_tag_counts(self):
> > > -        tags = self.project.tags
> > > -        counter = Counter()
> > > -
> > > -        if self.content:
> > > -            counter += self.extract_tags(self.content, tags)
> > > -
> > > -        for comment in self.comments.all():
> > > -            counter = counter + self.extract_tags(comment.content, tags)
> > > -
> > > -        for tag in tags:
> > > -            self._set_tag(tag, counter[tag])
> > > -
> > >      def save(self, *args, **kwargs):
> > >          if not hasattr(self, 'state') or not self.state:
> > >              self.state = get_default_initial_patch_state()
> > > @@ -482,8 +432,7 @@ class Patch(SeriesMixin, Submission):
> > >              self.hash = hash_diff(self.diff)
> > >  
> > >          super(Patch, self).save(**kwargs)
> > > -
> > > -        self.refresh_tag_counts()
> > > +        self.refresh_tags()
> > >  
> > >      def is_editable(self, user):
> > >          if not is_authenticated(user):
> > > @@ -495,6 +444,18 @@ class Patch(SeriesMixin, Submission):
> > >          return self.project.is_editable(user)
> > >  
> > >      @property
> > > +    def all_tags(self):
> > > +        related_tags = SubmissionTag.objects.filter(
> > > +            Q(submission=self) | Q(series__in=self.series.all())
> > > +        ).values_list('tag__name', 'value').distinct()
> > > +
> > > +        patch_tags = {}
> > > +        for tag in self.project.tags:
> > > +            patch_tags[tag] = [related_tag[1] for related_tag in related_tags
> > > +                               if related_tag[0] == tag.name]
> > 
> > Hmm, I don't think you want to be doing a list comprehension here as
> > Python is way slower than the DB. You could do:
> > 
> >   related_tags = related_tags.filter(name__in=self.project.tags)
> > 
> > ...or similar.
> > 
> 
> Good idea!
> 
> > > +        return patch_tags
> > 
> > Yeah, we should probably be using this in the API. Not sure about
> > performance though. This could need some work.
> > 
> 
> As mentioned before, any comments about how to make the performance
> better are welcome :)
> 
> I can make this property to submission class and implement a similar
> one for the comments.
> 
> > > +
> > > +    @property
> > >      def combined_check_state(self):
> > >          """Return the combined state for all checks.
> > >  
> > > @@ -611,13 +572,12 @@ class Comment(EmailMixin, models.Model):
> > >  
> > >      def save(self, *args, **kwargs):
> > >          super(Comment, self).save(*args, **kwargs)
> > > -        if hasattr(self.submission, 'patch'):
> > > -            self.submission.patch.refresh_tag_counts()
> > > +        self.refresh_tags()
> > 
> > I'd _personally_ rather we did this in 'parser.py'. There's no reason
> > to calculate this stuff everytime we change anything about the patch
> > (e.g. updating the state).
> > 
> > > -    def delete(self, *args, **kwargs):
> > > -        super(Comment, self).delete(*args, **kwargs)
> > > -        if hasattr(self.submission, 'patch'):
> > > -            self.submission.patch.refresh_tag_counts()
> > > +    def refresh_tags(self):
> > > +        comment_tags = self._extract_tags(Tag.objects.all())
> > > +        for tag in comment_tags:
> > > +            self.submission.add_tags(tag, comment_tags[tag], comment=self)
> > >  
> > >      def is_editable(self, user):
> > >          return False
> > > @@ -716,6 +676,7 @@ class Series(FilenameMixin, models.Model):
> > >                  self.name = self._format_name(cover)
> > >  
> > >          self.save()
> > > +        cover.refresh_tags()
> > >  
> > >      def add_patch(self, patch, number):
> > >          """Add a patch to the series."""
> > > diff --git a/patchwork/templatetags/patch.py
> > > b/patchwork/templatetags/patch.py
> > > index 4350e09..f226dc0 100644
> > > --- a/patchwork/templatetags/patch.py
> > > +++ b/patchwork/templatetags/patch.py
> > > @@ -34,8 +34,9 @@ register = template.Library()
> > >  def patch_tags(patch):
> > >      counts = []
> > >      titles = []
> > > +    all_tags = patch.all_tags
> > >      for tag in [t for t in patch.project.tags if t.show_column]:
> > > -        count = getattr(patch, tag.attr_name)
> > > +        count = len(all_tags[tag])
> > >          titles.append('%d %s' % (count, tag.name))
> > >          if count == 0:
> > >              counts.append("-")
> > > diff --git a/patchwork/views/__init__.py b/patchwork/views/__init__.py
> > > index f8d23a3..8c41df6 100644
> > > --- a/patchwork/views/__init__.py
> > > +++ b/patchwork/views/__init__.py
> > > @@ -272,9 +272,6 @@ def generic_list(request, project, view,
> > > view_args=None, filter_settings=None,
> > >      if patches is None:
> > >          patches = Patch.objects.filter(patch_project=project)
> > >  
> > > -    # annotate with tag counts
> > > -    patches = patches.with_tag_counts(project)
> > > -
> > >      patches = context['filters'].apply(patches)
> > >      if not editable_order:
> > >          patches = order.apply(patches)
> > > diff --git a/patchwork/views/utils.py b/patchwork/views/utils.py
> > > index 2357ab8..f1f78c8 100644
> > > --- a/patchwork/views/utils.py
> > > +++ b/patchwork/views/utils.py
> > > @@ -27,11 +27,13 @@ import email.utils
> > >  import re
> > >  
> > >  from django.conf import settings
> > > +from django.db.models import Q
> > >  from django.http import Http404
> > >  from django.utils import six
> > >  
> > >  from patchwork.models import Comment
> > >  from patchwork.models import Patch
> > > +from patchwork.models import SubmissionTag
> > >  from patchwork.models import Series
> > >  
> > >  if settings.ENABLE_REST_API:
> > > @@ -75,9 +77,11 @@ def _submission_to_mbox(submission):
> > >      else:
> > >          postscript = ''
> > >  
> > > -    # TODO(stephenfin): Make this use the tags infrastructure
> > > -    for comment in Comment.objects.filter(submission=submission):
> > > -        body += comment.patch_responses
> > > +    for (tagname, value) in SubmissionTag.objects.filter(
> > > +            Q(submission=submission)
> > > +            | Q(series__in=submission.series.all())).values_list(
> > > +                'tag__name', 'value').distinct():
> > > +        body += '%s: %s\n' % (tagname, value)
> > >  
> > >      if postscript:
> > >          body += '---\n' + postscript + '\n'
Veronika Kabatova Sept. 21, 2018, 8:13 p.m. UTC | #4
----- Original Message -----
> From: "Stephen Finucane" <stephen@that.guru>
> To: "Veronika Kabatova" <vkabatov@redhat.com>
> Cc: patchwork@lists.ozlabs.org
> Sent: Wednesday, September 19, 2018 11:38:32 PM
> Subject: Re: [PATCH RESEND 1/2] Rework tagging infrastructure
> 
> On Wed, 2018-09-12 at 04:57 -0400, Veronika Kabatova wrote:
> > 
> > ----- Original Message -----
> > > From: "Stephen Finucane" <stephen@that.guru>
> > > To: vkabatov@redhat.com, patchwork@lists.ozlabs.org
> > > Sent: Tuesday, September 11, 2018 7:55:46 PM
> > > Subject: Re: [PATCH RESEND 1/2] Rework tagging infrastructure
> > > 
> > > On Mon, 2018-07-09 at 18:10 +0200, vkabatov@redhat.com wrote:
> > > > From: Veronika Kabatova <vkabatov@redhat.com>
> > > > 
> > > > Solve #113 and #57 GitHub issues, fix up returning tags in the API,
> > > > keep track of tag origin to be able to add tags to comments in the API.
> > > > 
> > > > Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
> > > > tags for each patch in series, and use `series` attribute of
> > > > SubmissionTag as a notion of a tag which is related to each patch in
> > > > the
> > > > series (because it comes from cover letter or it's comments)
> > > > 
> > > > Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>
> > > 
> > > To start, apologies if I'm repeating myself on any of the comments
> > > below: it's been a long time since I first reviewed this (sorry!).
> > > Also, for the next revision of this, this can definitely be broken up
> > > on purpose :) Given that tags aren't currently shown in the API, I'd
> > > move the API changes into a separate patch. Similarly, docs (though not
> > > release notes) aren't needed and can be broken up and moved into a
> > > separate patch. Now to the review...
> > > 
> > 
> > Sounds good!
> > 
> > /me goes on to comment on everything
> 
> I've been playing with the new revision all evening and, unfortunately,
> have been hitting some pretty dire performance issues. I've outlined
> what I think to be the cause of the issues below, along with some steps
> that might help resolve these issues. Also, many of the comments I have
> on the new revision are follow ups to comments from the earlier
> revision (this one) so I'm replying here to avoid repeating myself :)
> 
> From what I can see, there are two issues here:
> 
>    1. The main issue we still have here is that fetching and combining
>       tags for a patch is really expensive. This is because we need to
>       fetch tags for the patch, its comments, the patch's series and this
>       series' comments. We do this in 'Patch.all_tags'.
>    2. Related to (1), because Tags are mapped to Submission, we suddenly
>       need to join on the Submission table again, something which Daniel
>       fixed in commit c97dad1cd.
> 
> I realized I've been pushing you in this direction but I didn't forsee
> either of these issues upfront. However, I do have an option that might
> resolve this.
> 
> Currently, SubmissionTag has three foreign keys:
> 
>  * submission
>  * series, NULLABLE
>  * comment, NULLABLE
> 
> As noted above, this means getting all tags for a given patch means
> fetching tags for the patch, its comments, the patch's series and this
> series' comments. From what I can tell, we can minimize the cost of
> only the first of these (using prefetch_related). The rest involve a
> query _per patch_ in the list which is really, really expensive.
> 
> I propose changing SubmissionTag to SeriesTag. This will have three
> foreign keys:
> 
>  * series
>  * patch, NULLABLE
>  * comment, NULLABLE
> 
> These would be set like so:
> 
>  * When you receive a cover letter, you set just the series field
>  * When you receive a patch, you set the series and patch fields
>  * When you receive a cover letter comment, you set the series and
>    comment fields
>  * When you receive a patch comment, you set all fields
> 

Interesting idea. I managed to create a working prototype here:
https://github.com/veruu/patchwork/tree/tags_changes

The tests are only partially fixed, but I'll finish that part later when
we agree on the general approach.

> Once we do this, we should be able to something like the below to get
> all tags for a given patch:
> 
>    SELECT * FROM patchwork_patch p
>    INNER JOIN patchwork_seriestag t ON p.submission_ptr_id = t.patch_id
>    UNION
>    SELECT * FROM patchwork_patch p
>    INNER JOIN patchwork_seriestag t ON p.series_id = t.series_id AND
>    t.patch_id IS NULL;
> 
> Apologies for it being in SQL - I still haven't figured out how to do
> this in Django's ORM. However, in brief that means:
> 
>    Join the two tables where the tag belongs to either that patch or
>    the patches series.
> 
> If we wanted to get tags for a cover letter, we could then do this:
> 
>    SELECT * patchwork_coverletter c
>    INNER JOIN patchwork_seriestag t ON c.series_id = c.series_id AND
>    c.patch_id IS NULL;
> 
> I _think_ that would work, but the question is figuring out how to
> convert that to something the ORM will like. I'll work on this myself
> over the coming days, but this is where I got this evening.
> 

Oh yeah, this part is fun. For a single patch (the all_tags property)
it's doable. For the tag filtering with the whole query set, I wasn't
able to create a single request. I also have a feeling that Django's
lazy evaluation for query sets in this case isn't exactly helpful either.
I got something that works, but didn't try to inspect the underlying
query created by Django in either case.

> More comments below.
> 
> > > > ---
> > > >  patchwork/api/comment.py        |  18 ++++-
> > > >  patchwork/api/cover.py          |  19 ++++-
> > > >  patchwork/api/filters.py        |  68 +++++++++++++++-

[snip]

> > 
> > I vaguely remember discussing this with someone before, but don't know
> > whom.
> > What if the submission is edited via the admin interface, and the tags,
> > being extracted only on the entry creation, will not match the real values
> > in
> > the database anymore? While it's not the intended workflow, it is possible
> > to
> > do it, and I think consistency between the data is important.
> > 
> > If there's an easy way to detect if the content of the submission was
> > changed
> > and not trigger the recounts otherwise, I'm all for it. Unfortunately, the
> > only ways I found were to keep a hidden copy of the field or to fetch the
> > data from the DB before saving, and neither of them sounds cool. Or, at
> > least
> > to trigger only on the changes coming from the admin interface, which would
> > reduce the trigger count a tons already (but still not move it to parser).
> 
> This is a very good point BUT I think you're overthinking this :) Is
> there a good reason the 'content' field and 'diff' fields of a patch
> should be writable? Personally, I don't think they should. I realize
> this is a unrelated change but I'd just make those fields immutable and
> avoid this whole thing.
> 
> (Note: I do realize people could still edit this through SQL or by
> making the fields writeable in the admin again, but once you do that,
> all guarantees are out the window).
> 

Some of our maintainers do edit the fields. For example, if the commit
message needs to be edited, instead of having the author resend the patch
or forgetting to fix it when applying the patch they change it in PW when
they see it.

I did double check with them regarding this feature though, and it should
be safe to move tag extraction to parser (which I did for the new linked
version). I can't speak for other people though.


Anyways, let me know if you have other ideas on what to include!
Veronika
diff mbox series

Patch

diff --git a/patchwork/api/comment.py b/patchwork/api/comment.py
index 5a5adb1..e03207e 100644
--- a/patchwork/api/comment.py
+++ b/patchwork/api/comment.py
@@ -26,6 +26,7 @@  from patchwork.api.base import BaseHyperlinkedModelSerializer
 from patchwork.api.base import PatchworkPermission
 from patchwork.api.embedded import PersonSerializer
 from patchwork.models import Comment
+from patchwork.models import SubmissionTag
 
 
 class CommentListSerializer(BaseHyperlinkedModelSerializer):
@@ -34,6 +35,7 @@  class CommentListSerializer(BaseHyperlinkedModelSerializer):
     subject = SerializerMethodField()
     headers = SerializerMethodField()
     submitter = PersonSerializer(read_only=True)
+    tags = SerializerMethodField()
 
     def get_web_url(self, instance):
         request = self.context.get('request')
@@ -43,6 +45,20 @@  class CommentListSerializer(BaseHyperlinkedModelSerializer):
         return email.parser.Parser().parsestr(comment.headers,
                                               True).get('Subject', '')
 
+    def get_tags(self, instance):
+        if not instance.submission.project.use_tags:
+            return {}
+
+        tags = SubmissionTag.objects.filter(
+            comment=instance
+        ).values_list('tag__name', 'value')
+
+        result = {}
+        for name, value in tags:
+            result.setdefault(name, []).append(value)
+
+        return result
+
     def get_headers(self, comment):
         headers = {}
 
@@ -60,7 +76,7 @@  class CommentListSerializer(BaseHyperlinkedModelSerializer):
     class Meta:
         model = Comment
         fields = ('id', 'web_url', 'msgid', 'date', 'subject', 'submitter',
-                  'content', 'headers')
+                  'content', 'headers', 'tags')
         read_only_fields = fields
         versioned_fields = {
             '1.1': ('web_url', ),
diff --git a/patchwork/api/cover.py b/patchwork/api/cover.py
index b497fd8..b17b9f7 100644
--- a/patchwork/api/cover.py
+++ b/patchwork/api/cover.py
@@ -30,6 +30,7 @@  from patchwork.api.embedded import PersonSerializer
 from patchwork.api.embedded import ProjectSerializer
 from patchwork.api.embedded import SeriesSerializer
 from patchwork.models import CoverLetter
+from patchwork.models import SubmissionTag
 
 
 class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
@@ -40,6 +41,7 @@  class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
     mbox = SerializerMethodField()
     series = SeriesSerializer(many=True, read_only=True)
     comments = SerializerMethodField()
+    tags = SerializerMethodField()
 
     def get_web_url(self, instance):
         request = self.context.get('request')
@@ -53,13 +55,26 @@  class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
         return self.context.get('request').build_absolute_uri(
             reverse('api-cover-comment-list', kwargs={'pk': cover.id}))
 
+    def get_tags(self, instance):
+        if not instance.project.use_tags:
+            return {}
+
+        tags = SubmissionTag.objects.filter(
+            submission=instance).values_list('tag__name', 'value').distinct()
+
+        result = {}
+        for name, value in tags:
+            result.setdefault(name, []).append(value)
+
+        return result
+
     class Meta:
         model = CoverLetter
         fields = ('id', 'url', 'web_url', 'project', 'msgid', 'date', 'name',
-                  'submitter', 'mbox', 'series', 'comments')
+                  'submitter', 'mbox', 'series', 'comments', 'tags')
         read_only_fields = fields
         versioned_fields = {
-            '1.1': ('web_url', 'mbox', 'comments'),
+            '1.1': ('web_url', 'mbox', 'comments', 'tags'),
         }
         extra_kwargs = {
             'url': {'view_name': 'api-cover-detail'},
diff --git a/patchwork/api/filters.py b/patchwork/api/filters.py
index 73353d9..e213181 100644
--- a/patchwork/api/filters.py
+++ b/patchwork/api/filters.py
@@ -21,9 +21,11 @@  from django.contrib.auth.models import User
 from django.core.exceptions import ValidationError
 from django.db.models import Q
 from django_filters.rest_framework import FilterSet
+from django_filters import Filter
 from django_filters import IsoDateTimeFilter
 from django_filters import ModelMultipleChoiceFilter
 from django.forms import ModelMultipleChoiceField as BaseMultipleChoiceField
+from django.forms.widgets import HiddenInput
 from django.forms.widgets import MultipleHiddenInput
 
 from patchwork.models import Bundle
@@ -35,6 +37,7 @@  from patchwork.models import Person
 from patchwork.models import Project
 from patchwork.models import Series
 from patchwork.models import State
+from patchwork.models import SubmissionTag
 
 
 # custom fields, filters
@@ -136,6 +139,60 @@  class StateFilter(ModelMultipleChoiceFilter):
     field_class = StateChoiceField
 
 
+class TagNameFilter(Filter):
+
+    def filter(self, qs, tag_name):
+        submissions_series = SubmissionTag.objects.filter(
+            tag__name__iexact=tag_name
+        ).values_list('submission__id', 'series')
+
+        submission_list = []
+        series_list = []
+        for submission, series in submissions_series:
+            submission_list.append(submission)
+            series_list.append(series)
+
+        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
+
+
+class TagValueFilter(Filter):
+
+    def filter(self, qs, tag_value):
+        submissions_series = SubmissionTag.objects.filter(
+            value__icontains=tag_value
+        ).values_list('submission__id', 'series')
+
+        submission_list = []
+        series_list = []
+        for submission, series in submissions_series:
+            submission_list.append(submission)
+            series_list.append(series)
+
+        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
+
+
+class TagFilter(Filter):
+
+    def filter(self, qs, query):
+        try:
+            tag_name, tag_value = query.split(',', 1)
+        except ValueError:
+            raise ValidationError('Query in format `tag=name,value` expected!')
+
+        submissions_series = SubmissionTag.objects.filter(
+            tag__name__iexact=tag_name,
+            value__icontains=tag_value
+        ).values_list('submission__id', 'series')
+
+        submission_list = []
+        series_list = []
+        for submission, series in submissions_series:
+            submission_list.append(submission)
+            series_list.append(series)
+
+        return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
+
+
 class UserChoiceField(ModelMultipleChoiceField):
 
     alternate_lookup = 'username__iexact'
@@ -173,10 +230,14 @@  class CoverLetterFilterSet(TimestampMixin, FilterSet):
     series = BaseFilter(queryset=Project.objects.all(),
                         widget=MultipleHiddenInput)
     submitter = PersonFilter(queryset=Person.objects.all())
+    tagname = TagNameFilter(label='Tag name')
+    tagvalue = TagValueFilter(widget=HiddenInput)
+    tag = TagFilter(widget=HiddenInput)
 
     class Meta:
         model = CoverLetter
-        fields = ('project', 'series', 'submitter')
+        fields = ('project', 'series', 'submitter', 'tagname', 'tagvalue',
+                  'tag')
 
 
 class PatchFilterSet(TimestampMixin, FilterSet):
@@ -189,11 +250,14 @@  class PatchFilterSet(TimestampMixin, FilterSet):
     submitter = PersonFilter(queryset=Person.objects.all())
     delegate = UserFilter(queryset=User.objects.all())
     state = StateFilter(queryset=State.objects.all())
+    tagname = TagNameFilter(label='Tag name')
+    tagvalue = TagValueFilter(widget=HiddenInput)
+    tag = TagFilter(widget=HiddenInput)
 
     class Meta:
         model = Patch
         fields = ('project', 'series', 'submitter', 'delegate',
-                  'state', 'archived')
+                  'state', 'archived', 'tagname', 'tagvalue', 'tag')
 
 
 class CheckFilterSet(TimestampMixin, FilterSet):
diff --git a/patchwork/api/patch.py b/patchwork/api/patch.py
index 9d890eb..cf97c40 100644
--- a/patchwork/api/patch.py
+++ b/patchwork/api/patch.py
@@ -19,6 +19,7 @@ 
 
 import email.parser
 
+from django.db.models import Q
 from django.utils.translation import ugettext_lazy as _
 from rest_framework.generics import ListAPIView
 from rest_framework.generics import RetrieveUpdateAPIView
@@ -35,6 +36,7 @@  from patchwork.api.embedded import SeriesSerializer
 from patchwork.api.embedded import UserSerializer
 from patchwork.models import Patch
 from patchwork.models import State
+from patchwork.models import SubmissionTag
 from patchwork.parser import clean_subject
 
 
@@ -109,9 +111,18 @@  class PatchListSerializer(BaseHyperlinkedModelSerializer):
             reverse('api-check-list', kwargs={'patch_id': instance.id}))
 
     def get_tags(self, instance):
-        # TODO(stephenfin): Make tags performant, possibly by reworking the
-        # model
-        return {}
+        if not instance.project.use_tags:
+            return {}
+
+        tags = SubmissionTag.objects.filter(
+            Q(submission=instance) | Q(series__in=instance.series.all())
+        ).values_list('tag__name', 'value').distinct()
+
+        result = {}
+        for name, value in tags:
+            result.setdefault(name, []).append(value)
+
+        return result
 
     class Meta:
         model = Patch
@@ -160,6 +171,7 @@  class PatchDetailSerializer(PatchListSerializer):
             'headers', 'content', 'diff', 'prefixes')
         versioned_fields = PatchListSerializer.Meta.versioned_fields
         extra_kwargs = PatchListSerializer.Meta.extra_kwargs
+        versioned_fields = PatchListSerializer.Meta.versioned_fields
 
 
 class PatchList(ListAPIView):
diff --git a/patchwork/models.py b/patchwork/models.py
index 6268f5b..ba53278 100644
--- a/patchwork/models.py
+++ b/patchwork/models.py
@@ -20,8 +20,6 @@ 
 
 from __future__ import absolute_import
 
-from collections import Counter
-from collections import OrderedDict
 import datetime
 import random
 import re
@@ -31,6 +29,7 @@  from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.exceptions import ValidationError
 from django.db import models
+from django.db.models import Q
 from django.utils.encoding import python_2_unicode_compatible
 from django.utils.functional import cached_property
 
@@ -252,10 +251,6 @@  class Tag(models.Model):
                                       ' tag\'s count in the patch list view',
                                       default=True)
 
-    @property
-    def attr_name(self):
-        return 'tag_%d_count' % self.id
-
     def __str__(self):
         return self.name
 
@@ -263,64 +258,21 @@  class Tag(models.Model):
         ordering = ['abbrev']
 
 
-class PatchTag(models.Model):
-    patch = models.ForeignKey('Patch', on_delete=models.CASCADE)
+class SubmissionTag(models.Model):
+    submission = models.ForeignKey('Submission', on_delete=models.CASCADE)
     tag = models.ForeignKey('Tag', on_delete=models.CASCADE)
-    count = models.IntegerField(default=1)
+    value = models.CharField(max_length=255)
+    comment = models.ForeignKey('Comment', null=True, on_delete=models.CASCADE)
+    series = models.ForeignKey('Series', null=True, on_delete=models.CASCADE)
 
     class Meta:
-        unique_together = [('patch', 'tag')]
+        unique_together = [('submission', 'tag', 'value', 'comment')]
 
 
 def get_default_initial_patch_state():
     return State.objects.get(ordering=0)
 
 
-class PatchQuerySet(models.query.QuerySet):
-
-    def with_tag_counts(self, project=None):
-        if project and not project.use_tags:
-            return self
-
-        # We need the project's use_tags field loaded for Project.tags().
-        # Using prefetch_related means we'll share the one instance of
-        # Project, and share the project.tags cache between all patch.project
-        # references.
-        qs = self.prefetch_related('project')
-        select = OrderedDict()
-        select_params = []
-
-        # All projects have the same tags, so we're good to go here
-        if project:
-            tags = project.tags
-        else:
-            tags = Tag.objects.all()
-
-        for tag in tags:
-            select[tag.attr_name] = (
-                "coalesce("
-                "(SELECT count FROM patchwork_patchtag"
-                " WHERE patchwork_patchtag.patch_id="
-                "patchwork_patch.submission_ptr_id"
-                " AND patchwork_patchtag.tag_id=%s), 0)")
-            select_params.append(tag.id)
-
-        return qs.extra(select=select, select_params=select_params)
-
-
-class PatchManager(models.Manager):
-    use_for_related_fields = True
-    # NOTE(stephenfin): This is necessary to silence a warning with Django >=
-    # 1.10. Remove when 1.10 is the minimum supported version.
-    silence_use_for_related_fields_deprecation = True
-
-    def get_queryset(self):
-        return PatchQuerySet(self.model, using=self.db)
-
-    def with_tag_counts(self, project):
-        return self.get_queryset().with_tag_counts(project)
-
-
 class EmailMixin(models.Model):
     """Mixin for models with an email-origin."""
     # email metadata
@@ -334,17 +286,16 @@  class EmailMixin(models.Model):
     submitter = models.ForeignKey(Person, on_delete=models.CASCADE)
     content = models.TextField(null=True, blank=True)
 
-    response_re = re.compile(
-        r'^(Tested|Reviewed|Acked|Signed-off|Nacked|Reported)-by:.*$',
-        re.M | re.I)
+    def _extract_tags(self, tags):
+        found_tags = {}
 
-    @property
-    def patch_responses(self):
         if not self.content:
-            return ''
+            return found_tags
 
-        return ''.join([match.group(0) + '\n' for match in
-                        self.response_re.finditer(self.content)])
+        for tag in tags:
+            regex = re.compile(tag.pattern + r'\s*(.*)', re.M | re.I | re.U)
+            found_tags[tag] = regex.findall(self.content)
+        return found_tags
 
     def save(self, *args, **kwargs):
         # Modifying a submission via admin interface changes '\n' newlines in
@@ -377,6 +328,40 @@  class Submission(FilenameMixin, EmailMixin, models.Model):
     # submission metadata
 
     name = models.CharField(max_length=255)
+    related_tags = models.ManyToManyField(Tag, through=SubmissionTag)
+
+    def add_tags(self, tag, values, comment=None):
+        if hasattr(self, 'patch'):
+            series = None
+        else:
+            series = self.coverletter.series.first()
+
+        current_objs = SubmissionTag.objects.filter(submission=self,
+                                                    comment=comment,
+                                                    tag=tag,
+                                                    series=series)
+
+        if not values:
+            current_objs.delete()
+            return
+
+        # In case the origin is modified, delete tags that were removed
+        current_objs.exclude(value__in=values).delete()
+
+        values_to_add = set(values) - set(current_objs.values_list('value',
+                                                                   flat=True))
+        SubmissionTag.objects.bulk_create([SubmissionTag(
+            submission=self,
+            tag=tag,
+            value=value,
+            comment=comment,
+            series=series
+        ) for value in values_to_add])
+
+    def refresh_tags(self):
+        submission_tags = self._extract_tags(Tag.objects.all())
+        for tag in submission_tags:
+            self.add_tags(tag, submission_tags[tag])
 
     # patchwork metadata
 
@@ -426,7 +411,6 @@  class Patch(SeriesMixin, Submission):
     diff = models.TextField(null=True, blank=True)
     commit_ref = models.CharField(max_length=255, null=True, blank=True)
     pull_url = models.CharField(max_length=255, null=True, blank=True)
-    tags = models.ManyToManyField(Tag, through=PatchTag)
 
     # patchwork metadata
 
@@ -440,40 +424,6 @@  class Patch(SeriesMixin, Submission):
     # patches in a project without needing to do a JOIN.
     patch_project = models.ForeignKey(Project, on_delete=models.CASCADE)
 
-    objects = PatchManager()
-
-    @staticmethod
-    def extract_tags(content, tags):
-        counts = Counter()
-
-        for tag in tags:
-            regex = re.compile(tag.pattern, re.MULTILINE | re.IGNORECASE)
-            counts[tag] = len(regex.findall(content))
-
-        return counts
-
-    def _set_tag(self, tag, count):
-        if count == 0:
-            self.patchtag_set.filter(tag=tag).delete()
-            return
-        patchtag, _ = PatchTag.objects.get_or_create(patch=self, tag=tag)
-        if patchtag.count != count:
-            patchtag.count = count
-            patchtag.save()
-
-    def refresh_tag_counts(self):
-        tags = self.project.tags
-        counter = Counter()
-
-        if self.content:
-            counter += self.extract_tags(self.content, tags)
-
-        for comment in self.comments.all():
-            counter = counter + self.extract_tags(comment.content, tags)
-
-        for tag in tags:
-            self._set_tag(tag, counter[tag])
-
     def save(self, *args, **kwargs):
         if not hasattr(self, 'state') or not self.state:
             self.state = get_default_initial_patch_state()
@@ -482,8 +432,7 @@  class Patch(SeriesMixin, Submission):
             self.hash = hash_diff(self.diff)
 
         super(Patch, self).save(**kwargs)
-
-        self.refresh_tag_counts()
+        self.refresh_tags()
 
     def is_editable(self, user):
         if not is_authenticated(user):
@@ -495,6 +444,18 @@  class Patch(SeriesMixin, Submission):
         return self.project.is_editable(user)
 
     @property
+    def all_tags(self):
+        related_tags = SubmissionTag.objects.filter(
+            Q(submission=self) | Q(series__in=self.series.all())
+        ).values_list('tag__name', 'value').distinct()
+
+        patch_tags = {}
+        for tag in self.project.tags:
+            patch_tags[tag] = [related_tag[1] for related_tag in related_tags
+                               if related_tag[0] == tag.name]
+        return patch_tags
+
+    @property
     def combined_check_state(self):
         """Return the combined state for all checks.
 
@@ -611,13 +572,12 @@  class Comment(EmailMixin, models.Model):
 
     def save(self, *args, **kwargs):
         super(Comment, self).save(*args, **kwargs)
-        if hasattr(self.submission, 'patch'):
-            self.submission.patch.refresh_tag_counts()
+        self.refresh_tags()
 
-    def delete(self, *args, **kwargs):
-        super(Comment, self).delete(*args, **kwargs)
-        if hasattr(self.submission, 'patch'):
-            self.submission.patch.refresh_tag_counts()
+    def refresh_tags(self):
+        comment_tags = self._extract_tags(Tag.objects.all())
+        for tag in comment_tags:
+            self.submission.add_tags(tag, comment_tags[tag], comment=self)
 
     def is_editable(self, user):
         return False
@@ -716,6 +676,7 @@  class Series(FilenameMixin, models.Model):
                 self.name = self._format_name(cover)
 
         self.save()
+        cover.refresh_tags()
 
     def add_patch(self, patch, number):
         """Add a patch to the series."""
diff --git a/patchwork/templatetags/patch.py b/patchwork/templatetags/patch.py
index 4350e09..f226dc0 100644
--- a/patchwork/templatetags/patch.py
+++ b/patchwork/templatetags/patch.py
@@ -34,8 +34,9 @@  register = template.Library()
 def patch_tags(patch):
     counts = []
     titles = []
+    all_tags = patch.all_tags
     for tag in [t for t in patch.project.tags if t.show_column]:
-        count = getattr(patch, tag.attr_name)
+        count = len(all_tags[tag])
         titles.append('%d %s' % (count, tag.name))
         if count == 0:
             counts.append("-")
diff --git a/patchwork/views/__init__.py b/patchwork/views/__init__.py
index f8d23a3..8c41df6 100644
--- a/patchwork/views/__init__.py
+++ b/patchwork/views/__init__.py
@@ -272,9 +272,6 @@  def generic_list(request, project, view, view_args=None, filter_settings=None,
     if patches is None:
         patches = Patch.objects.filter(patch_project=project)
 
-    # annotate with tag counts
-    patches = patches.with_tag_counts(project)
-
     patches = context['filters'].apply(patches)
     if not editable_order:
         patches = order.apply(patches)
diff --git a/patchwork/views/utils.py b/patchwork/views/utils.py
index 2357ab8..f1f78c8 100644
--- a/patchwork/views/utils.py
+++ b/patchwork/views/utils.py
@@ -27,11 +27,13 @@  import email.utils
 import re
 
 from django.conf import settings
+from django.db.models import Q
 from django.http import Http404
 from django.utils import six
 
 from patchwork.models import Comment
 from patchwork.models import Patch
+from patchwork.models import SubmissionTag
 from patchwork.models import Series
 
 if settings.ENABLE_REST_API:
@@ -75,9 +77,11 @@  def _submission_to_mbox(submission):
     else:
         postscript = ''
 
-    # TODO(stephenfin): Make this use the tags infrastructure
-    for comment in Comment.objects.filter(submission=submission):
-        body += comment.patch_responses
+    for (tagname, value) in SubmissionTag.objects.filter(
+            Q(submission=submission)
+            | Q(series__in=submission.series.all())).values_list(
+                'tag__name', 'value').distinct():
+        body += '%s: %s\n' % (tagname, value)
 
     if postscript:
         body += '---\n' + postscript + '\n'