diff mbox

Add a source_tree field to Project.

Message ID 1301607301.2758.40.camel@feioso
State RFC
Headers show

Commit Message

Guilherme Salgado March 31, 2011, 9:35 p.m. UTC
And here's the diff which I forgot to attach to the previous message.

On Thu, 2011-03-31 at 18:32 -0300, Guilherme Salgado wrote:
> On Wed, 2011-03-30 at 18:25 -0300, Guilherme Salgado wrote:
> > On Wed, 2011-03-30 at 12:10 +0800, Jeremy Kerr wrote:
> > > Hi Guilherme,
> > > 
> > > >  apps/patchwork/models.py                      |    1 +
> > > >  lib/sql/migration/008-project-source-tree.sql |    3 +++
> > > >  2 files changed, 4 insertions(+), 0 deletions(-)
> > > >  create mode 100644 lib/sql/migration/008-project-source-tree.sql
> > > 
> > > Looks good. I'd like to wait until there is a user of this field before 
> > > merging the change though - or are you using this for linaro-internal things?
> > 
> > I'm using it in a script I'm writing to fetch the git history of every
> > project and scan that looking for patches that have been committed. Just
> > like the existing patchwork-update-commits script does but this one is
> > fully automated, and to make it more easily testable I'm experimenting
> > with python-dulwich to scan the git history. 
> 
> So, although I could do most things using python-dulwich it was failing
> in some cases and I just don't have the time to chase down these
> failures and fix them, so I resorted to running git on a subprocess. I'm
> still writing the script in python as that allows me to have at least
> some test coverage, which is something very valuable to me.
> 
> This new version of the script works similarly to the previous version.
> There are a few things that need to be changed but it has some tests and
> is able to scan the qemu tree and update the status of a couple patches.
> Also note that it updates the commit_ref of a patch, which will allow us
> to generate a link to the project's gitweb once we have the commit_url
> field.
> 
> I'd appreciate some feedback on this; especially whether or not this is
> something that's going to be useful upstream and, if so, if the current
> approach is reasonable.
> 
> Cheers,
>
diff mbox

Patch

commit ede49cf2cb17ecf08e81290bb475c700b2314a67
Author: Guilherme Salgado <guilherme.salgado@linaro.org>
Date:   Fri Mar 25 15:59:28 2011 -0300

    Adds a script which goes through all registered projects looking for patches that have been committed already
    
    It does that by checking out the project's source code from its VCS of choice
    (currently only git is supported, though), scanning the commits there and
    comparing them to the patches in Patchwork.

diff --git a/apps/patchwork/bin/update-committed-patches.py b/apps/patchwork/bin/update-committed-patches.py
new file mode 100755
index 0000000..5cc03d6
--- /dev/null
+++ b/apps/patchwork/bin/update-committed-patches.py
@@ -0,0 +1,38 @@ 
+#!/usr/bin/python
+"""Flag patches as Accepted once they show up in their project's history.
+
+For every project with a source tree configured, get a local checkout of
+that tree, hash the diff of every commit that has not been scanned yet and
+mark every patch of the project whose hash matches as 'Accepted', storing
+the id of the matching commit in the patch's commit_ref.
+"""
+
+import _pythonpath
+from patchwork.models import Patch, Project, State
+from patchwork.utils import (
+    ensure_source_checkout_for_project, get_hashes_for_commits)
+
+
+for project in Project.objects.all():
+    # Projects without a source tree have nothing for us to scan.
+    if project.source_tree is None:
+        continue
+
+    print "\n"
+    print "="*80
+    print "Scanning commits of %s" % project.name
+    print "="*80
+
+    try:
+        root = ensure_source_checkout_for_project(project)
+    except AssertionError:
+        print ("Skipping %s as we couldn't get a source checkout" %
+               project.name)
+        continue
+
+    hashes = get_hashes_for_commits(
+        root, start_at=project.last_seen_commit_ref)
+    for commit_id, patch_hash in hashes:
+        # There may be multiple patches with the same hash. That's usually
+        # the case when a second version of a patch series is submitted
+        # and some of the patches in the series are identical in both
+        # series.
+        for patch in Patch.objects.filter(project=project, hash=patch_hash):
+            patch.state = State.objects.get(name='Accepted')
+            patch.commit_ref = commit_id
+            # Persist the new state and commit reference; without saving,
+            # the assignments above only touch the in-memory object.
+            patch.save()
+            print patch, patch.state
+
+        # Record how far we got, so that the next run only asks for the
+        # commits that come after this one.
+        project.last_seen_commit_ref = commit_id
+        project.save()
diff --git a/apps/patchwork/tests/__init__.py b/apps/patchwork/tests/__init__.py
index 68fe563..e79331b 100644
--- a/apps/patchwork/tests/__init__.py
+++ b/apps/patchwork/tests/__init__.py
@@ -23,3 +23,4 @@  from patchwork.tests.bundles import *
 from patchwork.tests.mboxviews import *
 from patchwork.tests.updates import *
 from patchwork.tests.filters import *
+from patchwork.tests.test_utils import *
diff --git a/apps/patchwork/tests/test_utils.py b/apps/patchwork/tests/test_utils.py
new file mode 100644
index 0000000..6c058a2
--- /dev/null
+++ b/apps/patchwork/tests/test_utils.py
@@ -0,0 +1,83 @@ 
+
+import atexit
+import shutil
+import tempfile
+from time import time
+from unittest import TestCase
+
+from dulwich.objects import Blob, Commit, parse_timezone, Tree
+from dulwich.repo import Repo
+
+from patchwork.utils import get_hashes_for_commits, get_commits_to_parse
+
+
+class TestGitRepoScanning(TestCase):
+    """Tests for helper functions that scan commits on a git repo."""
+
+    def test_get_commits_to_parse(self):
+        repo = self.create_git_repo()
+        commit = self.add_file_and_commit(repo, 'foo', 'Content1')
+        commit2 = self.add_file_and_commit(repo, 'bar', 'Content2', commit)
+        commit3 = self.add_file_and_commit(repo, 'baz', 'Content3', commit2)
+        self.assertEqual(
+            [commit.id, commit2.id, commit3.id],
+            get_commits_to_parse(repo.path, start_at=None))
+
+    def test_get_hashes_for_commits(self):
+        repo = self.create_git_repo()
+        commit = self.add_file_and_commit(repo, 'foo', 'Content1')
+        commit2 = self.add_file_and_commit(repo, 'bar', 'Content2', commit)
+        self.assertEqual(
+            [(commit.id, 'a082b23263e8e3366bf3c387ffdcb8b21658e3ad'),
+             (commit2.id, '5c010402c5673981ee3e1712e6a037de3ff9cae4')],
+            list(get_hashes_for_commits(repo.path, start_at=None)))
+
+    def test_get_hashes_for_commits_with_single_empty_patch(self):
+        # Here the repo has only one commit which just adds an empty file, so
+        # parse_patch() returns None and we don't have any hash to check.
+        # XXX: Maybe parse_patch() should be fixed to not return None
+        # in these cases?
+        repo = self.create_git_repo()
+        self.add_file_and_commit(repo, 'foo', '')
+        self.assertEqual(
+            [], list(get_hashes_for_commits(repo.path, start_at=None)))
+
+    def test_get_hashes_for_commits_starting_at_specific_commit(self):
+        repo = self.create_git_repo()
+        commit = self.add_file_and_commit(repo, 'foo', 'Content1')
+        commit2 = self.add_file_and_commit(repo, 'bar', 'Content2', commit)
+        commit3 = self.add_file_and_commit(repo, 'baz', 'Content3', commit2)
+        self.assertEqual(
+            [(commit3.id, '11d22fa0986b3bb341baa76b8a6a757a46a2f916')],
+            list(get_hashes_for_commits(repo.path, start_at=commit2.id)))
+
+    def create_git_repo(self):
+        """Return a new git repo living in a fresh temporary directory.
+
+        The directory is removed when the interpreter exits.
+        """
+        tmpdir = tempfile.mkdtemp()
+        atexit.register(shutil.rmtree, tmpdir)
+        repo = Repo.init(tmpdir)
+        return repo
+
+    def add_file_and_commit(self, repo, filename, data, parent=None):
+        """Commit a file named `filename` containing `data` to `repo`.
+
+        When `parent` is given, the new commit is created on top of it and
+        its tree starts off as the parent's tree; otherwise the commit has
+        no parents and its tree holds only the new file.  The master branch
+        is pointed at the new commit, which is then returned.
+        """
+        blob = Blob.from_string(data)
+        parents = []
+        tree = Tree()
+        if parent is not None:
+            tree = repo[parent.tree]
+            parents = [parent.id]
+        tree.add(0100644, filename, blob.id)
+        commit = Commit()
+        commit.tree = tree.id
+        author = 'You <you@example.com>'
+        commit.author = commit.committer = author
+        commit.commit_time = commit.author_time = int(time())
+        tz = parse_timezone('-0200')[0]
+        commit.commit_timezone = commit.author_timezone = tz
+        commit.encoding = "UTF-8"
+        commit.message = "A commit"
+        commit.parents = parents
+        # The objects have to be in the repo's object store before the
+        # branch can be pointed at the new commit.
+        object_store = repo.object_store
+        object_store.add_object(blob)
+        object_store.add_object(tree)
+        object_store.add_object(commit)
+        repo.refs['refs/heads/master'] = commit.id
+        return commit
diff --git a/apps/patchwork/utils.py b/apps/patchwork/utils.py
index e41ffb6..865c3e0 100644
--- a/apps/patchwork/utils.py
+++ b/apps/patchwork/utils.py
@@ -17,8 +17,12 @@ 
 # along with Patchwork; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
+import chardet
+import os
+import subprocess
 
-from patchwork.models import Bundle, Project, BundlePatch
+from patchwork.parser import hash_patch, parse_patch
+from patchwork.models import Bundle, BundlePatch
 from django.shortcuts import get_object_or_404
 
 def get_patch_ids(d, prefix = 'patch_id'):
@@ -137,3 +141,57 @@  def set_bundle(user, project, action, data, patches, context):
     bundle.save()
 
     return []
+
+
+class SourceCheckoutError(AssertionError):
+    """Raised when a project's source tree can't be cloned or fetched.
+
+    It derives from AssertionError so that callers catching the exception
+    ensure_source_checkout_for_project() has always raised keep working.
+    """
+
+
+def ensure_source_checkout_for_project(project, forest='/home/salgado/src'):
+    """Clone the project's source tree, or fetch from it if already cloned.
+
+    :param project: The project whose source_tree is to be checked out; its
+        linkname is used as the name of the checkout's directory.
+    :param forest: The directory under which the checkouts are stored.  The
+        default is only kept for backwards compatibility and is specific
+        to one machine, so callers should pass their own.
+    :return: The path to the project's checkout.
+    :raises SourceCheckoutError: If the git command exits with a non-zero
+        status.
+    """
+    root = os.path.join(forest, project.linkname)
+    if not os.path.exists(root):
+        args, cwd = ['git', 'clone', project.source_tree, root], None
+    else:
+        # NOTE(review): fetching from a URL presumably only updates
+        # FETCH_HEAD and leaves HEAD where the initial clone put it, in
+        # which case later scans of '<start_at>..HEAD' would never see new
+        # commits -- confirm, and merge/reset after the fetch if so.
+        args, cwd = ['git', 'fetch', '-f', project.source_tree], root
+    # stdout is captured only to keep git's output off our own stdout.
+    proc = subprocess.Popen(args, cwd=cwd, stdout=subprocess.PIPE)
+    proc.communicate()
+    if proc.returncode != 0:
+        raise SourceCheckoutError(
+            "'%s' exited with status %d for project %s" % (
+                ' '.join(args), proc.returncode, project.linkname))
+    return root
+
+
+def get_commits_to_parse(root, start_at):
+    """Return the ids of the commits to scan in the checkout at `root`.
+
+    The ids come from 'git rev-list --reverse' run inside `root`.  With a
+    `start_at` commit, the range '<start_at>..HEAD' is listed; without one,
+    the listing of HEAD is limited to 2000 commits.
+
+    The exit status of git is not checked, so a failed run simply results
+    in whatever ids (usually none) were written to its stdout.
+    """
+    if start_at:
+        revisions = ['%s..HEAD' % start_at]
+    else:
+        revisions = ['--max-count=2000', 'HEAD']
+    command = ['git', 'rev-list', '--reverse'] + revisions
+    rev_list = subprocess.Popen(command, cwd=root, stdout=subprocess.PIPE)
+    output = rev_list.communicate()[0]
+    # rev-list prints one id per line, so splitting on whitespace is enough.
+    return output.split()
+
+
+def _decode_commit_output(raw):
+    """Return the output of 'git show' as unicode, or None if we can't."""
+    try:
+        return raw.decode('utf-8')
+    except UnicodeDecodeError:
+        pass
+    # XXX: Should either add chardet as a dependency or skip this
+    # block when it's not available.
+    # chardet.detect is rather slow so we only use it when we fail
+    # to decode from utf-8.
+    encoding = chardet.detect(raw)['encoding']
+    if encoding is None:
+        # chardet couldn't come up with a guess, so there's nothing to try.
+        return None
+    try:
+        return raw.decode(encoding)
+    except (UnicodeDecodeError, LookupError):
+        # Either the guess was wrong or it names a codec that this Python
+        # doesn't know about.
+        return None
+
+
+def get_hashes_for_commits(root, start_at):
+    """Generate (commit_id, patch_hash) pairs for commits in a checkout.
+
+    The commits considered are the ones get_commits_to_parse() returns for
+    the given `root` and `start_at`.  The hash is the hex digest that
+    hash_patch() computes for the patch which parse_patch() extracts from
+    the output of 'git show <commit_id>'.
+
+    Commits whose output can't be decoded are reported on stdout and
+    skipped, and so are (silently) the ones for which parse_patch() finds
+    no patch.
+
+    :param root: The path to the git checkout.
+    :param start_at: The id of the last commit already scanned, or None.
+    """
+    for commit_id in get_commits_to_parse(root, start_at=start_at):
+        proc = subprocess.Popen(
+            ['git', 'show', commit_id], cwd=root, stdout=subprocess.PIPE)
+        stdout = proc.communicate()[0]
+
+        diff = _decode_commit_output(stdout)
+        if diff is None:
+            print "Skipping %s as it doesn't seem to be utf-8" % commit_id
+            continue
+
+        patch, _ = parse_patch(diff)
+        # When commits just add files or change permissions the diff will be
+        # empty and thus parse_patch() will return None.
+        if patch is not None:
+            yield commit_id, hash_patch(patch).hexdigest()