diff mbox series

[3/4] migrations: 0043: split the data migration into chunks

Message ID 20210716171940.3827847-4-dja@axtens.net
State New
Headers show
Series RFC: Patchwork 2.2->3.0 migration improvements | expand

Commit Message

Daniel Axtens July 16, 2021, 5:19 p.m. UTC
Migrate 10000 rows at a time. This:
 - provides a view on progress
 - means replication happens in manageable chunks
 - hopefully prevents db lockups

Signed-off-by: Daniel Axtens <dja@axtens.net>
---
 .../migrations/0043_merge_patch_submission.py | 146 ++++++++++--------
 1 file changed, 82 insertions(+), 64 deletions(-)

Comments

Stephen Finucane Aug. 12, 2021, 4:55 p.m. UTC | #1
On Sat, 2021-07-17 at 03:19 +1000, Daniel Axtens wrote:
> Migrate 10000 rows at a time. This:
>  - provides a view on progress
>  - means replication happens in manageable chunks
>  - hopefully prevents db lockups
> 
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> ---
>  .../migrations/0043_merge_patch_submission.py | 146 ++++++++++--------
>  1 file changed, 82 insertions(+), 64 deletions(-)
> 
> diff --git a/patchwork/migrations/0043_merge_patch_submission.py b/patchwork/migrations/0043_merge_patch_submission.py
> index 465e527812ba..1d072ca18230 100644
> --- a/patchwork/migrations/0043_merge_patch_submission.py
> +++ b/patchwork/migrations/0043_merge_patch_submission.py
> @@ -1,76 +1,94 @@
>  from django.conf import settings
>  from django.db import connection, migrations, models
>  import django.db.models.deletion
> +from django.db.models import Max
>  
>  import patchwork.fields
>  
> +CHUNK_SIZE = 10000
>  
>  def migrate_data(apps, schema_editor):
> -    if connection.vendor == 'postgresql':
> -        schema_editor.execute(
> -            """
> -            UPDATE patchwork_submission
> -              SET archived = patchwork_patch.archived,
> -                  commit_ref = patchwork_patch.commit_ref,
> -                  delegate_id = patchwork_patch.delegate_id,
> -                  diff = patchwork_patch.diff,
> -                  hash = patchwork_patch.hash,
> -                  number = patchwork_patch.number,
> -                  pull_url = patchwork_patch.pull_url,
> -                  related_id = patchwork_patch.related_id,
> -                  series_id = patchwork_patch.series_id,
> -                  state_id = patchwork_patch.state_id
> -            FROM patchwork_patch
> -              WHERE patchwork_submission.id = patchwork_patch.submission_ptr_id
> -            """
> -        )
> -    elif connection.vendor == 'mysql':
> -        schema_editor.execute(
> -            """
> -            UPDATE patchwork_submission, patchwork_patch
> -              SET patchwork_submission.archived = patchwork_patch.archived,
> -                  patchwork_submission.commit_ref = patchwork_patch.commit_ref,
> -                  patchwork_submission.delegate_id = patchwork_patch.delegate_id,
> -                  patchwork_submission.diff = patchwork_patch.diff,
> -                  patchwork_submission.hash = patchwork_patch.hash,
> -                  patchwork_submission.number = patchwork_patch.number,
> -                  patchwork_submission.pull_url = patchwork_patch.pull_url,
> -                  patchwork_submission.related_id = patchwork_patch.related_id,
> -                  patchwork_submission.series_id = patchwork_patch.series_id,
> -                  patchwork_submission.state_id = patchwork_patch.state_id
> -            WHERE patchwork_submission.id = patchwork_patch.submission_ptr_id
> -            """  # noqa
> -        )
> -    else:
> -        schema_editor.execute(
> -            """
> -            UPDATE patchwork_submission
> -              SET (
> -                archived, commit_ref, delegate_id, diff, hash, number,
> -                pull_url, related_id, series_id, state_id
> -              ) = (
> -                SELECT
> -                  patchwork_patch.archived,
> -                  patchwork_patch.commit_ref,
> -                  patchwork_patch.delegate_id,
> -                  patchwork_patch.diff,
> -                  patchwork_patch.hash,
> -                  patchwork_patch.number,
> -                  patchwork_patch.pull_url,
> -                  patchwork_patch.related_id,
> -                  patchwork_patch.series_id,
> -                  patchwork_patch.state_id
> +    Patch = apps.get_model('patchwork', 'patch')
> +    max_id = Patch.objects.all().aggregate(Max('submission_ptr_id'))['submission_ptr_id__max']

Any risk of a TOCTOU bug with this on busy instances? We probably want a final
check at the end unless we lock the database entirely for this operation
(perhaps that happens as-is? I actually don't know off the top of my head)

> +    old_max = 0
> +    # process CHUNK_SIZE at a time. +CHUNK_SIZE+1 is to ensure that if we get
> +    # max_id = 50000, we do a migration of ids >= 50000, < 60000 - otherwise
> +    # we would miss id=50000
> +    print("\nMigrating data - %d rows" % max_id)
> +    for new_max in range(CHUNK_SIZE, max_id+CHUNK_SIZE+1, CHUNK_SIZE):

nit: space around the '+'. I guess pep8 is disabled on this directory?

> +        print("Migrating rows >= %d, < %d (max %d)" % (old_max, new_max, max_id))

Should we use print statements or logging? I suspect the latter might be wiser.
The Django docs should have answers.


<snip rest of mostly unchanged but re-indented code>

Cheers,
Stephen
diff mbox series

Patch

diff --git a/patchwork/migrations/0043_merge_patch_submission.py b/patchwork/migrations/0043_merge_patch_submission.py
index 465e527812ba..1d072ca18230 100644
--- a/patchwork/migrations/0043_merge_patch_submission.py
+++ b/patchwork/migrations/0043_merge_patch_submission.py
@@ -1,76 +1,94 @@ 
 from django.conf import settings
 from django.db import connection, migrations, models
 import django.db.models.deletion
+from django.db.models import Max
 
 import patchwork.fields
 
+CHUNK_SIZE = 10000
 
 def migrate_data(apps, schema_editor):
-    if connection.vendor == 'postgresql':
-        schema_editor.execute(
-            """
-            UPDATE patchwork_submission
-              SET archived = patchwork_patch.archived,
-                  commit_ref = patchwork_patch.commit_ref,
-                  delegate_id = patchwork_patch.delegate_id,
-                  diff = patchwork_patch.diff,
-                  hash = patchwork_patch.hash,
-                  number = patchwork_patch.number,
-                  pull_url = patchwork_patch.pull_url,
-                  related_id = patchwork_patch.related_id,
-                  series_id = patchwork_patch.series_id,
-                  state_id = patchwork_patch.state_id
-            FROM patchwork_patch
-              WHERE patchwork_submission.id = patchwork_patch.submission_ptr_id
-            """
-        )
-    elif connection.vendor == 'mysql':
-        schema_editor.execute(
-            """
-            UPDATE patchwork_submission, patchwork_patch
-              SET patchwork_submission.archived = patchwork_patch.archived,
-                  patchwork_submission.commit_ref = patchwork_patch.commit_ref,
-                  patchwork_submission.delegate_id = patchwork_patch.delegate_id,
-                  patchwork_submission.diff = patchwork_patch.diff,
-                  patchwork_submission.hash = patchwork_patch.hash,
-                  patchwork_submission.number = patchwork_patch.number,
-                  patchwork_submission.pull_url = patchwork_patch.pull_url,
-                  patchwork_submission.related_id = patchwork_patch.related_id,
-                  patchwork_submission.series_id = patchwork_patch.series_id,
-                  patchwork_submission.state_id = patchwork_patch.state_id
-            WHERE patchwork_submission.id = patchwork_patch.submission_ptr_id
-            """  # noqa
-        )
-    else:
-        schema_editor.execute(
-            """
-            UPDATE patchwork_submission
-              SET (
-                archived, commit_ref, delegate_id, diff, hash, number,
-                pull_url, related_id, series_id, state_id
-              ) = (
-                SELECT
-                  patchwork_patch.archived,
-                  patchwork_patch.commit_ref,
-                  patchwork_patch.delegate_id,
-                  patchwork_patch.diff,
-                  patchwork_patch.hash,
-                  patchwork_patch.number,
-                  patchwork_patch.pull_url,
-                  patchwork_patch.related_id,
-                  patchwork_patch.series_id,
-                  patchwork_patch.state_id
+    Patch = apps.get_model('patchwork', 'patch')
+    max_id = Patch.objects.all().aggregate(Max('submission_ptr_id'))['submission_ptr_id__max']
+    old_max = 0
+    # process CHUNK_SIZE at a time. +CHUNK_SIZE+1 is to ensure that if we get
+    # max_id = 50000, we do a migration of ids >= 50000, < 60000 - otherwise
+    # we would miss id=50000
+    print("\nMigrating data - %d rows" % max_id)
+    for new_max in range(CHUNK_SIZE, max_id+CHUNK_SIZE+1, CHUNK_SIZE):
+        print("Migrating rows >= %d, < %d (max %d)" % (old_max, new_max, max_id))
+        if connection.vendor == 'postgresql':
+            schema_editor.execute(
+                """
+                UPDATE patchwork_submission
+                  SET archived = patchwork_patch.archived,
+                      commit_ref = patchwork_patch.commit_ref,
+                      delegate_id = patchwork_patch.delegate_id,
+                      diff = patchwork_patch.diff,
+                      hash = patchwork_patch.hash,
+                      number = patchwork_patch.number,
+                      pull_url = patchwork_patch.pull_url,
+                      related_id = patchwork_patch.related_id,
+                      series_id = patchwork_patch.series_id,
+                      state_id = patchwork_patch.state_id
                 FROM patchwork_patch
-                WHERE patchwork_patch.submission_ptr_id = patchwork_submission.id
-              )
-            WHERE
-              EXISTS (
-                SELECT *
-                FROM patchwork_patch
-                WHERE patchwork_patch.submission_ptr_id = patchwork_submission.id
-              )
-            """  # noqa
-        )
+                  WHERE patchwork_patch.submission_ptr_id >= %d
+                    AND patchwork_patch.submission_ptr_id < %d
+                    AND patchwork_submission.id = patchwork_patch.submission_ptr_id
+                """ % (old_max, new_max)
+            )
+        elif connection.vendor == 'mysql':
+            schema_editor.execute(
+                """
+                UPDATE patchwork_submission, patchwork_patch
+                  SET patchwork_submission.archived = patchwork_patch.archived,
+                      patchwork_submission.commit_ref = patchwork_patch.commit_ref,
+                      patchwork_submission.delegate_id = patchwork_patch.delegate_id,
+                      patchwork_submission.diff = patchwork_patch.diff,
+                      patchwork_submission.hash = patchwork_patch.hash,
+                      patchwork_submission.number = patchwork_patch.number,
+                      patchwork_submission.pull_url = patchwork_patch.pull_url,
+                      patchwork_submission.related_id = patchwork_patch.related_id,
+                      patchwork_submission.series_id = patchwork_patch.series_id,
+                      patchwork_submission.state_id = patchwork_patch.state_id
+                WHERE patchwork_patch.submission_ptr_id >= %d
+                  AND patchwork_patch.submission_ptr_id < %d
+                  AND patchwork_submission.id = patchwork_patch.submission_ptr_id
+                """ % (old_max, new_max) # noqa
+            )
+        else:
+            schema_editor.execute(
+                """
+                UPDATE patchwork_submission
+                  SET (
+                    archived, commit_ref, delegate_id, diff, hash, number,
+                    pull_url, related_id, series_id, state_id
+                  ) = (
+                    SELECT
+                      patchwork_patch.archived,
+                      patchwork_patch.commit_ref,
+                      patchwork_patch.delegate_id,
+                      patchwork_patch.diff,
+                      patchwork_patch.hash,
+                      patchwork_patch.number,
+                      patchwork_patch.pull_url,
+                      patchwork_patch.related_id,
+                      patchwork_patch.series_id,
+                      patchwork_patch.state_id
+                    FROM patchwork_patch
+                    WHERE patchwork_patch.submission_ptr_id = patchwork_submission.id
+                  )
+                WHERE
+                  EXISTS (
+                    SELECT *
+                    FROM patchwork_patch
+                    WHERE patchwork_patch.submission_ptr_id >= %d
+                      AND patchwork_patch.submission_ptr_id < %d
+                      AND patchwork_patch.submission_ptr_id = patchwork_submission.id
+                  )
+                """ % (old_max, new_max)  # noqa
+            )
+        old_max = new_max
 
 
 class Migration(migrations.Migration):