diff mbox series

[1/4] contrib: add generate_snapshot_index.py

Message ID 20231102084058.1142941-1-sam@gentoo.org
State New
Headers show
Series [1/4] contrib: add generate_snapshot_index.py | expand

Commit Message

Sam James Nov. 2, 2023, 8:39 a.m. UTC
Script to create a map between weekly snapshots and the commit they're based on
with space-separated format BRANCH-DATE COMMIT.

For example:
8-20210107 5114ee0676e432493ada968e34071f02fb08114f
8-20210114 f9267925c648f2ccd9e4680b699e581003125bcf
...

This is helpful for bisects and quickly looking up the information from bug
reports.

contrib/:
    * generate_snapshot_index.py: New file.

Signed-off-by: Sam James <sam@gentoo.org>
---
 contrib/generate_snapshot_index.py | 79 ++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100755 contrib/generate_snapshot_index.py
diff mbox series

Patch

diff --git a/contrib/generate_snapshot_index.py b/contrib/generate_snapshot_index.py
new file mode 100755
index 000000000000..80fc14b2cf1e
--- /dev/null
+++ b/contrib/generate_snapshot_index.py
@@ -0,0 +1,79 @@ 
+#!/usr/bin/env python3
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+# Contributed by Sam James.
+#
+# This script is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Script to create a map between weekly snapshots and the commit they're based on.
+# Creates known_snapshots.txt with space-separated format: BRANCH-DATE COMMIT
+# For example:
+# 8-20210107 5114ee0676e432493ada968e34071f02fb08114f
+# 8-20210114 f9267925c648f2ccd9e4680b699e581003125bcf
+
+import os
+import re
+import urllib.request
+
+MIRROR = "https://mirrorservice.org/sites/sourceware.org/pub/gcc/snapshots/"
+
+
+def get_remote_snapshot_list() -> list[str]:
+    # Parse the HTML index for links to snapshots
+    with urllib.request.urlopen(MIRROR) as index_response:
+        html = index_response.read().decode("utf-8")
+        snapshots = re.findall(r'href="([0-9]+-.*)"', html)
+
+    return snapshots
+
+
+def load_cached_entries() -> dict[str, str]:
+    local_snapshots = {}
+
+    with open("known_snapshots.txt", encoding="utf-8") as local_entries:
+        for entry in local_entries.readlines():
+            if not entry:
+                continue
+
+            date, commit = entry.strip().split(" ")
+            local_snapshots[date] = commit
+
+    return local_snapshots
+
+
+remote_snapshots = get_remote_snapshot_list()
+try:
+    known_snapshots = load_cached_entries()
+except FileNotFoundError:
+    # No cache available
+    known_snapshots = {}
+
+# This would give us chronological order (as in by creation)
+# snapshots.sort(reverse=False, key=lambda x: x.split('-')[1])
+# snapshots.sort(reverse=True, key=lambda x: x.split('-')[0])
+
+for snapshot in remote_snapshots:
+    # 8-20210107/ -> 8-20210107
+    snapshot = snapshot.strip("/")
+
+    # Don't fetch entries we already have stored.
+    if snapshot in known_snapshots:
+        continue
+
+    # The READMEs are plain text with several lines, one of which is:
+    # "with the following options: git://gcc.gnu.org/git/gcc.git branch releases/gcc-8 revision e4e5ad2304db534957c4af612aa288cb6ef51f25""
+    # We match after 'revision ' to grab the commit used.
+    with urllib.request.urlopen(f"{MIRROR}/{snapshot}/README") as readme_response:
+        data = readme_response.read().decode("utf-8")
+        parsed_commit = re.findall(r"revision (.*)", data)[0]
+        known_snapshots[snapshot] = parsed_commit
+
+# Dump it all back out to disk.
+with open("known_snapshots.txt.tmp", "w", encoding="utf-8") as known_entries:
+    for name, stored_commit in known_snapshots.items():
+        known_entries.write(f"{name} {stored_commit}\n")
+
+os.rename("known_snapshots.txt.tmp", "known_snapshots.txt")