diff mbox series

[2/3] support/scripts/cve.py: switch from NVD to FKIE for the JSON files

Message ID 20240207153519.657048-2-arnout@mind.be
State Accepted
Headers show
Series [1/3] support/scripts/pkg-stats: remove checking the CPE database | expand

Commit Message

Arnout Vandecappelle Feb. 7, 2024, 3:35 p.m. UTC
NVD will deprecate the v1.1 API which allows us to download the full
database as individual JSON files. Instead, there's a horribly crappy
API that is extremely slow and subject to race conditions.

Fortunately, there is a project, Fraunhofer FKIE - Cyber Analysis and
Defense [1], that goes through the effort of adapting to this new API
and regenerating the convenient JSON files. The JSON files and meta
files are re-generated daily.

Instead of implementing the NVD v2 API, we decided to just use the JSON
files generated by fkie-cad. That saves us the effort of solving the race
conditions, devising a cache mechanism that works, handling the frequent
gateway timeouts on the NVD servers, dealing with the rate limiting, and
keeping up with changes in the API.

Switch to this repository on GitHub as NVD_BASE_URL. The file name is
also slightly different (CVE-20XX.json instead of nvdcve-1.1-20XX.json).

The fkie-cad repository compresses with xz instead of gz. Therefore:
 - rename the filename variables to _xz instead of _gz;
 - use xz as a subprocess because there is no xz decompressor in Python
   stdlib.

[1] https://www.fkie.fraunhofer.de/en/departments/cad.html

Cc: Daniel Lang <dalang@gmx.at>
Signed-off-by: Arnout Vandecappelle <arnout@mind.be>
---
Even better would be to clone the
https://github.com/fkie-cad/nvd-json-data-feeds repository so we can
cache with simple "git pull". I leave that as an exercise to the reader
:-)
---
 support/scripts/cve.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)
diff mbox series

Patch

diff --git a/support/scripts/cve.py b/support/scripts/cve.py
index 7cd6fce4d8..46c384955c 100755
--- a/support/scripts/cve.py
+++ b/support/scripts/cve.py
@@ -22,7 +22,7 @@  import os
 import requests  # URL checking
 import distutils.version
 import time
-import gzip
+import subprocess
 import sys
 import operator
 
@@ -41,8 +41,7 @@  except ImportError:
 sys.path.append('utils/')
 
 NVD_START_YEAR = 2002
-NVD_JSON_VERSION = "1.1"
-NVD_BASE_URL = "https://nvd.nist.gov/feeds/json/cve/" + NVD_JSON_VERSION
+NVD_BASE_URL = "https://github.com/fkie-cad/nvd-json-data-feeds/releases/latest/download"
 
 ops = {
     '>=': operator.ge,
@@ -83,15 +82,15 @@  class CVE:
 
     @staticmethod
     def download_nvd_year(nvd_path, year):
-        metaf = "nvdcve-%s-%s.meta" % (NVD_JSON_VERSION, year)
+        metaf = "CVE-%s.meta" % year
         path_metaf = os.path.join(nvd_path, metaf)
-        jsonf_gz = "nvdcve-%s-%s.json.gz" % (NVD_JSON_VERSION, year)
-        path_jsonf_gz = os.path.join(nvd_path, jsonf_gz)
+        jsonf_xz = "CVE-%s.json.xz" % year
+        path_jsonf_xz = os.path.join(nvd_path, jsonf_xz)
 
         # If the database file is less than a day old, we assume the NVD data
         # locally available is recent enough.
-        if os.path.exists(path_jsonf_gz) and os.stat(path_jsonf_gz).st_mtime >= time.time() - 86400:
-            return path_jsonf_gz
+        if os.path.exists(path_jsonf_xz) and os.stat(path_jsonf_xz).st_mtime >= time.time() - 86400:
+            return path_jsonf_xz
 
         # If not, we download the meta file
         url = "%s/%s" % (NVD_BASE_URL, metaf)
@@ -104,19 +103,19 @@  class CVE:
         # we need to re-download the database.
         # If the database does not exist locally, we need to redownload it in
         # any case.
-        if os.path.exists(path_metaf) and os.path.exists(path_jsonf_gz):
+        if os.path.exists(path_metaf) and os.path.exists(path_jsonf_xz):
             meta_known = open(path_metaf, "r").read()
             if page_meta.text == meta_known:
-                return path_jsonf_gz
+                return path_jsonf_xz
 
         # Grab the compressed JSON NVD, and write files to disk
-        url = "%s/%s" % (NVD_BASE_URL, jsonf_gz)
+        url = "%s/%s" % (NVD_BASE_URL, jsonf_xz)
         print("Getting %s" % url)
         page_json = requests.get(url)
         page_json.raise_for_status()
-        open(path_jsonf_gz, "wb").write(page_json.content)
+        open(path_jsonf_xz, "wb").write(page_json.content)
         open(path_metaf, "w").write(page_meta.text)
-        return path_jsonf_gz
+        return path_jsonf_xz
 
     @classmethod
     def read_nvd_dir(cls, nvd_dir):
@@ -128,7 +127,8 @@  class CVE:
         for year in range(NVD_START_YEAR, datetime.datetime.now().year + 1):
             filename = CVE.download_nvd_year(nvd_dir, year)
             try:
-                content = ijson.items(gzip.GzipFile(filename), 'CVE_Items.item')
+                uncompressed = subprocess.check_output(["xz", "-d", "-c", filename])
+                content = ijson.items(uncompressed, 'CVE_Items.item')
             except:  # noqa: E722
                 print("ERROR: cannot read %s. Please remove the file then rerun this script" % filename)
                 raise