diff mbox series

[v2,1/2] support/scripts/pkg-stats: URL checking support

Message ID 1537544157-2992-1-git-send-email-matthew.weber@rockwellcollins.com
State Superseded
Headers show
Series [v2,1/2] support/scripts/pkg-stats: URL checking support | expand

Commit Message

Matt Weber Sept. 21, 2018, 3:35 p.m. UTC
- Adds support to check if a package has a URL and if that URL
   is valid by doing a header request.
 - Reports this information as part of the generated html output

The URL data is currently gathered from the URL string provided
in the Kconfig help sections for each package.

This check helps ensure the URLs are valid and can be used
for other scripting purposes as the product's home site/URL.
CPE XML generation is an example of a case that could use this
product URL as part of an automated update generation script.

Signed-off-by: Matt Weber <matthew.weber@rockwellcollins.com>
---
Changes
v1 -> v2
 - Dropped disabling of SSL cert verifing

[Thomas
 - Moved URL report to new column
 - Added color coding
 - Better info on if a package doesn't have a Config.in to search
   or the Config.in is missing the URL
---
 support/scripts/pkg-stats | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

Comments

Matt Weber Sept. 21, 2018, 3:37 p.m. UTC | #1
All,

On Fri, Sep 21, 2018 at 10:36 AM Matt Weber
<matthew.weber@rockwellcollins.com> wrote:
>
>  - Adds support to check if a package has a URL and if that URL
>    is valid by doing a header request.
>  - Reports this information as part of the generated html output
>
> The URL data is currently gathered from the URL string provided
> in the Kconfig help sections for each package.
>
> This check helps ensure the URLs are valid and can be used
> for other scripting purposes as the product's home site/URL.
> CPE XML generation is an example of a case that could use this
> product URL as part of an automated update generation script.
>

~ 3x the time to execute the script with this change.

> Signed-off-by: Matt Weber <matthew.weber@rockwellcollins.com>
> ---
> Changes
> v1 -> v2
>  - Dropped disabling of SSL cert verifing
>
> [Thomas
>  - Moved URL report to new column
>  - Added color coding
>  - Better info on if a package doesn't have a Config.in to search
>    or the Config.in is missing the URL
> ---
>  support/scripts/pkg-stats | 57 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 57 insertions(+)
>
> diff --git a/support/scripts/pkg-stats b/support/scripts/pkg-stats
> index b7b00e8..37f89ef 100755
> --- a/support/scripts/pkg-stats
> +++ b/support/scripts/pkg-stats
> @@ -24,6 +24,7 @@ from collections import defaultdict
>  import re
>  import subprocess
>  import sys
> +import requests  # URL checking
>
>  INFRA_RE = re.compile("\$\(eval \$\(([a-z-]*)-package\)\)")
>
> @@ -43,10 +44,32 @@ class Package:
>          self.patch_count = 0
>          self.warnings = 0
>          self.current_version = None
> +        self.url = None
> +        self.url_status = None
>
>      def pkgvar(self):
>          return self.name.upper().replace("-", "_")
>
> +    def set_url(self):
> +        """
> +        Fills in the .url field
> +        """
> +        in_help_section = False
> +        self.url_status = "No Config.in"
> +        for filename in os.listdir(os.path.dirname(self.path)):
> +            if fnmatch.fnmatch(filename, 'Config.*'):
> +                fp = open(os.path.join(os.path.dirname(self.path), filename), "r")
> +                for config_line in fp:
> +                    if config_line.strip() == "help":
> +                        in_help_section = True
> +                    if in_help_section and re.match("(.*)https?://", config_line):
> +                        self.url = ''.join(config_line.split())
> +                        self.url_status = "Found"
> +                        fp.close()
> +                        return
> +                self.url_status = "Missing"
> +                fp.close()
> +
>      def set_infra(self):
>          """
>          Fills in the .infras field
> @@ -254,6 +277,14 @@ def package_init_make_info():
>
>          Package.all_versions[pkgvar] = value
>
> +def check_url_status(pkg):
> +    if pkg.url_status != "Missing" and pkg.url_status != "No Config.in":
> +        try:
> +            url_status_code = requests.head(pkg.url, timeout=5).status_code
> +            if url_status_code >= 400:
> +                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> +        except requests.exceptions.RequestException as e:
> +            return
>
>  def calculate_stats(packages):
>      stats = defaultdict(int)
> @@ -311,6 +342,15 @@ td.somepatches {
>  td.lotsofpatches {
>    background: #ff9a69;
>  }
> +td.good_url {
> +  background: #d2ffc4;
> +}
> +td.missing_url {
> +  background: #ffd870;
> +}
> +td.invalid_url {
> +  background: #ff9a69;
> +}
>  </style>
>  <title>Statistics of Buildroot packages</title>
>  </head>
> @@ -422,6 +462,20 @@ def dump_html_pkg(f, pkg):
>      f.write("  <td class=\"%s\">%d</td>\n" %
>              (" ".join(td_class), pkg.warnings))
>
> +    # URL status
> +    td_class = ["centered"]
> +    url_str = pkg.url_status
> +    if pkg.url_status == "Missing" or pkg.url_status == "No Config.in":
> +        td_class.append("missing_url")
> +    elif pkg.url_status.startswith("Invalid"):
> +        td_class.append("invalid_url")
> +        url_str = "<a href=%s>%s</a>" % (pkg.url, pkg.url_status)
> +    else:
> +        td_class.append("good_url")
> +        url_str = "<a href=%s>Link</a>" % pkg.url
> +    f.write("  <td class=\"%s\">%s</td>\n" %
> +            (" ".join(td_class), url_str))
> +
>      f.write(" </tr>\n")
>
>
> @@ -437,6 +491,7 @@ def dump_html_all_pkgs(f, packages):
>  <td class=\"centered\">Hash file</td>
>  <td class=\"centered\">Current version</td>
>  <td class=\"centered\">Warnings</td>
> +<td class=\"centered\">URL</td>
>  </tr>
>  """)
>      for pkg in sorted(packages):
> @@ -517,6 +572,8 @@ def __main__():
>          pkg.set_patch_count()
>          pkg.set_check_package_warnings()
>          pkg.set_current_version()
> +        pkg.set_url()
> +        check_url_status(pkg)
>      print("Calculate stats")
>      stats = calculate_stats(packages)
>      print("Write HTML")
> --
> 1.9.1
>
Ricardo Martincoski Sept. 29, 2018, 5:40 a.m. UTC | #2
Hello,

In overall looks good. A few issues to fix below.

On Fri, Sep 21, 2018 at 12:35 PM, Matt Weber wrote:

[snip]
>  INFRA_RE = re.compile("\$\(eval \$\(([a-z-]*)-package\)\)")
>  
[snip]
> +    def set_url(self):
> +        """
> +        Fills in the .url field
> +        """
> +        in_help_section = False
> +        self.url_status = "No Config.in"
> +        for filename in os.listdir(os.path.dirname(self.path)):
> +            if fnmatch.fnmatch(filename, 'Config.*'):
> +                fp = open(os.path.join(os.path.dirname(self.path), filename), "r")
> +                for config_line in fp:
> +                    if config_line.strip() == "help":
> +                        in_help_section = True

Please take at look at HelpText @ utils/checkpackagelib/lib_config.py
A Config.in can have many help sections (one per option) so using a single flag
to know if we are at a help section is not enough. When any string from
entries_that_should_not_be_indented appears at the beginning of a line, the help
section is over.
But maybe we don't even need to check this, if the regexp is more strict.

> +                    if in_help_section and re.match("(.*)https?://", config_line):

For package/libcoap/Config.in
the regexp will find 'CoAP, see <http://coap.technology>.'

And for package/lttng-babeltrace/Config.in
'Trace Format (see <http://diamon.org/ctf/>). Babeltrace'

... so a better regexp would be:
re.match("\s*https?://", config_line):

Also it is good practice to compile the regexp just like it is done for
INFRA_RE. Because this script have a small number of regexps and the Python
interpreter do some caching (I don't know the internals of this), probably it
won't make a difference in performance. But this code does potentially 2000+
regexp compile() while only one is needed.

> +                        self.url = ''.join(config_line.split())

join() is not needed. Actually we want to match only 'upstream URLs' that are
always the only text in a line, so an even better regexp would be:

re.match("\s*https?://\S*$", config_line):
or
re.match("\s*https?://\S*\s*$", config_line):
if you want the regexp work even when there are trailing white spaces.

... so no need to join() or split(), just strip().

Notice I didn't tested any code I suggested.

[snip]
> +def check_url_status(pkg):
> +    if pkg.url_status != "Missing" and pkg.url_status != "No Config.in":
> +        try:
> +            url_status_code = requests.head(pkg.url, timeout=5).status_code

This timeout is a bit strict in my opinion, but that is not that important in
this first patch, using serial requests.
It will became important in the next patch, to avoid false timeouts for slow
hosts, or hosts with a high load, or limited internet connection, or slow
servers ... so in the next patch I will suggest to use 30 seconds.
       
> +            if url_status_code >= 400:
> +                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> +        except requests.exceptions.RequestException as e:
> +            return

Please fix the 3 warnings from flake8.


Regards,
Ricardo
diff mbox series

Patch

diff --git a/support/scripts/pkg-stats b/support/scripts/pkg-stats
index b7b00e8..37f89ef 100755
--- a/support/scripts/pkg-stats
+++ b/support/scripts/pkg-stats
@@ -24,6 +24,7 @@  from collections import defaultdict
 import re
 import subprocess
 import sys
+import requests  # URL checking
 
 INFRA_RE = re.compile("\$\(eval \$\(([a-z-]*)-package\)\)")
 
@@ -43,10 +44,32 @@  class Package:
         self.patch_count = 0
         self.warnings = 0
         self.current_version = None
+        self.url = None
+        self.url_status = None
 
     def pkgvar(self):
         return self.name.upper().replace("-", "_")
 
+    def set_url(self):
+        """
+        Fills in the .url field
+        """
+        in_help_section = False
+        self.url_status = "No Config.in"
+        for filename in os.listdir(os.path.dirname(self.path)):
+            if fnmatch.fnmatch(filename, 'Config.*'):
+                fp = open(os.path.join(os.path.dirname(self.path), filename), "r")
+                for config_line in fp:
+                    if config_line.strip() == "help":
+                        in_help_section = True
+                    if in_help_section and re.match("(.*)https?://", config_line):
+                        self.url = ''.join(config_line.split())
+                        self.url_status = "Found"
+                        fp.close()
+                        return
+                self.url_status = "Missing"
+                fp.close()
+
     def set_infra(self):
         """
         Fills in the .infras field
@@ -254,6 +277,14 @@  def package_init_make_info():
 
         Package.all_versions[pkgvar] = value
 
+def check_url_status(pkg):
+    if pkg.url_status != "Missing" and pkg.url_status != "No Config.in":
+        try:
+            url_status_code = requests.head(pkg.url, timeout=5).status_code
+            if url_status_code >= 400:
+                pkg.url_status = "Invalid(%s)" % str(url_status_code)
+        except requests.exceptions.RequestException as e:
+            return
 
 def calculate_stats(packages):
     stats = defaultdict(int)
@@ -311,6 +342,15 @@  td.somepatches {
 td.lotsofpatches {
   background: #ff9a69;
 }
+td.good_url {
+  background: #d2ffc4;
+}
+td.missing_url {
+  background: #ffd870;
+}
+td.invalid_url {
+  background: #ff9a69;
+}
 </style>
 <title>Statistics of Buildroot packages</title>
 </head>
@@ -422,6 +462,20 @@  def dump_html_pkg(f, pkg):
     f.write("  <td class=\"%s\">%d</td>\n" %
             (" ".join(td_class), pkg.warnings))
 
+    # URL status
+    td_class = ["centered"]
+    url_str = pkg.url_status
+    if pkg.url_status == "Missing" or pkg.url_status == "No Config.in":
+        td_class.append("missing_url")
+    elif pkg.url_status.startswith("Invalid"):
+        td_class.append("invalid_url")
+        url_str = "<a href=%s>%s</a>" % (pkg.url, pkg.url_status)
+    else:
+        td_class.append("good_url")
+        url_str = "<a href=%s>Link</a>" % pkg.url
+    f.write("  <td class=\"%s\">%s</td>\n" %
+            (" ".join(td_class), url_str))
+
     f.write(" </tr>\n")
 
 
@@ -437,6 +491,7 @@  def dump_html_all_pkgs(f, packages):
 <td class=\"centered\">Hash file</td>
 <td class=\"centered\">Current version</td>
 <td class=\"centered\">Warnings</td>
+<td class=\"centered\">URL</td>
 </tr>
 """)
     for pkg in sorted(packages):
@@ -517,6 +572,8 @@  def __main__():
         pkg.set_patch_count()
         pkg.set_check_package_warnings()
         pkg.set_current_version()
+        pkg.set_url()
+        check_url_status(pkg)
     print("Calculate stats")
     stats = calculate_stats(packages)
     print("Write HTML")