diff mbox series

[v2,2/2] support/scripts/pkg-stats: URL check using threads

Message ID 1537544157-2992-2-git-send-email-matthew.weber@rockwellcollins.com
State Superseded
Headers show
Series [v2,1/2] support/scripts/pkg-stats: URL checking support | expand

Commit Message

Matt Weber Sept. 21, 2018, 3:35 p.m. UTC
Adds a pool of worker threads to accelerate connection testing.

CC: Signed-off-by: Ricardo Martincoski <ricardo.martincoski@gmail.com>
Signed-off-by: Matthew Weber <matthew.weber@rockwellcollins.com>
---
 support/scripts/pkg-stats | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

Comments

Matt Weber Sept. 21, 2018, 3:38 p.m. UTC | #1
All,

On Fri, Sep 21, 2018 at 10:36 AM Matt Weber
<matthew.weber@rockwellcollins.com> wrote:
>
> Adds a pool of worker threads to accelerate connection testing.
>

As an example, a serial run of the script with the URL testing feature
takes ~15 min; running the checks in parallel brings it down to ~4-5 min.

> CC: Signed-off-by: Ricardo Martincoski <ricardo.martincoski@gmail.com>
> Signed-off-by: Matthew Weber <matthew.weber@rockwellcollins.com>
> ---
>  support/scripts/pkg-stats | 27 +++++++++++++++++++++------
>  1 file changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/support/scripts/pkg-stats b/support/scripts/pkg-stats
> index 37f89ef..f5c8353 100755
> --- a/support/scripts/pkg-stats
> +++ b/support/scripts/pkg-stats
> @@ -25,6 +25,7 @@ import re
>  import subprocess
>  import sys
>  import requests  # URL checking
> +from multiprocessing import Pool
>
>  INFRA_RE = re.compile("\$\(eval \$\(([a-z-]*)-package\)\)")
>
> @@ -46,6 +47,7 @@ class Package:
>          self.current_version = None
>          self.url = None
>          self.url_status = None
> +        self.url_worker = None
>
>      def pkgvar(self):
>          return self.name.upper().replace("-", "_")
> @@ -277,14 +279,26 @@ def package_init_make_info():
>
>          Package.all_versions[pkgvar] = value
>
> -def check_url_status(pkg):
> -    if pkg.url_status != "Missing" and pkg.url_status != "No Config.in":
> +
> +def check_url_status_worker(url, url_status):
> +    if url_status != "Missing" and url_status != "No Config.in":
>          try:
> -            url_status_code = requests.head(pkg.url, timeout=5).status_code
> +            url_status_code = requests.head(url, timeout=5).status_code
>              if url_status_code >= 400:
> -                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> +                return "Invalid(%s)" % str(url_status_code)
>          except requests.exceptions.RequestException as e:
> -            return
> +            return "Invalid(Err)"
> +        return "Ok"
> +    return url_status
> +
> +
> +def check_package_urls(packages):
> +    Package.pool = Pool(processes=64)
> +    for pkg in packages:
> +        pkg.url_worker = pkg.pool.apply_async(check_url_status_worker, (pkg.url, pkg.url_status))
> +    for pkg in packages:
> +        pkg.url_status = pkg.url_worker.get(timeout=3600)
> +
>
>  def calculate_stats(packages):
>      stats = defaultdict(int)
> @@ -573,7 +587,8 @@ def __main__():
>          pkg.set_check_package_warnings()
>          pkg.set_current_version()
>          pkg.set_url()
> -        check_url_status(pkg)
> +    print("Checking URL status")
> +    check_package_urls(packages)
>      print("Calculate stats")
>      stats = calculate_stats(packages)
>      print("Write HTML")
> --
> 1.9.1
>
Ricardo Martincoski Sept. 29, 2018, 5:42 a.m. UTC | #2
Hello,

On Fri, Sep 21, 2018 at 12:35 PM, Matt Weber wrote:

[snip]
> +def check_url_status_worker(url, url_status):
> +    if url_status != "Missing" and url_status != "No Config.in":
>          try:
> -            url_status_code = requests.head(pkg.url, timeout=5).status_code
> +            url_status_code = requests.head(url, timeout=5).status_code

In order to avoid false timeouts for slow hosts, or hosts with a high load, or
limited internet connection, or slow servers ... I think 30 seconds is a better
timeout here.

>              if url_status_code >= 400:
> -                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> +                return "Invalid(%s)" % str(url_status_code)
>          except requests.exceptions.RequestException as e:

There is a warning from flake8 for this line. Please fix it in previous patch.


Regards,
Ricardo
Matt Weber Oct. 1, 2018, 2:05 p.m. UTC | #3
Ricardo,

On Sat, Sep 29, 2018 at 12:42 AM Ricardo Martincoski
<ricardo.martincoski@gmail.com> wrote:
>
> Hello,
>
> On Fri, Sep 21, 2018 at 12:35 PM, Matt Weber wrote:
>
> [snip]
> > +def check_url_status_worker(url, url_status):
> > +    if url_status != "Missing" and url_status != "No Config.in":
> >          try:
> > -            url_status_code = requests.head(pkg.url, timeout=5).status_code
> > +            url_status_code = requests.head(url, timeout=5).status_code
>
> In order to avoid false timeouts for slow hosts, or hosts with a high load, or
> limited internet connection, or slow servers ... I think 30 seconds is a better
> timeout here.

Sure.

>
> >              if url_status_code >= 400:
> > -                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> > +                return "Invalid(%s)" % str(url_status_code)
> >          except requests.exceptions.RequestException as e:
>
> There is a warning from flake8 for this line. Please fix it in previous patch.

I'm not seeing any output from flake8.  To be sure the tool is
working, I changed something syntax wise in the script and I get a
flake8 err/warning.  Are there options you're calling it with?

Matt
Matt Weber Oct. 1, 2018, 2:20 p.m. UTC | #4
Ricardo,

On Mon, Oct 1, 2018 at 9:05 AM Matthew Weber
<matthew.weber@rockwellcollins.com> wrote:
>
> Ricardo,
>
> On Sat, Sep 29, 2018 at 12:42 AM Ricardo Martincoski
> <ricardo.martincoski@gmail.com> wrote:
> >
> > Hello,
> >
> > On Fri, Sep 21, 2018 at 12:35 PM, Matt Weber wrote:
> >
> > [snip]
> > > +def check_url_status_worker(url, url_status):
> > > +    if url_status != "Missing" and url_status != "No Config.in":
> > >          try:
> > > -            url_status_code = requests.head(pkg.url, timeout=5).status_code
> > > +            url_status_code = requests.head(url, timeout=5).status_code
> >
> > In order to avoid false timeouts for slow hosts, or hosts with a high load, or
> > limited internet connection, or slow servers ... I think 30 seconds is a better
> > timeout here.
>
> Sure.
>
> >
> > >              if url_status_code >= 400:
> > > -                pkg.url_status = "Invalid(%s)" % str(url_status_code)
> > > +                return "Invalid(%s)" % str(url_status_code)
> > >          except requests.exceptions.RequestException as e:
> >
> > There is a warning from flake8 for this line. Please fix it in previous patch.
>
> I'm not seeing any output from flake8.  To be sure the tool is
> working, I changed something syntax wise in the script and I get a
> flake8 err/warning.  Are there options you're calling it with?
>

Got it.  My flake8 install was messed up; after switching machines
I see the unused-variable warning (the unused "e" in the except clause).

Matt
diff mbox series

Patch

diff --git a/support/scripts/pkg-stats b/support/scripts/pkg-stats
index 37f89ef..f5c8353 100755
--- a/support/scripts/pkg-stats
+++ b/support/scripts/pkg-stats
@@ -25,6 +25,7 @@  import re
 import subprocess
 import sys
 import requests  # URL checking
+from multiprocessing import Pool
 
 INFRA_RE = re.compile("\$\(eval \$\(([a-z-]*)-package\)\)")
 
@@ -46,6 +47,7 @@  class Package:
         self.current_version = None
         self.url = None
         self.url_status = None
+        self.url_worker = None
 
     def pkgvar(self):
         return self.name.upper().replace("-", "_")
@@ -277,14 +279,26 @@  def package_init_make_info():
 
         Package.all_versions[pkgvar] = value
 
-def check_url_status(pkg):
-    if pkg.url_status != "Missing" and pkg.url_status != "No Config.in":
+
+def check_url_status_worker(url, url_status):
+    if url_status != "Missing" and url_status != "No Config.in":
         try:
-            url_status_code = requests.head(pkg.url, timeout=5).status_code
+            url_status_code = requests.head(url, timeout=5).status_code
             if url_status_code >= 400:
-                pkg.url_status = "Invalid(%s)" % str(url_status_code)
+                return "Invalid(%s)" % str(url_status_code)
         except requests.exceptions.RequestException as e:
-            return
+            return "Invalid(Err)"
+        return "Ok"
+    return url_status
+
+
+def check_package_urls(packages):
+    Package.pool = Pool(processes=64)
+    for pkg in packages:
+        pkg.url_worker = pkg.pool.apply_async(check_url_status_worker, (pkg.url, pkg.url_status))
+    for pkg in packages:
+        pkg.url_status = pkg.url_worker.get(timeout=3600)
+
 
 def calculate_stats(packages):
     stats = defaultdict(int)
@@ -573,7 +587,8 @@  def __main__():
         pkg.set_check_package_warnings()
         pkg.set_current_version()
         pkg.set_url()
-        check_url_status(pkg)
+    print("Checking URL status")
+    check_package_urls(packages)
     print("Calculate stats")
     stats = calculate_stats(packages)
     print("Write HTML")