diff mbox series

[1/1,autotest-client-tests] UBUNTU: SAUCE: ubuntu_nvidia_server_driver: create Nvidia server driver loading test

Message ID 20211130101609.3579-2-taihsiang.ho@canonical.com
State New
Headers show
Series create Nvidia server driver loading test | expand

Commit Message

Taihsiang Ho (tai271828) Nov. 30, 2021, 10:16 a.m. UTC
The goal of the test is to reload all relevant packages in
different scenarios and make sure all of the scenarios work properly.

Signed-off-by: Taihsiang Ho (tai271828) <taihsiang.ho@canonical.com>
---
 ubuntu_nvidia_server_driver/control           | 12 +++
 ubuntu_nvidia_server_driver/nvidia-module-lib | 96 +++++++++++++++++++
 .../test-each-nvidia-server-driver.sh         | 35 +++++++
 .../ubuntu_nvidia_server_driver.py            | 32 +++++++
 .../ubuntu_nvidia_server_driver.sh            | 40 ++++++++
 5 files changed, 215 insertions(+)
 create mode 100644 ubuntu_nvidia_server_driver/control
 create mode 100644 ubuntu_nvidia_server_driver/nvidia-module-lib
 create mode 100755 ubuntu_nvidia_server_driver/test-each-nvidia-server-driver.sh
 create mode 100644 ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
 create mode 100755 ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.sh
diff mbox series

Patch

diff --git a/ubuntu_nvidia_server_driver/control b/ubuntu_nvidia_server_driver/control
new file mode 100644
index 00000000..2c3f2510
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/control
@@ -0,0 +1,12 @@ 
+AUTHOR = 'Taihsiang Ho <taihsiang.ho@canonical.com>'
+TIME = 'SHORT'
+NAME = 'Nvidia server driver build and load verification test'
+TEST_TYPE = 'client'
+TEST_CLASS = 'General'
+TEST_CATEGORY = 'Smoke'
+
+DOC = """
+Perform testing of Nvidia server drivers
+"""
+# Runs the test with test_name='load'; timeout=600 is presumably seconds - confirm.
+job.run_test_detail('ubuntu_nvidia_server_driver', test_name='load', tag='load', timeout=600)
diff --git a/ubuntu_nvidia_server_driver/nvidia-module-lib b/ubuntu_nvidia_server_driver/nvidia-module-lib
new file mode 100644
index 00000000..06141bfc
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-module-lib
@@ -0,0 +1,96 @@ 
+# Copyright 2021 Canonical Ltd.
+# Written by:
+#   Dann Frazier <dann.frazier@canonical.com>
+#   Taihsiang Ho <taihsiang.ho@canonical.com>
+#
+# shellcheck shell=bash
+module_loaded() {
+    local module="$1"  # local, so sourcing scripts do not gain a global
+    # Succeeds (status 0) only if /proc/modules lists $1 in the Live state.
+    # See module_state in linux/include/linux/module.h: a module can also be
+    # Loading or Unloading for a brief moment, so Live is matched explicitly
+    # instead of accepting any entry.
+    grep "^${module} " /proc/modules | grep -q Live
+}
+
+get_module_field() {
+    local module="$1"
+    local field="$2"
+    # shellcheck disable=SC2034
+    read -r mod size usecnt deps rest < <(grep "^${module} " /proc/modules)
+    case $field in
+        usecnt)
+            echo "$usecnt"
+            ;;
+        deps)
+            if [ "$deps" = "-" ]; then
+                return 0
+            fi
+            echo "$deps" | tr ',' ' '
+            ;;
+        *)
+            return 1
+    esac
+}
+
+module_in_use() {
+    module="$1"
+
+    usecnt="$(get_module_field "$module" usecnt)"
+
+    if [ "$usecnt" -eq 0 ]; then
+        return 1
+    fi
+    return 0
+}
+
+recursive_remove_module() {
+    local module="$1"
+
+    if ! module_loaded "$module"; then
+        return 0
+    fi
+
+    if ! module_in_use "$module"; then
+        sudo rmmod "$module"
+        return 0
+    fi
+
+    if [ "$(get_module_field "$module" deps)" = "" ]; then
+        echo "ERROR: $module is in use, but has no reverse dependencies"
+        echo "ERROR: Maybe an application is using it."
+        exit 1
+    fi
+    beforecnt="$(get_module_field "$module" usecnt)"
+    for dep in $(get_module_field "$module" deps); do
+        recursive_remove_module "$dep"
+    done
+    aftercnt="$(get_module_field "$module" usecnt)"
+    if [ "$beforecnt" -eq "$aftercnt" ]; then
+        echo "ERROR: Unable to reduce $module use count"
+        exit 1
+    fi
+    recursive_remove_module "$module"
+}
+# Purge all linux-modules-nvidia-* packages, then verify nvidia is gone.
+uninstall_all_nvidia_mod_pkgs() {
+    for pkg in $(dpkg-query -f "\${Package}\n" -W 'linux-modules-nvidia-*'); do
+        sudo apt remove --purge "$pkg" -y
+    done
+    if sudo modinfo nvidia; then  # modinfo succeeding means one is still found
+        echo "ERROR: Uninstallation of all nvidia modules failed."
+        exit 1
+    fi
+}
+# Baseboard product name, read once when this library is sourced.
+product="$(sudo dmidecode -s baseboard-product-name)"
+pkg_compatible_with_platform() {
+    local pkg="$1" branch  # branch is local so it does not leak to callers
+    branch="$(echo "$pkg" | cut -d- -f4)"
+    # Status 1 (incompatible): branch 418 and older on a DGXA100 baseboard.
+    if [ "$product" = "DGXA100" ] && [ "$branch" -le "418" ]; then
+        return 1
+    fi
+    # Every other product/branch combination is treated as compatible.
+    return 0
+}
diff --git a/ubuntu_nvidia_server_driver/test-each-nvidia-server-driver.sh b/ubuntu_nvidia_server_driver/test-each-nvidia-server-driver.sh
new file mode 100755
index 00000000..9dae85cc
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/test-each-nvidia-server-driver.sh
@@ -0,0 +1,35 @@ 
+#!/usr/bin/env bash
+#
+# Copyright 2021 Canonical Ltd.
+# Written by:
+#   Dann Frazier <dann.frazier@canonical.com>
+#   Taihsiang Ho <taihsiang.ho@canonical.com>
+
+set -e
+# Defines the helper functions and $product; sourced by bare file name.
+source nvidia-module-lib
+# Best effort: a failure to stop the service is deliberately ignored.
+sudo service nvidia-fabricmanager stop || /bin/true
+
+# Some examples like:
+# ubuntu@hot-koala:~$ apt-cache search --names-only "^linux-modules-nvidia-[0-9]+-server-$(uname -r)$"
+# linux-modules-nvidia-418-server-5.4.0-90-generic - Linux kernel nvidia modules for version 5.4.0-90
+# linux-modules-nvidia-450-server-5.4.0-90-generic - Linux kernel nvidia modules for version 5.4.0-90
+# linux-modules-nvidia-460-server-5.4.0-90-generic - Linux kernel nvidia modules for version 5.4.0-90
+# linux-modules-nvidia-470-server-5.4.0-90-generic - Linux kernel nvidia modules for version 5.4.0-90
+for drvpkg in $(apt-cache search --names-only "^linux-modules-nvidia-[0-9]+-server-$(uname -r)$" | cut -d' ' -f1); do
+    if ! pkg_compatible_with_platform "$drvpkg"; then
+        echo "INFO: Skipping $drvpkg on $product" 1>&2
+        continue
+    fi
+    uninstall_all_nvidia_mod_pkgs
+    recursive_remove_module nvidia
+    sudo dmesg -c > /dev/null  # clear the log so only this load is inspected
+    sudo apt install -y "$drvpkg"
+    sudo modprobe nvidia
+    if sudo dmesg | grep "NVRM: loading NVIDIA UNIX"; then
+        continue
+    fi
+    echo "ERROR: Failed to detect nvidia driver initialization message in dmesg"
+    exit 1
+done
diff --git a/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
new file mode 100644
index 00000000..d0c667ae
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
@@ -0,0 +1,32 @@ 
+import os
+from autotest.client import test, utils
+
+p_dir = os.path.dirname(os.path.abspath(__file__))
+sh_executable = os.path.join(p_dir, "ubuntu_nvidia_server_driver.sh")
+
+
+class ubuntu_nvidia_server_driver(test.test):
+    version = 1
+
+    def initialize(self):  # intentionally a no-op
+        pass
+
+    def setup(self):  # runs "<sh_executable> setup"
+        cmd = "{} setup".format(sh_executable)
+        utils.system(cmd)
+
+    def compare_kernel_modules(self):  # runs "<sh_executable> test"
+        cmd = "{} test".format(sh_executable)
+        utils.system(cmd)
+
+    def run_once(self, test_name):  # only test_name == "load" does any work
+        if test_name == "load":
+            self.compare_kernel_modules()
+            # Blank line, then a one-line confirmation after the script ran.
+            print("")
+            print("{} has run.".format(test_name))
+        # Printed for every test_name, recognised or not.
+        print("")
+
+    def postprocess_iteration(self):  # intentionally a no-op
+        pass
diff --git a/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.sh b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.sh
new file mode 100755
index 00000000..8dda591e
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.sh
@@ -0,0 +1,40 @@ 
+#!/usr/bin/env bash
+#
+# perform Nvidia driver load testing and corresponding pre-setup.
+#
+
+set -eo pipefail
+
+setup() {
+    # Placeholder for preparing the test environment and required tools.
+    # Nothing needs preparing yet, so this only prints a message.
+    echo "begin to pre-setup testing"
+}
+# Run test-each-nvidia-server-driver.sh from the directory of this script.
+run_test() {
+    exe_dir=$(dirname "${BASH_SOURCE[0]}")  # not local: stays set afterwards
+    pushd "${exe_dir}"
+    ./test-each-nvidia-server-driver.sh  # invoked relative to exe_dir
+    popd
+}
+case "${1:-}" in  # dispatch on the first argument
+    setup)
+        echo ""
+        echo "On setting up necessary test environment..."
+        echo ""
+        setup
+        echo ""
+        echo "Setting up necessary test environment..."
+        echo ""
+        ;;
+    test)
+        echo ""
+        echo "On running test..."
+        echo ""
+        run_test
+        echo ""
+        echo "Running test..."
+        echo ""
+        ;;
+    *) echo "Usage: $0 {setup|test}" >&2; exit 2 ;;  # reject anything else
+esac