diff --git a/resources/libraries/python/DUTSetup.py b/resources/libraries/python/DUTSetup.py
index 1a8899f6f6..16acfba7ad 100644
--- a/resources/libraries/python/DUTSetup.py
+++ b/resources/libraries/python/DUTSetup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at:
@@ -13,10 +13,11 @@
 
 """DUT setup library."""
 
+from time import sleep
 from robot.api import logger
 
 from resources.libraries.python.Constants import Constants
-from resources.libraries.python.ssh import SSH, exec_cmd_no_error
+from resources.libraries.python.ssh import SSH, exec_cmd, exec_cmd_no_error
 from resources.libraries.python.topology import NodeType, Topology
 
 
@@ -32,11 +33,11 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
-        command = u"echo $(< /tmp/*supervisor*.log)"\
+        command = u"cat /tmp/*supervisor*.log"\
             if DUTSetup.running_in_container(node) \
-            else f"journalctl --no-pager --unit={service} " \
-            f"--since=\"$(echo `systemctl show -p ActiveEnterTimestamp " \
-            f"{service}` | awk \'{{print $2 $3}}\')\""
+            else f"journalctl --no-pager _SYSTEMD_INVOCATION_ID=$(systemctl " \
+            f"show -p InvocationID --value {service})"
+
         message = f"Node {node[u'host']} failed to get logs from unit {service}"
 
         exec_cmd_no_error(
@@ -132,6 +133,8 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
+        DUTSetup.get_service_logs(node, service)
+
         command = f"supervisorctl stop {service}" \
             if DUTSetup.running_in_container(node) \
             else f"service {service} stop"
@@ -141,8 +144,6 @@
             node, command, timeout=180, sudo=True, message=message
         )
 
-        DUTSetup.get_service_logs(node, service)
-
     @staticmethod
     def stop_service_on_all_duts(nodes, service):
         """Stop the named service on all DUTs.
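
The journalctl rework above keys log collection to the unit's current systemd invocation instead of reconstructing a --since timestamp with awk, which was fragile against locale and timestamp formatting. A minimal local sketch of the same idea, assuming a systemd host and plain subprocess instead of CSIT's exec_cmd_no_error wrapper ("vpp" is only an example unit name):

    # Sketch: fetch the logs belonging to the unit's latest start only.
    import subprocess

    def unit_logs_since_last_start(service=u"vpp"):
        # InvocationID changes on every (re)start of the unit, so filtering
        # on it returns exactly the logs of the current run.
        invocation_id = subprocess.check_output(
            [u"systemctl", u"show", u"-p", u"InvocationID", u"--value",
             service],
            text=True
        ).strip()
        return subprocess.check_output(
            [u"journalctl", u"--no-pager",
             f"_SYSTEMD_INVOCATION_ID={invocation_id}"],
            text=True
        )
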
@@ -157,11 +158,62 @@ class DUTSetup:
                 DUTSetup.stop_service(node, service)
 
     @staticmethod
-    def get_vpp_pid(node):
-        """Get PID of running VPP process.
+    def kill_program(node, program, namespace=None):
+        """Kill program on the specified topology node.
+
+        :param node: Topology node.
+        :param program: Program name.
+        :param namespace: Namespace program is running in.
+        :type node: dict
+        :type program: str
+        :type namespace: str
+        """
+        host = node[u"host"]
+        cmd_timeout = 5
+        if namespace in (None, u"default"):
+            shell_cmd = u"sh -c"
+        else:
+            shell_cmd = f"ip netns exec {namespace} sh -c"
+
+        pgrep_cmd = f"{shell_cmd} \'pgrep -c {program}\'"
+        _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+                                sudo=True)
+        if int(stdout) == 0:
+            logger.trace(f"{program} is not running on {host}")
+            return
+        exec_cmd(node, f"{shell_cmd} \'pkill {program}\'",
+                 timeout=cmd_timeout, sudo=True)
+        for attempt in range(5):
+            _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+                                    sudo=True)
+            if int(stdout) == 0:
+                logger.trace(f"Attempt {attempt}: {program} is dead on {host}")
+                return
+            sleep(1)
+        logger.trace(f"SIGKILLing {program} on {host}")
+        exec_cmd(node, f"{shell_cmd} \'pkill -9 {program}\'",
+                 timeout=cmd_timeout, sudo=True)
+
+    @staticmethod
+    def verify_program_installed(node, program):
+        """Verify that program is installed on the specified topology node.
+
+        :param node: Topology node.
+        :param program: Program name.
+        :type node: dict
+        :type program: str
+        """
+        cmd = f"command -v {program}"
+        exec_cmd_no_error(node, cmd, message=f"{program} is not installed")
+
+    @staticmethod
+    def get_pid(node, process):
+        """Get PID of running process.
 
         :param node: DUT node.
+        :param process: Process name.
         :type node: dict
+        :type process: str
         :returns: PID
         :rtype: int
         :raises RuntimeError: If it is not possible to get the PID.
         """
@@ -171,26 +223,24 @@
         retval = None
         for i in range(3):
-            logger.trace(f"Try {i}: Get VPP PID")
-            ret_code, stdout, stderr = ssh.exec_command(u"pidof vpp")
+            logger.trace(f"Try {i}: Get {process} PID")
+            ret_code, stdout, stderr = ssh.exec_command(f"pidof {process}")
 
             if int(ret_code):
                 raise RuntimeError(
-                    f"Not possible to get PID of VPP process on node: "
+                    f"Not possible to get PID of {process} process on node: "
                     f"{node[u'host']}\n {stdout + stderr}"
                 )
 
             pid_list = stdout.split()
             if len(pid_list) == 1:
-                retval = int(stdout)
-            elif not pid_list:
-                logger.debug(f"No VPP PID found on node {node[u'host']}")
+                return [int(stdout)]
+            if not pid_list:
+                logger.debug(f"No {process} PID found on node {node[u'host']}")
                 continue
-            else:
-                logger.debug(
-                    f"More then one VPP PID found on node {node[u'host']}"
-                )
-                retval = [int(pid) for pid in pid_list]
+            logger.debug(f"More than one {process} PID found " \
+                         f"on node {node[u'host']}")
+            retval = [int(pid) for pid in pid_list]
 
         return retval
@@ -206,7 +256,7 @@
         pids = dict()
         for node in nodes.values():
             if node[u"type"] == NodeType.DUT:
-                pids[node[u"host"]] = DUTSetup.get_vpp_pid(node)
+                pids[node[u"host"]] = DUTSetup.get_pid(node, u"vpp")
         return pids
 
     @staticmethod
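
kill_program above implements the usual termination escalation: count matching processes with pgrep, send SIGTERM via pkill, poll for up to five seconds, then fall back to SIGKILL. A minimal sketch of the same pattern, run locally with the standard library rather than through exec_cmd over SSH:

    import subprocess
    from time import sleep

    def kill_gracefully(program):
        # pgrep exits non-zero when nothing matches.
        if subprocess.run([u"pgrep", u"-c", program],
                          capture_output=True).returncode != 0:
            return  # nothing to kill
        subprocess.run([u"pkill", program])  # polite SIGTERM first
        for _ in range(5):
            if subprocess.run([u"pgrep", u"-c", program],
                              capture_output=True).returncode != 0:
                return  # the process exited on its own
            sleep(1)
        subprocess.run([u"pkill", u"-9", program])  # escalate to SIGKILL
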
""" + cmd = f"test -f /sys/bus/pci/devices/{pf_pci_addr}/sriov_numvfs" + sriov_unsupported, _, _ = exec_cmd(node, cmd) + # if sriov_numvfs doesn't exist, then sriov_unsupported != 0 + if int(sriov_unsupported): + if numvfs == 0: + # sriov is not supported and we want 0 VFs + # no need to do anything + return + + raise RuntimeError( + f"Can't configure {numvfs} VFs on {pf_pci_addr} device " + f"on {node[u'host']} since it doesn't support SR-IOV." + ) + pci = pf_pci_addr.replace(u":", r"\:") command = f"sh -c \"echo {numvfs} | " \ f"tee /sys/bus/pci/devices/{pci}/sriov_numvfs\"" @@ -379,6 +443,18 @@ class DUTSetup: node, command, timeout=120, sudo=True, message=message ) + @staticmethod + def pci_driver_unbind_list(node, *pci_addrs): + """Unbind PCI devices from current driver on node. + + :param node: DUT node. + :param pci_addrs: PCI device addresses. + :type node: dict + :type pci_addrs: list + """ + for pci_addr in pci_addrs: + DUTSetup.pci_driver_unbind(node, pci_addr) + @staticmethod def pci_driver_bind(node, pci_addr, driver): """Bind PCI device to driver on node. @@ -481,61 +557,24 @@ class DUTSetup: def get_pci_dev_driver(node, pci_addr): """Get current PCI device driver on node. - .. note:: - # lspci -vmmks 0000:00:05.0 - Slot: 00:05.0 - Class: Ethernet controller - Vendor: Red Hat, Inc - Device: Virtio network device - SVendor: Red Hat, Inc - SDevice: Device 0001 - PhySlot: 5 - Driver: virtio-pci - :param node: DUT node. :param pci_addr: PCI device address. :type node: dict :type pci_addr: str :returns: Driver or None - :raises RuntimeError: If PCI rescan or lspci command execution failed. :raises RuntimeError: If it is not possible to get the interface driver information from the node. """ - ssh = SSH() - ssh.connect(node) - - for i in range(3): - logger.trace(f"Try number {i}: Get PCI device driver") - - cmd = f"lspci -vmmks {pci_addr}" - ret_code, stdout, _ = ssh.exec_command(cmd) - if int(ret_code): - raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'") - - for line in stdout.splitlines(): - if not line: - continue - name = None - value = None - try: - name, value = line.split(u"\t", 1) - except ValueError: - if name == u"Driver:": - return None - if name == u"Driver:": - return value - - if i < 2: - logger.trace( - f"Driver for PCI device {pci_addr} not found, " - f"executing pci rescan and retrying" - ) - cmd = u"sh -c \"echo 1 > /sys/bus/pci/rescan\"" - ret_code, _, _ = ssh.exec_command_sudo(cmd) - if int(ret_code) != 0: - raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'") - - return None + driver_path = f"/sys/bus/pci/devices/{pci_addr}/driver" + cmd = f"test -d {driver_path}" + ret_code, ret_val, _ = exec_cmd(node, cmd) + if int(ret_code): + # the directory doesn't exist which means the device is not bound + # to any driver + return None + cmd = f"basename $(readlink -f {driver_path})" + ret_val, _ = exec_cmd_no_error(node, cmd) + return ret_val.strip() @staticmethod def verify_kernel_module(node, module, force_load=False): @@ -637,7 +676,7 @@ class DUTSetup: ) # workaround to avoid installation of vpp-api-python exec_cmd_no_error( - node, u"rm -f {vpp_pkg_dir}vpp-api-python.deb", + node, f"rm -f {vpp_pkg_dir}vpp-api-python.deb", timeout=120, sudo=True ) exec_cmd_no_error( @@ -654,7 +693,7 @@ class DUTSetup: ) # workaround to avoid installation of vpp-api-python exec_cmd_no_error( - node, u"rm -f {vpp_pkg_dir}vpp-api-python.rpm", + node, f"rm -f {vpp_pkg_dir}vpp-api-python.rpm", timeout=120, sudo=True ) exec_cmd_no_error( @@ -704,176 +743,98 @@ class 
@@ -379,6 +443,18 @@
             node, command, timeout=120, sudo=True, message=message
         )
 
+    @staticmethod
+    def pci_driver_unbind_list(node, *pci_addrs):
+        """Unbind PCI devices from current driver on node.
+
+        :param node: DUT node.
+        :param pci_addrs: PCI device addresses.
+        :type node: dict
+        :type pci_addrs: list
+        """
+        for pci_addr in pci_addrs:
+            DUTSetup.pci_driver_unbind(node, pci_addr)
+
     @staticmethod
     def pci_driver_bind(node, pci_addr, driver):
         """Bind PCI device to driver on node.
@@ -481,61 +557,24 @@
     def get_pci_dev_driver(node, pci_addr):
         """Get current PCI device driver on node.
 
-        .. note::
-            # lspci -vmmks 0000:00:05.0
-            Slot:   00:05.0
-            Class:  Ethernet controller
-            Vendor: Red Hat, Inc
-            Device: Virtio network device
-            SVendor:        Red Hat, Inc
-            SDevice:        Device 0001
-            PhySlot:        5
-            Driver: virtio-pci
-
         :param node: DUT node.
         :param pci_addr: PCI device address.
         :type node: dict
        :type pci_addr: str
         :returns: Driver or None
-        :raises RuntimeError: If PCI rescan or lspci command execution failed.
         :raises RuntimeError: If it is not possible to get the interface
             driver information from the node.
         """
-        ssh = SSH()
-        ssh.connect(node)
-
-        for i in range(3):
-            logger.trace(f"Try number {i}: Get PCI device driver")
-
-            cmd = f"lspci -vmmks {pci_addr}"
-            ret_code, stdout, _ = ssh.exec_command(cmd)
-            if int(ret_code):
-                raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'")
-
-            for line in stdout.splitlines():
-                if not line:
-                    continue
-                name = None
-                value = None
-                try:
-                    name, value = line.split(u"\t", 1)
-                except ValueError:
-                    if name == u"Driver:":
-                        return None
-                if name == u"Driver:":
-                    return value
-
-            if i < 2:
-                logger.trace(
-                    f"Driver for PCI device {pci_addr} not found, "
-                    f"executing pci rescan and retrying"
-                )
-                cmd = u"sh -c \"echo 1 > /sys/bus/pci/rescan\""
-                ret_code, _, _ = ssh.exec_command_sudo(cmd)
-                if int(ret_code) != 0:
-                    raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'")
-
-        return None
+        driver_path = f"/sys/bus/pci/devices/{pci_addr}/driver"
+        cmd = f"test -d {driver_path}"
+        ret_code, ret_val, _ = exec_cmd(node, cmd)
+        if int(ret_code):
+            # the directory doesn't exist which means the device is not bound
+            # to any driver
+            return None
+        cmd = f"basename $(readlink -f {driver_path})"
+        ret_val, _ = exec_cmd_no_error(node, cmd)
+        return ret_val.strip()
 
     @staticmethod
     def verify_kernel_module(node, module, force_load=False):
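
The rewritten get_pci_dev_driver above drops lspci output parsing and PCI rescans in favour of a single sysfs fact: /sys/bus/pci/devices/<addr>/driver is a symlink to the bound driver, and its absence means the device is unbound. The same logic expressed locally (pci_driver_locally is a hypothetical helper):

    import os

    def pci_driver_locally(pci_addr):
        driver_link = f"/sys/bus/pci/devices/{pci_addr}/driver"
        if not os.path.isdir(driver_link):
            return None  # device present but not bound to any driver
        # The symlink target's basename is the driver name, e.g. "vfio-pci".
        return os.path.basename(os.path.realpath(driver_link))
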
@@ -637,7 +676,7 @@
             )
             # workaround to avoid installation of vpp-api-python
             exec_cmd_no_error(
-                node, u"rm -f {vpp_pkg_dir}vpp-api-python.deb",
+                node, f"rm -f {vpp_pkg_dir}vpp-api-python.deb",
                 timeout=120, sudo=True
             )
             exec_cmd_no_error(
@@ -654,7 +693,7 @@
             )
             # workaround to avoid installation of vpp-api-python
             exec_cmd_no_error(
-                node, u"rm -f {vpp_pkg_dir}vpp-api-python.rpm",
+                node, f"rm -f {vpp_pkg_dir}vpp-api-python.rpm",
                 timeout=120, sudo=True
             )
             exec_cmd_no_error(
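
The two rm -f fixes above are real bug fixes, not cosmetics: a u-prefixed literal does not interpolate, so the old commands tried to delete a file under the literal path {vpp_pkg_dir}... A small illustration (the directory value is invented for the example):

    vpp_pkg_dir = u"/var/cache/vpp/"  # example value only
    cmd_old = u"rm -f {vpp_pkg_dir}vpp-api-python.deb"  # braces stay verbatim
    cmd_new = f"rm -f {vpp_pkg_dir}vpp-api-python.deb"  # path is interpolated
    assert cmd_old == "rm -f {vpp_pkg_dir}vpp-api-python.deb"
    assert cmd_new == "rm -f /var/cache/vpp/vpp-api-python.deb"
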
@@ -704,176 +743,98 @@ class DUTSetup:
         return stdout.strip()
 
     @staticmethod
-    def get_huge_page_size(node):
-        """Get default size of huge pages in system.
+    def get_hugepages_info(node, hugesize=None):
+        """Get number of huge pages in system.
 
         :param node: Node in the topology.
+        :param hugesize: Size of hugepages. Default system huge size if None.
         :type node: dict
-        :returns: Default size of free huge pages in system.
-        :rtype: int
-        :raises RuntimeError: If reading failed for three times.
-        """
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                u"grep Hugepagesize /proc/meminfo | awk '{ print $2 }'"
-            )
-            if ret_code == 0:
-                try:
-                    huge_size = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading huge page size information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting huge page size information failed.")
-        return huge_size
-
-    @staticmethod
-    def get_huge_page_free(node, huge_size):
-        """Get number of free huge pages in system.
-
-        :param node: Node in the topology.
-        :param huge_size: Size of hugepages.
-        :type node: dict
-        :type huge_size: int
-        :returns: Number of free huge pages in system.
-        :rtype: int
-        :raises RuntimeError: If reading failed for three times.
-        """
-        # TODO: add numa aware option
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
-                f"free_hugepages"
-            )
-            if ret_code == 0:
-                try:
-                    huge_free = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading free huge pages information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting free huge pages information failed.")
-        return huge_free
-
-    @staticmethod
-    def get_huge_page_total(node, huge_size):
-        """Get total number of huge pages in system.
-
-        :param node: Node in the topology.
-        :param huge_size: Size of hugepages.
-        :type node: dict
-        :type huge_size: int
-        :returns: Total number of huge pages in system.
-        :rtype: int
-        :raises RuntimeError: If reading failed for three times.
+        :type hugesize: int
+        :returns: Number of huge pages in system.
+        :rtype: dict
+        :raises RuntimeError: If reading failed.
         """
-        # TODO: add numa aware option
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
-                f"nr_hugepages"
-            )
-            if ret_code == 0:
-                try:
-                    huge_total = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading total huge pages information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting total huge pages information failed.")
-        return huge_total
+        if not hugesize:
+            hugesize = "$(grep Hugepagesize /proc/meminfo | awk '{ print $2 }')"
+        command = f"cat /sys/kernel/mm/hugepages/hugepages-{hugesize}kB/*"
+        stdout, _ = exec_cmd_no_error(node, command)
+        try:
+            line = stdout.splitlines()
+            return {
+                "free_hugepages": int(line[0]),
+                "nr_hugepages": int(line[1]),
+                "nr_hugepages_mempolicy": int(line[2]),
+                "nr_overcommit_hugepages": int(line[3]),
+                "resv_hugepages": int(line[4]),
+                "surplus_hugepages": int(line[5])
+            }
+        except ValueError:
+            logger.trace(u"Reading huge pages information failed!")
 
     @staticmethod
-    def check_huge_page(node, huge_mnt, mem_size, allocate=False):
+    def check_huge_page(
+            node, huge_mnt, mem_size, hugesize=2048, allocate=False):
         """Check if there is enough HugePages in system. If allocate is set to
         true, try to allocate more HugePages.
 
         :param node: Node in the topology.
         :param huge_mnt: HugePage mount point.
         :param mem_size: Requested memory in MB.
+        :param hugesize: HugePage size in KB.
         :param allocate: Whether to allocate more memory if not enough.
         :type node: dict
         :type huge_mnt: str
-        :type mem_size: str
+        :type mem_size: int
+        :type hugesize: int
         :type allocate: bool
         :raises RuntimeError: Mounting hugetlbfs failed or not enough HugePages
             or increasing map count failed.
         """
-        # TODO: split function into smaller parts.
-        ssh = SSH()
-        ssh.connect(node)
-
-        # Get huge pages information
-        huge_size = DUTSetup.get_huge_page_size(node)
-        huge_free = DUTSetup.get_huge_page_free(node, huge_size)
-        huge_total = DUTSetup.get_huge_page_total(node, huge_size)
+        # Get huge pages information.
+        hugepages = DUTSetup.get_hugepages_info(node, hugesize=hugesize)
+
+        # Check if hugepages requested are available on node.
+        if hugepages[u"nr_overcommit_hugepages"]:
+            # If overcommit is used, we need to know how many additional pages
+            # we can allocate.
+            huge_available = hugepages[u"nr_overcommit_hugepages"] - \
+                hugepages[u"surplus_hugepages"]
+        else:
+            # Falling back to free_hugepages, which was used before for
+            # detection.
+            huge_available = hugepages[u"free_hugepages"]
 
-        # Check if memory requested is available on
-        mem_size = int(mem_size)
-        if (mem_size * 1024) > (huge_free * huge_size):
-            # If we want to allocate hugepage dynamically
+        if ((mem_size * 1024) // hugesize) > huge_available:
+            # If we want to allocate hugepage dynamically.
             if allocate:
-                mem_needed = (mem_size * 1024) - (huge_free * huge_size)
-                huge_to_allocate = ((mem_needed // huge_size) * 2) + huge_total
-                max_map_count = huge_to_allocate*4
-                # Increase maximum number of memory map areas a process may have
-                ret_code, _, _ = ssh.exec_command_sudo(
+                huge_needed = ((mem_size * 1024) // hugesize) - huge_available
+                huge_to_allocate = huge_needed + hugepages[u"nr_hugepages"]
+                max_map_count = huge_to_allocate * 4
+                # Check if huge pages mount point exist.
+                try:
+                    exec_cmd_no_error(node, u"fgrep 'hugetlbfs' /proc/mounts")
+                except RuntimeError:
+                    exec_cmd_no_error(node, f"mkdir -p {huge_mnt}", sudo=True)
+                    exec_cmd_no_error(
+                        node,
+                        f"mount -t hugetlbfs -o pagesize={hugesize}k none "
+                        f"{huge_mnt}",
+                        sudo=True)
+                # Increase maximum number of memory map areas for process.
+                exec_cmd_no_error(
+                    node,
                     f"echo \"{max_map_count}\" | "
-                    f"sudo tee /proc/sys/vm/max_map_count"
+                    f"sudo tee /proc/sys/vm/max_map_count",
+                    message=f"Increase map count failed on {node[u'host']}!"
                 )
-                if int(ret_code) != 0:
-                    raise RuntimeError(
-                        f"Increase map count failed on {node[u'host']}"
-                    )
-                # Increase hugepage count
-                ret_code, _, _ = ssh.exec_command_sudo(
+                # Increase hugepage count.
+                exec_cmd_no_error(
+                    node,
                     f"echo \"{huge_to_allocate}\" | "
-                    f"sudo tee /proc/sys/vm/nr_hugepages"
+                    f"sudo tee /proc/sys/vm/nr_hugepages",
+                    message=f"Mount huge pages failed on {node[u'host']}!"
                 )
-                if int(ret_code) != 0:
-                    raise RuntimeError(
-                        f"Mount huge pages failed on {node[u'host']}"
-                    )
-            # If we do not want to allocate dynamically end with error
+            # If we do not want to allocate dynamically end with error.
             else:
                 raise RuntimeError(
-                    f"Not enough free huge pages: {huge_free}, "
-                    f"{huge_free * huge_size} MB"
-                )
-        # Check if huge pages mount point exist
-        has_huge_mnt = False
-        ret_code, stdout, _ = ssh.exec_command(u"cat /proc/mounts")
-        if int(ret_code) == 0:
-            for line in stdout.splitlines():
-                # Try to find something like:
-                # none /mnt/huge hugetlbfs rw,realtime,pagesize=2048k 0 0
-                mount = line.split()
-                if mount[2] == u"hugetlbfs" and mount[1] == huge_mnt:
-                    has_huge_mnt = True
-                    break
-        # If huge page mount point not exist create one
-        if not has_huge_mnt:
-            ret_code, _, _ = ssh.exec_command_sudo(f"mkdir -p {huge_mnt}")
-            if int(ret_code) != 0:
-                raise RuntimeError(
-                    f"Create mount dir failed on {node[u'host']}"
-                )
-            ret_code, _, _ = ssh.exec_command_sudo(
-                f"mount -t hugetlbfs -o pagesize=2048k none {huge_mnt}"
-            )
-            if int(ret_code) != 0:
-                raise RuntimeError(
-                    f"Mount huge pages failed on {node[u'host']}"
+                    f"Not enough available huge pages: {huge_available}!"
                 )
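
To make the allocation branch above concrete, here is the arithmetic with illustrative numbers (none of these values come from the patch): requesting 4096 MB backed by 2048 kB pages needs 2048 pages; with 1024 available, 1024 more must be added on top of the current nr_hugepages.

    mem_size = 4096   # requested memory in MB
    hugesize = 2048   # hugepage size in KB
    hugepages = {     # shaped like get_hugepages_info() output
        u"free_hugepages": 1024,
        u"nr_hugepages": 1024,
        u"nr_overcommit_hugepages": 0,
        u"surplus_hugepages": 0,
    }

    # No overcommit configured, so availability falls back to free pages.
    huge_available = hugepages[u"free_hugepages"]        # 1024
    huge_needed = (mem_size * 1024) // hugesize          # 2048 pages requested
    if huge_needed > huge_available:
        huge_to_allocate = (
            huge_needed - huge_available + hugepages[u"nr_hugepages"]
        )                                                # 2048
        max_map_count = huge_to_allocate * 4             # 8192
        # These are the values the code writes to /proc/sys/vm/nr_hugepages
        # and /proc/sys/vm/max_map_count respectively.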