-# Copyright (c) 2019 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
"""DUT setup library."""
+from time import sleep
from robot.api import logger
from resources.libraries.python.Constants import Constants
-from resources.libraries.python.ssh import SSH, exec_cmd_no_error
+from resources.libraries.python.ssh import SSH, exec_cmd, exec_cmd_no_error
from resources.libraries.python.topology import NodeType, Topology
:type node: dict
:type service: str
"""
- command = u"echo $(< /tmp/*supervisor*.log)"\
+ command = u"cat /tmp/*supervisor*.log"\
if DUTSetup.running_in_container(node) \
- else f"journalctl --no-pager --unit={service} " \
- f"--since=\"$(echo `systemctl show -p ActiveEnterTimestamp " \
- f"{service}` | awk \'{{print $2 $3}}\')\""
+ else f"journalctl --no-pager _SYSTEMD_INVOCATION_ID=$(systemctl " \
+ f"show -p InvocationID --value {service})"
+
message = f"Node {node[u'host']} failed to get logs from unit {service}"
exec_cmd_no_error(
:type node: dict
:type service: str
"""
+ DUTSetup.get_service_logs(node, service)
+
command = f"supervisorctl stop {service}" \
if DUTSetup.running_in_container(node) \
else f"service {service} stop"
node, command, timeout=180, sudo=True, message=message
)
- DUTSetup.get_service_logs(node, service)
-
@staticmethod
def stop_service_on_all_duts(nodes, service):
"""Stop the named service on all DUTs.
DUTSetup.stop_service(node, service)
@staticmethod
- def get_vpp_pid(node):
- """Get PID of running VPP process.
+ def kill_program(node, program, namespace=None):
+ """Kill program on the specified topology node.
+
+ :param node: Topology node.
+ :param program: Program name.
+ :param namespace: Namespace program is running in.
+ :type node: dict
+ :type program: str
+ :type namespace: str
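+
+ Example (program and namespace names are illustrative)::
+
+ DUTSetup.kill_program(node, u"testpmd", namespace=u"red")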
+ """
+ host = node[u"host"]
+ cmd_timeout = 5
+ if namespace in (None, u"default"):
+ shell_cmd = u"sh -c"
+ else:
+ shell_cmd = f"ip netns exec {namespace} sh -c"
+
+ pgrep_cmd = f"{shell_cmd} \'pgrep -c {program}\'"
+ _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+ sudo=True)
+ if int(stdout) == 0:
+ logger.trace(f"{program} is not running on {host}")
+ return
+ exec_cmd(node, f"{shell_cmd} \'pkill {program}\'",
+ timeout=cmd_timeout, sudo=True)
+ for attempt in range(5):
+ _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+ sudo=True)
+ if int(stdout) == 0:
+ logger.trace(f"Attempt {attempt}: {program} is dead on {host}")
+ return
+ sleep(1)
+ logger.trace(f"SIGKILLing {program} on {host}")
+ exec_cmd(node, f"{shell_cmd} \'pkill -9 {program}\'",
+ timeout=cmd_timeout, sudo=True)
+
+ @staticmethod
+ def verify_program_installed(node, program):
+ """Verify that program is installed on the specified topology node.
+
+ :param node: Topology node.
+ :param program: Program name.
+ :type node: dict
+ :type program: str
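+
+ Example (program name is illustrative)::
+
+ DUTSetup.verify_program_installed(node, u"docker")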
+ """
+ cmd = f"command -v {program}"
+ exec_cmd_no_error(node, cmd, message=f"{program} is not installed")
+
+ @staticmethod
+ def get_pid(node, process):
+ """Get PID of running process.
:param node: DUT node.
+ :param process: Process name.
:type node: dict
+ :type process: str
- :returns: PID
- :rtype: int
+ :returns: PID list.
+ :rtype: list of int
:raises RuntimeError: If it is not possible to get the PID.
retval = None
for i in range(3):
- logger.trace(f"Try {i}: Get VPP PID")
- ret_code, stdout, stderr = ssh.exec_command(u"pidof vpp")
+ logger.trace(f"Try {i}: Get {process} PID")
+ ret_code, stdout, stderr = ssh.exec_command(f"pidof {process}")
if int(ret_code):
raise RuntimeError(
- f"Not possible to get PID of VPP process on node: "
+ f"Not possible to get PID of {process} process on node: "
f"{node[u'host']}\n {stdout + stderr}"
)
pid_list = stdout.split()
if len(pid_list) == 1:
- retval = int(stdout)
- elif not pid_list:
- logger.debug(f"No VPP PID found on node {node[u'host']}")
+ return [int(stdout)]
+ if not pid_list:
+ logger.debug(f"No {process} PID found on node {node[u'host']}")
continue
- else:
- logger.debug(
- f"More then one VPP PID found on node {node[u'host']}"
- )
- retval = [int(pid) for pid in pid_list]
+ logger.debug(
+ f"More than one {process} PID found on node {node[u'host']}"
+ )
+ return [int(pid) for pid in pid_list]
return retval
pids = dict()
for node in nodes.values():
if node[u"type"] == NodeType.DUT:
- pids[node[u"host"]] = DUTSetup.get_vpp_pid(node)
+ pids[node[u"host"]] = DUTSetup.get_pid(node, u"vpp")
return pids
@staticmethod
:type pf_pci_addr: str
:type vf_id: int
:returns: Virtual Function PCI address.
- :rtype: int
+ :rtype: str
:raises RuntimeError: If failed to get Virtual Function PCI address.
"""
command = f"sh -c \"basename $(readlink " \
:type numvfs: int
:raises RuntimeError: Failed to create VFs on PCI.
"""
+ cmd = f"test -f /sys/bus/pci/devices/{pf_pci_addr}/sriov_numvfs"
+ sriov_unsupported, _, _ = exec_cmd(node, cmd)
+ # if sriov_numvfs doesn't exist, then sriov_unsupported != 0
+ if int(sriov_unsupported):
+ if numvfs == 0:
+ # sriov is not supported and we want 0 VFs
+ # no need to do anything
+ return
+ else:
+ raise RuntimeError(
+ f"Can't configure {numvfs} VFs on {pf_pci_addr} device "
+ f"on {node[u'host']} since it doesn't support SR-IOV."
+ )
+
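+ # Writing N to sriov_numvfs makes the PF driver create N VFs;
+ # writing 0 removes them (standard sysfs SR-IOV interface).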
pci = pf_pci_addr.replace(u":", r"\:")
command = f"sh -c \"echo {numvfs} | " \
f"tee /sys/bus/pci/devices/{pci}/sriov_numvfs\""
node, command, timeout=120, sudo=True, message=message
)
+ @staticmethod
+ def pci_driver_unbind_list(node, *pci_addrs):
+ """Unbind PCI devices from current driver on node.
+
+ :param node: DUT node.
+ :param pci_addrs: PCI device addresses.
+ :type node: dict
+ :type pci_addrs: list
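+
+ Example (addresses are illustrative)::
+
+ DUTSetup.pci_driver_unbind_list(
+ node, u"0000:01:00.0", u"0000:01:00.1")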
+ """
+ for pci_addr in pci_addrs:
+ DUTSetup.pci_driver_unbind(node, pci_addr)
+
@staticmethod
def pci_driver_bind(node, pci_addr, driver):
"""Bind PCI device to driver on node.
def get_pci_dev_driver(node, pci_addr):
"""Get current PCI device driver on node.
- .. note::
- # lspci -vmmks 0000:00:05.0
- Slot: 00:05.0
- Class: Ethernet controller
- Vendor: Red Hat, Inc
- Device: Virtio network device
- SVendor: Red Hat, Inc
- SDevice: Device 0001
- PhySlot: 5
- Driver: virtio-pci
-
:param node: DUT node.
:param pci_addr: PCI device address.
:type node: dict
:type pci_addr: str
:returns: Driver or None
+ :rtype: str or None
- :raises RuntimeError: If PCI rescan or lspci command execution failed.
:raises RuntimeError: If it is not possible to get the interface driver
information from the node.
"""
- ssh = SSH()
- ssh.connect(node)
-
- for i in range(3):
- logger.trace(f"Try number {i}: Get PCI device driver")
-
- cmd = f"lspci -vmmks {pci_addr}"
- ret_code, stdout, _ = ssh.exec_command(cmd)
- if int(ret_code):
- raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'")
-
- for line in stdout.splitlines():
- if not line:
- continue
- name = None
- value = None
- try:
- name, value = line.split(u"\t", 1)
- except ValueError:
- if name == u"Driver:":
- return None
- if name == u"Driver:":
- return value
-
- if i < 2:
- logger.trace(
- f"Driver for PCI device {pci_addr} not found, "
- f"executing pci rescan and retrying"
- )
- cmd = u"sh -c \"echo 1 > /sys/bus/pci/rescan\""
- ret_code, _, _ = ssh.exec_command_sudo(cmd)
- if int(ret_code) != 0:
- raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'")
-
- return None
+ driver_path = f"/sys/bus/pci/devices/{pci_addr}/driver"
+ cmd = f"test -d {driver_path}"
+ ret_code, ret_val, _ = exec_cmd(node, cmd)
+ if int(ret_code):
+ # The directory does not exist, which means the device is not
+ # bound to any driver.
+ return None
+ else:
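+ # The 'driver' entry is a symlink into /sys/bus/pci/drivers/<name>,
+ # e.g. a device bound to vfio-pci resolves to ".../drivers/vfio-pci".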
+ cmd = f"basename $(readlink -f {driver_path})"
+ ret_val, _ = exec_cmd_no_error(node, cmd)
+ return ret_val.strip()
@staticmethod
def verify_kernel_module(node, module, force_load=False):
node, f"rpm -ivh {vpp_pkg_dir}*.rpm",
timeout=120, sudo=True, message=message
)
- exec_cmd_no_error(node, u"rpm -qai *vpp*", sudo=True)
+ exec_cmd_no_error(node, u"rpm -qai '*vpp*'", sudo=True)
DUTSetup.restart_service(node, Constants.VPP_UNIT)
@staticmethod
return stdout.strip()
@staticmethod
- def get_huge_page_size(node):
- """Get default size of huge pages in system.
-
- :param node: Node in the topology.
- :type node: dict
- :returns: Default size of free huge pages in system.
- :rtype: int
- :raises RuntimeError: If reading failed for three times.
- """
- ssh = SSH()
- ssh.connect(node)
-
- for _ in range(3):
- ret_code, stdout, _ = ssh.exec_command_sudo(
- u"grep Hugepagesize /proc/meminfo | awk '{ print $2 }'"
- )
- if ret_code == 0:
- try:
- huge_size = int(stdout)
- except ValueError:
- logger.trace(u"Reading huge page size information failed")
- else:
- break
- else:
- raise RuntimeError(u"Getting huge page size information failed.")
- return huge_size
-
- @staticmethod
- def get_huge_page_free(node, huge_size):
- """Get number of free huge pages in system.
-
- :param node: Node in the topology.
- :param huge_size: Size of hugepages.
- :type node: dict
- :type huge_size: int
- :returns: Number of free huge pages in system.
- :rtype: int
- :raises RuntimeError: If reading failed for three times.
- """
- # TODO: add numa aware option
- ssh = SSH()
- ssh.connect(node)
-
- for _ in range(3):
- ret_code, stdout, _ = ssh.exec_command_sudo(
- f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
- f"free_hugepages"
- )
- if ret_code == 0:
- try:
- huge_free = int(stdout)
- except ValueError:
- logger.trace(u"Reading free huge pages information failed")
- else:
- break
- else:
- raise RuntimeError(u"Getting free huge pages information failed.")
- return huge_free
-
- @staticmethod
- def get_huge_page_total(node, huge_size):
- """Get total number of huge pages in system.
+ def get_hugepages_info(node, hugesize=None):
+ """Get number of huge pages in system.
:param node: Node in the topology.
- :param huge_size: Size of hugepages.
+ :param hugesize: Hugepage size in kB. If None, the system default
+ size is used.
:type node: dict
- :type huge_size: int
- :returns: Total number of huge pages in system.
- :rtype: int
- :raises RuntimeError: If reading failed for three times.
+ :type hugesize: int
+ :returns: Huge page counters (free, total, mempolicy, overcommit,
+ reserved, surplus) read from sysfs.
+ :rtype: dict
+ :raises RuntimeError: If reading failed.
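+
+ Example of the returned dict (values are illustrative)::
+
+ {
+ "free_hugepages": 1024,
+ "nr_hugepages": 1024,
+ "nr_hugepages_mempolicy": 1024,
+ "nr_overcommit_hugepages": 0,
+ "resv_hugepages": 0,
+ "surplus_hugepages": 0
+ }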
"""
- # TODO: add numa aware option
- ssh = SSH()
- ssh.connect(node)
-
- for _ in range(3):
- ret_code, stdout, _ = ssh.exec_command_sudo(
- f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
- f"nr_hugepages"
- )
- if ret_code == 0:
- try:
- huge_total = int(stdout)
- except ValueError:
- logger.trace(u"Reading total huge pages information failed")
- else:
- break
- else:
- raise RuntimeError(u"Getting total huge pages information failed.")
- return huge_total
+ if not hugesize:
+ hugesize = "$(grep Hugepagesize /proc/meminfo | awk '{ print $2 }')"
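+ # The shell glob expands alphabetically, so the counters arrive in
+ # the order: free_hugepages, nr_hugepages, nr_hugepages_mempolicy,
+ # nr_overcommit_hugepages, resv_hugepages, surplus_hugepages
+ # (assumes the directory contains exactly these files).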
+ command = f"cat /sys/kernel/mm/hugepages/hugepages-{hugesize}kB/*"
+ stdout, _ = exec_cmd_no_error(node, command)
+ try:
+ lines = stdout.splitlines()
+ return {
+ "free_hugepages": int(lines[0]),
+ "nr_hugepages": int(lines[1]),
+ "nr_hugepages_mempolicy": int(lines[2]),
+ "nr_overcommit_hugepages": int(lines[3]),
+ "resv_hugepages": int(lines[4]),
+ "surplus_hugepages": int(lines[5])
+ }
+ except (ValueError, IndexError) as err:
+ raise RuntimeError(
+ u"Reading huge pages information failed!"
+ ) from err
@staticmethod
- def check_huge_page(node, huge_mnt, mem_size, allocate=False):
+ def check_huge_page(
+ node, huge_mnt, mem_size, hugesize=2048, allocate=False):
"""Check if there is enough HugePages in system. If allocate is set to
true, try to allocate more HugePages.
:param node: Node in the topology.
:param huge_mnt: HugePage mount point.
- :param mem_size: Requested memory in MB.
+ :param mem_size: Requested memory in MB.
+ :param hugesize: HugePage size in KB.
:param allocate: Whether to allocate more memory if not enough.
:type node: dict
:type huge_mnt: str
- :type mem_size: str
+ :type mem_size: int
+ :type hugesize: int
:type allocate: bool
:raises RuntimeError: Mounting hugetlbfs failed or not enough HugePages
or increasing map count failed.
"""
- # TODO: split function into smaller parts.
- ssh = SSH()
- ssh.connect(node)
-
- # Get huge pages information
- huge_size = DUTSetup.get_huge_page_size(node)
- huge_free = DUTSetup.get_huge_page_free(node, huge_size)
- huge_total = DUTSetup.get_huge_page_total(node, huge_size)
+ # Get huge pages information.
+ hugepages = DUTSetup.get_hugepages_info(node, hugesize=hugesize)
+
+ # Check if hugepages requested are available on node.
+ if hugepages[u"nr_overcommit_hugepages"]:
+ # If overcommit is used, the number of additionally allocatable
+ # pages is the overcommit limit minus the surplus pages in use.
+ huge_available = hugepages[u"nr_overcommit_hugepages"] - \
+ hugepages[u"surplus_hugepages"]
+ else:
+ # Fall back to free_hugepages, which was used for this check
+ # before overcommit support was added.
+ huge_available = hugepages[u"free_hugepages"]
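+
+ # Needed pages = (mem_size [MB] * 1024) // hugesize [kB];
+ # e.g. 1024 MB with 2048 kB hugepages requires 512 pages.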
- # Check if memory requested is available on
- mem_size = int(mem_size)
- if (mem_size * 1024) > (huge_free * huge_size):
- # If we want to allocate hugepage dynamically
+ if ((mem_size * 1024) // hugesize) > huge_available:
+ # If we want to allocate hugepage dynamically.
if allocate:
- mem_needed = (mem_size * 1024) - (huge_free * huge_size)
- huge_to_allocate = ((mem_needed / huge_size) * 2) + huge_total
- max_map_count = huge_to_allocate*4
- # Increase maximum number of memory map areas a process may have
- ret_code, _, _ = ssh.exec_command_sudo(
+ huge_needed = ((mem_size * 1024) // hugesize) - huge_available
+ huge_to_allocate = huge_needed + hugepages[u"nr_hugepages"]
+ max_map_count = huge_to_allocate * 4
+ # Check if any hugetlbfs is mounted; if not, create and mount
+ # huge_mnt.
+ try:
+ exec_cmd_no_error(node, u"fgrep 'hugetlbfs' /proc/mounts")
+ except RuntimeError:
+ exec_cmd_no_error(node, f"mkdir -p {huge_mnt}", sudo=True)
+ exec_cmd_no_error(
+ node,
+ f"mount -t hugetlbfs -o pagesize={hugesize}k none "
+ f"{huge_mnt}",
+ sudo=True)
+ # Increase maximum number of memory map areas for process.
+ exec_cmd_no_error(
+ node,
f"echo \"{max_map_count}\" | "
- f"sudo tee /proc/sys/vm/max_map_count"
+ f"sudo tee /proc/sys/vm/max_map_count",
+ message=f"Increase map count failed on {node[u'host']}!"
)
- if int(ret_code) != 0:
- raise RuntimeError(
- f"Increase map count failed on {node[u'host']}"
- )
- # Increase hugepage count
- ret_code, _, _ = ssh.exec_command_sudo(
+ # Increase hugepage count.
+ exec_cmd_no_error(
+ node,
f"echo \"{huge_to_allocate}\" | "
- f"sudo tee /proc/sys/vm/nr_hugepages"
+ f"sudo tee /proc/sys/vm/nr_hugepages",
+ message=f"Mount huge pages failed on {node[u'host']}!"
)
- if int(ret_code) != 0:
- raise RuntimeError(
- f"Mount huge pages failed on {node[u'host']}"
- )
- # If we do not want to allocate dynamically end with error
+ # If dynamic allocation is not allowed, end with an error.
else:
raise RuntimeError(
- f"Not enough free huge pages: {huge_free}, "
- f"{huge_free * huge_size} MB"
- )
- # Check if huge pages mount point exist
- has_huge_mnt = False
- ret_code, stdout, _ = ssh.exec_command(u"cat /proc/mounts")
- if int(ret_code) == 0:
- for line in stdout.splitlines():
- # Try to find something like:
- # none /mnt/huge hugetlbfs rw,realtime,pagesize=2048k 0 0
- mount = line.split()
- if mount[2] == u"hugetlbfs" and mount[1] == huge_mnt:
- has_huge_mnt = True
- break
- # If huge page mount point not exist create one
- if not has_huge_mnt:
- ret_code, _, _ = ssh.exec_command_sudo(f"mkdir -p {huge_mnt}")
- if int(ret_code) != 0:
- raise RuntimeError(
- f"Create mount dir failed on {node[u'host']}"
- )
- ret_code, _, _ = ssh.exec_command_sudo(
- f"mount -t hugetlbfs -o pagesize=2048k none {huge_mnt}"
- )
- if int(ret_code) != 0:
- raise RuntimeError(
- f"Mount huge pages failed on {node[u'host']}"
+ f"Not enough availablehuge pages: {huge_available}!"
)