X-Git-Url: https://gerrit.fd.io/r/gitweb?p=csit.git;a=blobdiff_plain;f=resources%2Flibraries%2Fpython%2FDUTSetup.py;h=396029a04f7164abf55418e44373ddc946505ae9;hp=1cca9748932f3cd1a4d654fd5fe8522e6f2c5cff;hb=HEAD;hpb=d68951ac245150eeefa6e0f4156e4c1b5c9e9325

diff --git a/resources/libraries/python/DUTSetup.py b/resources/libraries/python/DUTSetup.py
index 1cca974893..f9758c5f9f 100644
--- a/resources/libraries/python/DUTSetup.py
+++ b/resources/libraries/python/DUTSetup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at:
@@ -13,10 +13,10 @@
 
 """DUT setup library."""
 
+from time import sleep
 from robot.api import logger
 
-from resources.libraries.python.Constants import Constants
-from resources.libraries.python.ssh import SSH, exec_cmd_no_error
+from resources.libraries.python.ssh import exec_cmd, exec_cmd_no_error
 from resources.libraries.python.topology import NodeType, Topology
 
 
@@ -32,11 +32,12 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
-        command = u"echo $(< /tmp/*supervisor*.log)"\
-            if DUTSetup.running_in_container(node) \
-            else f"journalctl --no-pager --unit={service} " \
-            f"--since=\"$(echo `systemctl show -p ActiveEnterTimestamp " \
-            f"{service}` | awk \'{{print $2 $3}}\')\""
+        if DUTSetup.running_in_container(node):
+            return
+        command = (
+            f"journalctl --no-pager _SYSTEMD_INVOCATION_ID=$(systemctl "
+            f"show -p InvocationID --value {service})"
+        )
 
         message = f"Node {node[u'host']} failed to get logs from unit {service}"
 
         exec_cmd_no_error(
@@ -65,9 +66,10 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
-        command = f"supervisorctl restart {service}" \
-            if DUTSetup.running_in_container(node) \
-            else f"service {service} restart"
+        if DUTSetup.running_in_container(node):
+            command = f"supervisorctl restart {service}"
+        else:
+            command = f"systemctl restart {service}"
 
         message = f"Node {node[u'host']} failed to restart service {service}"
 
         exec_cmd_no_error(
@@ -98,10 +100,10 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
-        # TODO: change command to start once all parent function updated.
-        command = f"supervisorctl restart {service}" \
-            if DUTSetup.running_in_container(node) \
-            else f"service {service} restart"
+        if DUTSetup.running_in_container(node):
+            command = f"supervisorctl restart {service}"
+        else:
+            command = f"systemctl restart {service}"
 
         message = f"Node {node[u'host']} failed to start service {service}"
 
         exec_cmd_no_error(
@@ -132,17 +134,18 @@ class DUTSetup:
         :type node: dict
         :type service: str
         """
-        command = f"supervisorctl stop {service}" \
-            if DUTSetup.running_in_container(node) \
-            else f"service {service} stop"
+        DUTSetup.get_service_logs(node, service)
+
+        if DUTSetup.running_in_container(node):
+            command = f"supervisorctl stop {service}"
+        else:
+            command = f"systemctl stop {service}"
 
         message = f"Node {node[u'host']} failed to stop service {service}"
 
         exec_cmd_no_error(
             node, command, timeout=180, sudo=True, message=message
         )
-        DUTSetup.get_service_logs(node, service)
-
     @staticmethod
     def stop_service_on_all_duts(nodes, service):
         """Stop the named service on all DUTs.
@@ -157,42 +160,74 @@ class DUTSetup:
             DUTSetup.stop_service(node, service)
 
     @staticmethod
-    def get_vpp_pid(node):
-        """Get PID of running VPP process.
+    def kill_program(node, program, namespace=None):
+        """Kill program on the specified topology node.
 
-        :param node: DUT node.
+        :param node: Topology node.
+        :param program: Program name.
+        :param namespace: Namespace program is running in.
         :type node: dict
-        :returns: PID
-        :rtype: int
-        :raises RuntimeError: If it is not possible to get the PID.
+        :type program: str
+        :type namespace: str
         """
-        ssh = SSH()
-        ssh.connect(node)
+        host = node[u"host"]
+        cmd_timeout = 5
+        if namespace in (None, u"default"):
+            shell_cmd = u"sh -c"
+        else:
+            shell_cmd = f"ip netns exec {namespace} sh -c"
+
+        pgrep_cmd = f"{shell_cmd} \'pgrep -c {program}\'"
+        _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+                                sudo=True)
+        if int(stdout) == 0:
+            logger.trace(f"{program} is not running on {host}")
+            return
+        exec_cmd(node, f"{shell_cmd} \'pkill {program}\'",
+                 timeout=cmd_timeout, sudo=True)
+        for attempt in range(5):
+            _, stdout, _ = exec_cmd(node, pgrep_cmd, timeout=cmd_timeout,
+                                    sudo=True)
+            if int(stdout) == 0:
+                logger.trace(f"Attempt {attempt}: {program} is dead on {host}")
+                return
+            sleep(1)
+        logger.trace(f"SIGKILLing {program} on {host}")
+        exec_cmd(node, f"{shell_cmd} \'pkill -9 {program}\'",
+                 timeout=cmd_timeout, sudo=True)
 
-        retval = None
-        for i in range(3):
-            logger.trace(f"Try {i}: Get VPP PID")
-            ret_code, stdout, stderr = ssh.exec_command(u"pidof vpp")
+    @staticmethod
+    def verify_program_installed(node, program):
+        """Verify that program is installed on the specified topology node.
 
-            if int(ret_code):
-                raise RuntimeError(
-                    f"Not possible to get PID of VPP process on node: "
-                    f"{node[u'host']}\n {stdout + stderr}"
-                )
+        :param node: Topology node.
+        :param program: Program name.
+        :type node: dict
+        :type program: str
+        """
+        cmd = f"command -v {program}"
+        exec_cmd_no_error(node, cmd, message=f"{program} is not installed")
 
-            pid_list = stdout.split()
-            if len(pid_list) == 1:
-                retval = int(stdout)
-            elif not pid_list:
-                logger.debug(f"No VPP PID found on node {node[u'host']}")
-                continue
-            else:
-                logger.debug(
-                    f"More then one VPP PID found on node {node[u'host']}"
-                )
-                retval = [int(pid) for pid in pid_list]
+    @staticmethod
+    def get_pid(node, process, retries=3):
+        """Get PID of a running process.
 
-        return retval
+        :param node: DUT node.
+        :param process: Process name.
+        :param retries: How many times to retry on failure.
+        :type node: dict
+        :type process: str
+        :type retries: int
+        :returns: PIDs of the running process.
+        :rtype: list of int
+        :raises RuntimeError: If it is not possible to get the PID.
+        """
+        cmd = f"pidof {process}"
+        stdout, _ = exec_cmd_no_error(
+            node, cmd, retries=retries,
+            message=f"No {process} PID found on node {node[u'host']}")
+        pid_list = stdout.split()
+        return [int(pid) for pid in pid_list]
 
     @staticmethod
     def get_vpp_pids(nodes):
@@ -206,84 +241,9 @@ class DUTSetup:
         pids = dict()
         for node in nodes.values():
             if node[u"type"] == NodeType.DUT:
-                pids[node[u"host"]] = DUTSetup.get_vpp_pid(node)
+                pids[node[u"host"]] = DUTSetup.get_pid(node, u"vpp")
         return pids
 
-    @staticmethod
-    def crypto_device_verify(node, crypto_type, numvfs, force_init=False):
-        """Verify if Crypto QAT device virtual functions are initialized on all
-        DUTs. If parameter force initialization is set to True, then try to
-        initialize or remove VFs on QAT.
-
-        :param node: DUT node.
-        :crypto_type: Crypto device type - HW_DH895xcc or HW_C3xxx.
-        :param numvfs: Number of VFs to initialize, 0 - disable the VFs.
-        :param force_init: If True then try to initialize to specific value.
- :type node: dict - :type crypto_type: string - :type numvfs: int - :type force_init: bool - :returns: nothing - :raises RuntimeError: If QAT VFs are not created and force init is set - to False. - """ - pci_addr = Topology.get_cryptodev(node) - sriov_numvfs = DUTSetup.get_sriov_numvfs(node, pci_addr) - - if sriov_numvfs != numvfs: - if force_init: - # QAT is not initialized and we want to initialize with numvfs - DUTSetup.crypto_device_init(node, crypto_type, numvfs) - else: - raise RuntimeError( - f"QAT device failed to create VFs on {node[u'host']}" - ) - - @staticmethod - def crypto_device_init(node, crypto_type, numvfs): - """Init Crypto QAT device virtual functions on DUT. - - :param node: DUT node. - :crypto_type: Crypto device type - HW_DH895xcc or HW_C3xxx. - :param numvfs: Number of VFs to initialize, 0 - disable the VFs. - :type node: dict - :type crypto_type: string - :type numvfs: int - :returns: nothing - :raises RuntimeError: If failed to stop VPP or QAT failed to initialize. - """ - if crypto_type == u"HW_DH895xcc": - kernel_mod = u"qat_dh895xcc" - kernel_drv = u"dh895xcc" - elif crypto_type == u"HW_C3xxx": - kernel_mod = u"qat_c3xxx" - kernel_drv = u"c3xxx" - else: - raise RuntimeError( - f"Unsupported crypto device type on {node[u'host']}" - ) - - pci_addr = Topology.get_cryptodev(node) - - # QAT device must be re-bound to kernel driver before initialization. - DUTSetup.verify_kernel_module(node, kernel_mod, force_load=True) - - # Stop VPP to prevent deadlock. - DUTSetup.stop_service(node, Constants.VPP_UNIT) - - current_driver = DUTSetup.get_pci_dev_driver( - node, pci_addr.replace(u":", r"\:") - ) - if current_driver is not None: - DUTSetup.pci_driver_unbind(node, pci_addr) - - # Bind to kernel driver. - DUTSetup.pci_driver_bind(node, pci_addr, kernel_drv) - - # Initialize QAT VFs. - if numvfs > 0: - DUTSetup.set_sriov_numvfs(node, pci_addr, numvfs) - @staticmethod def get_virtfn_pci_addr(node, pf_pci_addr, vf_id): """Get PCI address of Virtual Function. @@ -295,7 +255,7 @@ class DUTSetup: :type pf_pci_addr: str :type vf_id: int :returns: Virtual Function PCI address. - :rtype: int + :rtype: str :raises RuntimeError: If failed to get Virtual Function PCI address. """ command = f"sh -c \"basename $(readlink " \ @@ -338,21 +298,37 @@ class DUTSetup: return sriov_numvfs @staticmethod - def set_sriov_numvfs(node, pf_pci_addr, numvfs=0): + def set_sriov_numvfs(node, pf_pci_addr, path="devices", numvfs=0): """Init or reset SR-IOV virtual functions by setting its number on PCI device on DUT. Setting to zero removes all VFs. :param node: DUT node. :param pf_pci_addr: Physical Function PCI device address. + :param path: Either device or driver. :param numvfs: Number of VFs to initialize, 0 - removes the VFs. :type node: dict :type pf_pci_addr: str + :type path: str :type numvfs: int :raises RuntimeError: Failed to create VFs on PCI. """ + cmd = f"test -f /sys/bus/pci/{path}/{pf_pci_addr}/sriov_numvfs" + sriov_unsupported, _, _ = exec_cmd(node, cmd) + # if sriov_numvfs doesn't exist, then sriov_unsupported != 0 + if int(sriov_unsupported): + if numvfs == 0: + # sriov is not supported and we want 0 VFs + # no need to do anything + return + + raise RuntimeError( + f"Can't configure {numvfs} VFs on {pf_pci_addr} device " + f"on {node[u'host']} since it doesn't support SR-IOV." 
+ ) + pci = pf_pci_addr.replace(u":", r"\:") command = f"sh -c \"echo {numvfs} | " \ - f"tee /sys/bus/pci/devices/{pci}/sriov_numvfs\"" + f"tee /sys/bus/pci/{path}/{pci}/sriov_numvfs\"" message = f"Failed to create {numvfs} VFs on {pf_pci_addr} device " \ f"on {node[u'host']}" @@ -379,6 +355,25 @@ class DUTSetup: node, command, timeout=120, sudo=True, message=message ) + @staticmethod + def unbind_pci_devices_from_other_driver(node, driver, *pci_addrs): + """Unbind PCI devices from driver other than input driver on node. + + :param node: DUT node. + :param driver: Driver to not unbind from. If None or empty string, + will attempt to unbind from the current driver. + :param pci_addrs: PCI device addresses. + :type node: dict + :type driver: str + :type pci_addrs: list + """ + for pci_addr in pci_addrs: + cur_driver = DUTSetup.get_pci_dev_driver(node, pci_addr) + if not cur_driver: + return + if not driver or cur_driver != driver: + DUTSetup.pci_driver_unbind(node, pci_addr) + @staticmethod def pci_driver_bind(node, pci_addr, driver): """Bind PCI device to driver on node. @@ -481,61 +476,24 @@ class DUTSetup: def get_pci_dev_driver(node, pci_addr): """Get current PCI device driver on node. - .. note:: - # lspci -vmmks 0000:00:05.0 - Slot: 00:05.0 - Class: Ethernet controller - Vendor: Red Hat, Inc - Device: Virtio network device - SVendor: Red Hat, Inc - SDevice: Device 0001 - PhySlot: 5 - Driver: virtio-pci - :param node: DUT node. :param pci_addr: PCI device address. :type node: dict :type pci_addr: str :returns: Driver or None - :raises RuntimeError: If PCI rescan or lspci command execution failed. :raises RuntimeError: If it is not possible to get the interface driver information from the node. """ - ssh = SSH() - ssh.connect(node) - - for i in range(3): - logger.trace(f"Try number {i}: Get PCI device driver") - - cmd = f"lspci -vmmks {pci_addr}" - ret_code, stdout, _ = ssh.exec_command(cmd) - if int(ret_code): - raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'") - - for line in stdout.splitlines(): - if not line: - continue - name = None - value = None - try: - name, value = line.split(u"\t", 1) - except ValueError: - if name == u"Driver:": - return None - if name == u"Driver:": - return value - - if i < 2: - logger.trace( - f"Driver for PCI device {pci_addr} not found, " - f"executing pci rescan and retrying" - ) - cmd = u"sh -c \"echo 1 > /sys/bus/pci/rescan\"" - ret_code, _, _ = ssh.exec_command_sudo(cmd) - if int(ret_code) != 0: - raise RuntimeError(f"'{cmd}' failed on '{node[u'host']}'") - - return None + driver_path = f"/sys/bus/pci/devices/{pci_addr}/driver" + cmd = f"test -d {driver_path}" + ret_code, ret_val, _ = exec_cmd(node, cmd) + if int(ret_code): + # the directory doesn't exist which means the device is not bound + # to any driver + return None + cmd = f"basename $(readlink -f {driver_path})" + ret_val, _ = exec_cmd_no_error(node, cmd) + return ret_val.strip() @staticmethod def verify_kernel_module(node, module, force_load=False): @@ -610,60 +568,6 @@ class DUTSetup: exec_cmd_no_error(node, command, timeout=30, sudo=True, message=message) - @staticmethod - def install_vpp_on_all_duts(nodes, vpp_pkg_dir): - """Install VPP on all DUT nodes. Start the VPP service in case of - systemd is not available or does not support autostart. - - :param nodes: Nodes in the topology. - :param vpp_pkg_dir: Path to directory where VPP packages are stored. - :type nodes: dict - :type vpp_pkg_dir: str - :raises RuntimeError: If failed to remove or install VPP. 
- """ - for node in nodes.values(): - message = f"Failed to install VPP on host {node[u'host']}!" - if node[u"type"] == NodeType.DUT: - command = u"ln -s /dev/null /etc/sysctl.d/80-vpp.conf || true" - exec_cmd_no_error(node, command, sudo=True) - - command = u". /etc/lsb-release; echo \"${DISTRIB_ID}\"" - stdout, _ = exec_cmd_no_error(node, command) - - if stdout.strip() == u"Ubuntu": - exec_cmd_no_error( - node, u"apt-get purge -y '*vpp*' || true", - timeout=120, sudo=True - ) - # workaround to avoid installation of vpp-api-python - exec_cmd_no_error( - node, u"rm -f {vpp_pkg_dir}vpp-api-python.deb", - timeout=120, sudo=True - ) - exec_cmd_no_error( - node, f"dpkg -i --force-all {vpp_pkg_dir}*.deb", - timeout=120, sudo=True, message=message - ) - exec_cmd_no_error(node, u"dpkg -l | grep vpp", sudo=True) - if DUTSetup.running_in_container(node): - DUTSetup.restart_service(node, Constants.VPP_UNIT) - else: - exec_cmd_no_error( - node, u"yum -y remove '*vpp*' || true", - timeout=120, sudo=True - ) - # workaround to avoid installation of vpp-api-python - exec_cmd_no_error( - node, u"rm -f {vpp_pkg_dir}vpp-api-python.rpm", - timeout=120, sudo=True - ) - exec_cmd_no_error( - node, f"rpm -ivh {vpp_pkg_dir}*.rpm", - timeout=120, sudo=True, message=message - ) - exec_cmd_no_error(node, u"rpm -qai *vpp*", sudo=True) - DUTSetup.restart_service(node, Constants.VPP_UNIT) - @staticmethod def running_in_container(node): """This method tests if topology node is running inside container. @@ -674,18 +578,15 @@ class DUTSetup: to detect. :rtype: bool """ - command = u"fgrep docker /proc/1/cgroup" - message = u"Failed to get cgroup settings." + command = "cat /.dockerenv" try: - exec_cmd_no_error( - node, command, timeout=30, sudo=False, message=message - ) + exec_cmd_no_error(node, command, timeout=30) except RuntimeError: return False return True @staticmethod - def get_docker_mergeddir(node, uuid): + def get_docker_mergeddir(node, uuid=None): """Get Docker overlay for MergedDir diff. :param node: DUT node. @@ -696,184 +597,113 @@ class DUTSetup: :rtype: str :raises RuntimeError: If getting output failed. """ - command = f"docker inspect " \ + if not uuid: + command = 'fgrep "hostname" /proc/self/mountinfo | cut -f 4 -d" "' + message = "Failed to get UUID!" + stdout, _ = exec_cmd_no_error(node, command, message=message) + uuid = stdout.split(sep="/")[-2] + command = ( + f"docker inspect " f"--format='{{{{.GraphDriver.Data.MergedDir}}}}' {uuid}" + ) message = f"Failed to get directory of {uuid} on host {node[u'host']}" stdout, _ = exec_cmd_no_error(node, command, sudo=True, message=message) return stdout.strip() @staticmethod - def get_huge_page_size(node): - """Get default size of huge pages in system. + def get_hugepages_info(node, hugesize=None): + """Get number of huge pages in system. :param node: Node in the topology. + :param hugesize: Size of hugepages. Default system huge size if None. :type node: dict - :returns: Default size of free huge pages in system. - :rtype: int - :raises RuntimeError: If reading failed for three times. 
-        """
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                u"grep Hugepagesize /proc/meminfo | awk '{ print $2 }'"
-            )
-            if ret_code == 0:
-                try:
-                    huge_size = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading huge page size information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting huge page size information failed.")
-        return huge_size
-
-    @staticmethod
-    def get_huge_page_free(node, huge_size):
-        """Get number of free huge pages in system.
-
-        :param node: Node in the topology.
-        :param huge_size: Size of hugepages.
-        :type node: dict
-        :type huge_size: int
-        :returns: Number of free huge pages in system.
-        :rtype: int
-        :raises RuntimeError: If reading failed for three times.
-        """
-        # TODO: add numa aware option
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
-                f"free_hugepages"
-            )
-            if ret_code == 0:
-                try:
-                    huge_free = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading free huge pages information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting free huge pages information failed.")
-        return huge_free
-
-    @staticmethod
-    def get_huge_page_total(node, huge_size):
-        """Get total number of huge pages in system.
-
-        :param node: Node in the topology.
-        :param huge_size: Size of hugepages.
-        :type node: dict
-        :type huge_size: int
-        :returns: Total number of huge pages in system.
-        :rtype: int
-        :raises RuntimeError: If reading failed for three times.
+        :type hugesize: int
+        :returns: Huge page counters.
+        :rtype: dict
+        :raises RuntimeError: If reading failed.
         """
-        # TODO: add numa aware option
-        ssh = SSH()
-        ssh.connect(node)
-
-        for _ in range(3):
-            ret_code, stdout, _ = ssh.exec_command_sudo(
-                f"cat /sys/kernel/mm/hugepages/hugepages-{huge_size}kB/"
-                f"nr_hugepages"
-            )
-            if ret_code == 0:
-                try:
-                    huge_total = int(stdout)
-                except ValueError:
-                    logger.trace(u"Reading total huge pages information failed")
-                else:
-                    break
-        else:
-            raise RuntimeError(u"Getting total huge pages information failed.")
-        return huge_total
+        if not hugesize:
+            hugesize = "$(grep Hugepagesize /proc/meminfo | awk '{ print $2 }')"
+        command = f"cat /sys/kernel/mm/hugepages/hugepages-{hugesize}kB/*"
+        stdout, _ = exec_cmd_no_error(node, command)
+        try:
+            line = stdout.splitlines()
+            return {
+                "free_hugepages": int(line[0]),
+                "nr_hugepages": int(line[1]),
+                "nr_hugepages_mempolicy": int(line[2]),
+                "nr_overcommit_hugepages": int(line[3]),
+                "resv_hugepages": int(line[4]),
+                "surplus_hugepages": int(line[5])
+            }
+        except ValueError:
+            logger.trace(u"Reading huge pages information failed!")
 
     @staticmethod
-    def check_huge_page(node, huge_mnt, mem_size, allocate=False):
+    def check_huge_page(
+            node, huge_mnt, mem_size, hugesize=2048, allocate=False):
         """Check if there are enough HugePages in system. If allocate is set
         to True, try to allocate more HugePages.
 
         :param node: Node in the topology.
         :param huge_mnt: HugePage mount point.
         :param mem_size: Requested memory in MB.
+        :param hugesize: HugePage size in KB.
         :param allocate: Whether to allocate more memory if not enough.
         :type node: dict
         :type huge_mnt: str
-        :type mem_size: str
+        :type mem_size: int
+        :type hugesize: int
         :type allocate: bool
        :raises RuntimeError: Mounting hugetlbfs failed or not enough
            HugePages or increasing map count failed.
        """
-        # TODO: split function into smaller parts.
-        ssh = SSH()
-        ssh.connect(node)
-
-        # Get huge pages information
-        huge_size = DUTSetup.get_huge_page_size(node)
-        huge_free = DUTSetup.get_huge_page_free(node, huge_size)
-        huge_total = DUTSetup.get_huge_page_total(node, huge_size)
-
-        # Check if memory requested is available on
-        mem_size = int(mem_size)
-        if (mem_size * 1024) > (huge_free * huge_size):
-            # If we want to allocate hugepage dynamically
+        # Get huge pages information.
+        hugepages = DUTSetup.get_hugepages_info(node, hugesize=hugesize)
+
+        # Check if hugepages requested are available on node.
+        if hugepages[u"nr_overcommit_hugepages"]:
+            # If overcommit is used, we need to know how many additional pages
+            # we can allocate.
+            huge_available = hugepages[u"nr_overcommit_hugepages"] - \
+                hugepages[u"surplus_hugepages"]
+        else:
+            # Fall back to free_hugepages, which was used for detection before.
+            huge_available = hugepages[u"free_hugepages"]
+
+        if ((mem_size * 1024) // hugesize) > huge_available:
+            # If we want to allocate hugepages dynamically.
             if allocate:
-                mem_needed = (mem_size * 1024) - (huge_free * huge_size)
-                huge_to_allocate = ((mem_needed / huge_size) * 2) + huge_total
-                max_map_count = huge_to_allocate*4
-                # Increase maximum number of memory map areas a process may have
-                ret_code, _, _ = ssh.exec_command_sudo(
+                huge_needed = ((mem_size * 1024) // hugesize) - huge_available
+                huge_to_allocate = huge_needed + hugepages[u"nr_hugepages"]
+                max_map_count = huge_to_allocate * 4
+                # Check if a huge pages mount point exists.
+                try:
+                    exec_cmd_no_error(node, u"fgrep 'hugetlbfs' /proc/mounts")
+                except RuntimeError:
+                    exec_cmd_no_error(node, f"mkdir -p {huge_mnt}", sudo=True)
+                    exec_cmd_no_error(
+                        node,
+                        f"mount -t hugetlbfs -o pagesize={hugesize}k none "
+                        f"{huge_mnt}",
+                        sudo=True)
+                # Increase maximum number of memory map areas for process.
+                exec_cmd_no_error(
+                    node,
                     f"echo \"{max_map_count}\" | "
-                    f"sudo tee /proc/sys/vm/max_map_count"
+                    f"sudo tee /proc/sys/vm/max_map_count",
+                    message=f"Increase map count failed on {node[u'host']}!"
                 )
-                if int(ret_code) != 0:
-                    raise RuntimeError(
-                        f"Increase map count failed on {node[u'host']}"
-                    )
-                # Increase hugepage count
-                ret_code, _, _ = ssh.exec_command_sudo(
+                # Increase hugepage count.
+                exec_cmd_no_error(
+                    node,
                     f"echo \"{huge_to_allocate}\" | "
-                    f"sudo tee /proc/sys/vm/nr_hugepages"
+                    f"sudo tee /proc/sys/vm/nr_hugepages",
+                    message=f"Failed to allocate huge pages on {node[u'host']}!"
                 )
-                if int(ret_code) != 0:
-                    raise RuntimeError(
-                        f"Mount huge pages failed on {node[u'host']}"
-                    )
-            # If we do not want to allocate dynamically end with error
+            # If we do not want to allocate dynamically, end with an error.
             else:
                 raise RuntimeError(
-                    f"Not enough free huge pages: {huge_free}, "
-                    f"{huge_free * huge_size} MB"
+                    f"Not enough available huge pages: {huge_available}!"
                 )
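A note on the rewritten get_service_logs: filtering the journal by _SYSTEMD_INVOCATION_ID returns exactly the entries logged since the unit last started, replacing the fragile ActiveEnterTimestamp parsing the old code did. Below is a minimal local sketch of the same technique, assuming a systemd host and using "vpp" only as an example unit name; the helper function is illustrative, not part of the library.

import subprocess

def logs_for_current_invocation(unit):
    """Fetch journal entries belonging to the unit's running invocation."""
    # systemd assigns a fresh InvocationID every time a unit (re)starts.
    invocation_id = subprocess.check_output(
        ["systemctl", "show", "-p", "InvocationID", "--value", unit],
        text=True,
    ).strip()
    # Matching the journal field avoids parsing timestamps entirely.
    return subprocess.check_output(
        ["journalctl", "--no-pager", f"_SYSTEMD_INVOCATION_ID={invocation_id}"],
        text=True,
    )

if __name__ == "__main__":
    print(logs_for_current_invocation("vpp"))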
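kill_program escalates gently: it checks with pgrep, sends SIGTERM via pkill, polls five times at one-second intervals, and only then resorts to SIGKILL. A stdlib-only sketch of that escalation, run locally rather than over SSH (the grace period mirrors the library code; the helper name is an assumption):

import subprocess
from time import sleep

def kill_gracefully(program, attempts=5):
    """Send SIGTERM, poll for exit, and fall back to SIGKILL."""
    # pgrep exits non-zero when no process matches the name.
    if subprocess.run(["pgrep", "-c", program], capture_output=True).returncode:
        return  # nothing to kill
    subprocess.run(["pkill", program])  # polite SIGTERM first
    for _ in range(attempts):
        if subprocess.run(
                ["pgrep", "-c", program], capture_output=True).returncode:
            return  # the process exited within the grace period
        sleep(1)
    subprocess.run(["pkill", "-9", program])  # SIGKILL as the last resort

# Example (would actually kill matching processes):
# kill_gracefully("vpp")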
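set_sriov_numvfs now probes sysfs for the sriov_numvfs attribute before writing, so requesting zero VFs on a device without SR-IOV support becomes a no-op instead of an error. The probe reduces to a file-existence test; a local pathlib sketch under that assumption (the PCI address is a placeholder, and writing the attribute requires root):

from pathlib import Path

def set_numvfs(pf_pci_addr, numvfs, path="devices"):
    """Set the VF count, tolerating non-SR-IOV devices when numvfs is 0."""
    attr = Path(f"/sys/bus/pci/{path}/{pf_pci_addr}/sriov_numvfs")
    if not attr.is_file():
        if numvfs == 0:
            return  # nothing to remove on a device without SR-IOV
        raise RuntimeError(f"{pf_pci_addr} does not support SR-IOV")
    attr.write_text(f"{numvfs}\n")  # same effect as echo ... | tee

# Example (requires root): set_numvfs("0000:00:05.0", 0)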
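get_pci_dev_driver shrank from parsing lspci output to reading the sysfs driver symlink: if /sys/bus/pci/devices/<addr>/driver exists, the basename of its target is the driver name; if it does not, the device is unbound. A local equivalent, reusing the 0000:00:05.0 address from the removed docstring example as a placeholder:

from pathlib import Path

def pci_dev_driver(pci_addr):
    """Return the driver a PCI device is bound to, or None if unbound."""
    link = Path(f"/sys/bus/pci/devices/{pci_addr}/driver")
    if not link.is_dir():  # symlink absent: not bound to any driver
        return None
    return link.resolve().name  # e.g. "virtio-pci"

if __name__ == "__main__":
    print(pci_dev_driver("0000:00:05.0"))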
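Finally, the reworked check_huge_page counts pages rather than kilobytes: a request of mem_size MB with hugesize KB pages needs (mem_size * 1024) // hugesize pages, and any shortfall beyond huge_available is added to the current nr_hugepages before being written back. A worked sketch of that arithmetic, with made-up counter values:

def new_nr_hugepages(mem_size_mb, hugesize_kb, available, nr_hugepages):
    """Return the nr_hugepages value that satisfies the request."""
    needed = (mem_size_mb * 1024) // hugesize_kb  # pages the request consumes
    if needed <= available:
        return nr_hugepages  # already enough headroom
    return nr_hugepages + (needed - available)

# 4096 MB of 2048 kB pages -> 2048 pages; with 512 pages available and
# 1024 currently configured, nr_hugepages must grow to 2560.
print(new_nr_hugepages(4096, 2048, 512, 1024))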