FIX: Force kill QEMU in case of failed initialization

[csit.git] / resources / libraries / python / QemuUtils.py
diff --git a/resources/libraries/python/QemuUtils.py b/resources/libraries/python/QemuUtils.py

index bc85435..428599b 100644 (file)
--- a/resources/libraries/python/QemuUtils.py
+++ b/resources/libraries/python/QemuUtils.py
@@ -15,11 +15,10 @@
  
  from time import time, sleep
  import json
-import re
  
  from robot.api import logger
  
-from resources.libraries.python.ssh import SSH
+from resources.libraries.python.ssh import SSH, SSHTimeout
  from resources.libraries.python.constants import Constants
  from resources.libraries.python.topology import NodeType
  
@@ -27,24 +26,28 @@ from resources.libraries.python.topology import NodeType
  class QemuUtils(object):
      """QEMU utilities."""
  
-    __QEMU_BIN = '/opt/qemu/bin/qemu-system-x86_64'
-    # QEMU Machine Protocol socket
-    __QMP_SOCK = '/tmp/qmp.sock'
-    # QEMU Guest Agent socket
-    __QGA_SOCK = '/tmp/qga.sock'
-
-    def __init__(self):
+    def __init__(self, qemu_id=1):
+        self._qemu_id = qemu_id
+        # Path to QEMU binary
+        self._qemu_bin = '/usr/bin/qemu-system-x86_64'
+        # QEMU Machine Protocol socket
+        self._qmp_sock = '/tmp/qmp{0}.sock'.format(self._qemu_id)
+        # QEMU Guest Agent socket
+        self._qga_sock = '/tmp/qga{0}.sock'.format(self._qemu_id)
+        # QEMU PID file
+        self._pid_file = '/tmp/qemu{0}.pid'.format(self._qemu_id)
          self._qemu_opt = {}
          # Default 1 CPU.
          self._qemu_opt['smp'] = '-smp 1,sockets=1,cores=1,threads=1'
          # Daemonize the QEMU process after initialization. Default one
          # management interface.
          self._qemu_opt['options'] = '-cpu host -daemonize -enable-kvm ' \
-            '-machine pc-1.0,accel=kvm,usb=off,mem-merge=off ' \
-            '-net nic,macaddr=52:54:00:00:02:01 -balloon none'
-        self._qemu_opt['ssh_fwd_port'] = 10022
+            '-machine pc,accel=kvm,usb=off,mem-merge=off ' \
+            '-net nic,macaddr=52:54:00:00:{0:02x}:ff -balloon none'\
+            .format(self._qemu_id)
+        self._qemu_opt['ssh_fwd_port'] = 10021 + qemu_id
          # Default serial console port
-        self._qemu_opt['serial_port'] = 4556
+        self._qemu_opt['serial_port'] = 4555 + qemu_id
          # Default 512MB virtual RAM
          self._qemu_opt['mem_size'] = 512
          # Default huge page mount point, required for Vhost-user interfaces.
@@ -56,18 +59,28 @@ class QemuUtils(object):
          # VM node info dict
          self._vm_info = {
              'type': NodeType.VM,
-            'port': 10022,
+            'port': self._qemu_opt['ssh_fwd_port'],
              'username': 'cisco',
              'password': 'cisco',
              'interfaces': {},
          }
+        # Virtio queue count
+        self._qemu_opt['queues'] = 1
          self._vhost_id = 0
          self._ssh = None
          self._node = None
-        self._socks = [self.__QMP_SOCK, self.__QGA_SOCK]
+        self._socks = [self._qmp_sock, self._qga_sock]
+
+    def qemu_set_bin(self, path):
+        """Set binary path for QEMU.
+
+        :param path: Absolute path in filesystem.
+        :type path: str
+        """
+        self._qemu_bin = path
  
      def qemu_set_smp(self, cpus, cores, threads, sockets):
-        """Set SMP option for QEMU
+        """Set SMP option for QEMU.
  
          :param cpus: Number of CPUs.
          :param cores: Number of CPU cores on one socket.
@@ -141,14 +154,29 @@ class QemuUtils(object):
              raise ValueError('Host CPU count must match Qemu Thread count')
  
          for qemu_cpu, host_cpu in zip(qemu_cpus, host_cpus):
-            cmd = 'taskset -p {0} {1}'.format(hex(1 << int(host_cpu)),
-                                              qemu_cpu['thread_id'])
+            cmd = 'taskset -pc {0} {1}'.format(host_cpu, qemu_cpu['thread_id'])
              (ret_code, _, stderr) = self._ssh.exec_command_sudo(cmd)
              if int(ret_code) != 0:
                  logger.debug('Set affinity failed {0}'.format(stderr))
                  raise RuntimeError('Set affinity failed on {0}'.format(
                      self._node['host']))
  
+    def qemu_set_scheduler_policy(self):
+        """Set scheduler policy to SCHED_RR with priority 1 for all Qemu CPU
+        processes.
+
+       :raises RuntimeError: Set scheduler policy failed.
+        """
+        qemu_cpus = self._qemu_qmp_exec('query-cpus')['return']
+
+        for qemu_cpu in qemu_cpus:
+            cmd = 'chrt -r -p 1 {0}'.format(qemu_cpu['thread_id'])
+            (ret_code, _, stderr) = self._ssh.exec_command_sudo(cmd)
+            if int(ret_code) != 0:
+                logger.debug('Set SCHED_RR failed {0}'.format(stderr))
+                raise RuntimeError('Set SCHED_RR failed on {0}'.format(
+                    self._node['host']))
+
      def qemu_set_node(self, node):
          """Set node to run QEMU on.
  
@@ -166,7 +194,7 @@ class QemuUtils(object):
          :param socket: Path of the unix socket.
          :param server: If True the socket shall be a listening socket.
          :param mac: Vhost-user interface MAC address (optional, otherwise is
-            used autogenerated MAC 52:54:00:00:04:xx).
+            used auto-generated MAC 52:54:00:00:xx:yy).
          :type socket: str
          :type server: bool
          :type mac: str
@@ -179,15 +207,17 @@ class QemuUtils(object):
              chardev += ',server'
          self._qemu_opt['options'] += chardev
          # Create Vhost-user network backend.
-        netdev = ' -netdev vhost-user,id=vhost{0},chardev=char{0}'.format(
-            self._vhost_id)
+        netdev = (' -netdev vhost-user,id=vhost{0},chardev=char{0},queues={1}'
+                  .format(self._vhost_id, self._qemu_opt['queues']))
          self._qemu_opt['options'] += netdev
-        # If MAC is not specified use autogenerated 52:54:00:00:04:<vhost_id>
-        # e.g. vhost1 MAC is 52:54:00:00:04:01
+        # If MAC is not specified use auto-generated MAC address based on
+        # template 52:54:00:00:<qemu_id>:<vhost_id>, e.g. vhost1 MAC of QEMU
+        #  with ID 1 is 52:54:00:00:01:01
          if mac is None:
-            mac = '52:54:00:00:04:{0:02x}'.format(self._vhost_id)
-        extend_options = 'csum=off,gso=off,guest_tso4=off,guest_tso6=off,'\
-            'guest_ecn=off,mrg_rxbuf=off'
+            mac = '52:54:00:00:{0:02x}:{1:02x}'.\
+                format(self._qemu_id, self._vhost_id)
+        extend_options = 'mq=on,csum=off,gso=off,guest_tso4=off,'\
+            'guest_tso6=off,guest_ecn=off,mrg_rxbuf=off'
          # Create Virtio network device.
          device = ' -device virtio-net-pci,netdev=vhost{0},mac={1},{2}'.format(
              self._vhost_id, mac, extend_options)
@@ -211,14 +241,15 @@ class QemuUtils(object):
              response will contain the "error" keyword instead of "return".
          """
          # To enter command mode, the qmp_capabilities command must be issued.
-        qmp_cmd = 'echo "{ \\"execute\\": \\"qmp_capabilities\\" }' + \
-            '{ \\"execute\\": \\"' + cmd + '\\" }" | sudo -S nc -U ' + \
-            self.__QMP_SOCK
+        qmp_cmd = 'echo "{ \\"execute\\": \\"qmp_capabilities\\" }' \
+                  '{ \\"execute\\": \\"' + cmd + \
+                  '\\" }" | sudo -S socat - UNIX-CONNECT:' + self._qmp_sock
+
          (ret_code, stdout, stderr) = self._ssh.exec_command(qmp_cmd)
-        if 0 != int(ret_code):
+        if int(ret_code) != 0:
              logger.debug('QMP execute failed {0}'.format(stderr))
-            raise RuntimeError('QMP execute "{0}" failed on {1}'.format(cmd,
-                self._node['host']))
+            raise RuntimeError('QMP execute "{0}"'
+                               ' failed on {1}'.format(cmd, self._node['host']))
          logger.trace(stdout)
          # Skip capabilities negotiation messages.
          out_list = stdout.splitlines()
@@ -230,13 +261,15 @@ class QemuUtils(object):
      def _qemu_qga_flush(self):
          """Flush the QGA parser state
          """
-        qga_cmd = 'printf "\xFF" | sudo -S nc ' \
-            '-q 1 -U ' + self.__QGA_SOCK
+        qga_cmd = '(printf "\xFF"; sleep 1) | sudo -S socat - UNIX-CONNECT:' + \
+                  self._qga_sock
+        #TODO: probably need something else
          (ret_code, stdout, stderr) = self._ssh.exec_command(qga_cmd)
-        if 0 != int(ret_code):
+        if int(ret_code) != 0:
              logger.debug('QGA execute failed {0}'.format(stderr))
-            raise RuntimeError('QGA execute "{0}" failed on {1}'.format(cmd,
-                self._node['host']))
+            raise RuntimeError('QGA execute "{0}" '
+                               'failed on {1}'.format(qga_cmd,
+                                                      self._node['host']))
          logger.trace(stdout)
          if not stdout:
              return {}
@@ -250,33 +283,53 @@ class QemuUtils(object):
          :param cmd: QGA command to execute.
          :type cmd: str
          """
-        qga_cmd = 'echo "{ \\"execute\\": \\"' + cmd + '\\" }" | sudo -S nc ' \
-            '-q 1 -U ' + self.__QGA_SOCK
+        qga_cmd = '(echo "{ \\"execute\\": \\"' + \
+                  cmd + \
+                  '\\" }"; sleep 1) | sudo -S socat - UNIX-CONNECT:' + \
+                  self._qga_sock
          (ret_code, stdout, stderr) = self._ssh.exec_command(qga_cmd)
-        if 0 != int(ret_code):
+        if int(ret_code) != 0:
              logger.debug('QGA execute failed {0}'.format(stderr))
-            raise RuntimeError('QGA execute "{0}" failed on {1}'.format(cmd,
-                self._node['host']))
+            raise RuntimeError('QGA execute "{0}"'
+                               ' failed on {1}'.format(cmd, self._node['host']))
          logger.trace(stdout)
          if not stdout:
              return {}
          return json.loads(stdout.split('\n', 1)[0])
  
-    def _wait_until_vm_boot(self, timeout=300):
+    def _wait_until_vm_boot(self, timeout=60):
          """Wait until QEMU VM is booted.
  
-        Ping QEMU guest agent each 5s until VM booted or timeout.
+        First try to flush qga until there is output.
+        Then ping QEMU guest agent each 5s until VM booted or timeout.
  
-        :param timeout: Waiting timeout in seconds (optional, default 300s).
+        :param timeout: Waiting timeout in seconds (optional, default 60s).
          :type timeout: int
          """
          start = time()
-        while 1:
+        while True:
+            if time() - start > timeout:
+                raise RuntimeError('timeout, VM {0} not booted on {1}'.format(
+                    self._qemu_opt['disk_image'], self._node['host']))
+            out = None
+            try:
+                out = self._qemu_qga_flush()
+            except ValueError:
+                logger.trace('QGA qga flush unexpected output {}'.format(out))
+            # Empty output - VM not booted yet
+            if not out:
+                sleep(5)
+            else:
+                break
+        while True:
              if time() - start > timeout:
                  raise RuntimeError('timeout, VM {0} not booted on {1}'.format(
                      self._qemu_opt['disk_image'], self._node['host']))
-            self._qemu_qga_flush()
-            out = self._qemu_qga_exec('guest-ping')
+            out = None
+            try:
+                out = self._qemu_qga_exec('guest-ping')
+            except ValueError:
+                logger.trace('QGA guest-ping unexpected output {}'.format(out))
              # Empty output - VM not booted yet
              if not out:
                  sleep(5)
@@ -287,8 +340,10 @@ class QemuUtils(object):
              elif out.get('error') is not None:
                  sleep(5)
              else:
-                raise RuntimeError('QGA guest-ping unexpected output {}'.format(
-                    out))
+                # If there is an unexpected output from QGA guest-info, try
+                # again until timeout.
+                logger.trace('QGA guest-ping unexpected output {}'.format(out))
+
          logger.trace('VM {0} booted on {1}'.format(self._qemu_opt['disk_image'],
                                                     self._node['host']))
  
@@ -321,17 +376,12 @@ class QemuUtils(object):
          """Huge page check."""
          huge_mnt = self._qemu_opt.get('huge_mnt')
          mem_size = self._qemu_opt.get('mem_size')
-        # Check size of free huge pages
-        (_, output, _) = self._ssh.exec_command('grep Huge /proc/meminfo')
-        regex = re.compile(r'HugePages_Free:\s+(\d+)')
-        match = regex.search(output)
-        huge_free = int(match.group(1))
-        regex = re.compile(r'HugePages_Total:\s+(\d+)')
-        match = regex.search(output)
-        huge_total = int(match.group(1))
-        regex = re.compile(r'Hugepagesize:\s+(\d+)')
-        match = regex.search(output)
-        huge_size = int(match.group(1))
+
+        # Get huge pages information
+        huge_size = self._get_huge_page_size()
+        huge_free = self._get_huge_page_free(huge_size)
+        huge_total = self._get_huge_page_total(huge_size)
+
          # Check if memory reqested by qemu is available on host
          if (mem_size * 1024) > (huge_free * huge_size):
              # If we want to allocate hugepage dynamically
@@ -353,8 +403,10 @@ class QemuUtils(object):
                          self._node['host']))
              # If we do not want to allocate dynamicaly end with error
              else:
-                raise RuntimeError('Not enough free huge pages: {0}, '
-                    '{1} MB'.format(huge_free, huge_free * huge_size))
+                raise RuntimeError(
+                    'Not enough free huge pages: {0}, '
+                    '{1} MB'.format(huge_free, huge_free * huge_size)
+                )
          # Check if huge pages mount point exist
          has_huge_mnt = False
          (_, output, _) = self._ssh.exec_command('cat /proc/mounts')
@@ -381,6 +433,80 @@ class QemuUtils(object):
                  raise RuntimeError('Mount huge pages failed on {0}'.format(
                      self._node['host']))
  
+    def _get_huge_page_size(self):
+        """Get default size of huge pages in system.
+
+        :returns: Default size of free huge pages in system.
+        :rtype: int
+        :raises: RuntimeError if reading failed for three times.
+        """
+        # TODO: remove to dedicated library
+        cmd_huge_size = "grep Hugepagesize /proc/meminfo | awk '{ print $2 }'"
+        for _ in range(3):
+            (ret, out, _) = self._ssh.exec_command_sudo(cmd_huge_size)
+            if ret == 0:
+                try:
+                    huge_size = int(out)
+                except ValueError:
+                    logger.trace('Reading huge page size information failed')
+                else:
+                    break
+        else:
+            raise RuntimeError('Getting huge page size information failed.')
+        return huge_size
+
+    def _get_huge_page_free(self, huge_size):
+        """Get total number of huge pages in system.
+
+        :param huge_size: Size of hugepages.
+        :type huge_size: int
+        :returns: Number of free huge pages in system.
+        :rtype: int
+        :raises: RuntimeError if reading failed for three times.
+        """
+        # TODO: add numa aware option
+        # TODO: remove to dedicated library
+        cmd_huge_free = 'cat /sys/kernel/mm/hugepages/hugepages-{0}kB/'\
+            'free_hugepages'.format(huge_size)
+        for _ in range(3):
+            (ret, out, _) = self._ssh.exec_command_sudo(cmd_huge_free)
+            if ret == 0:
+                try:
+                    huge_free = int(out)
+                except ValueError:
+                    logger.trace('Reading free huge pages information failed')
+                else:
+                    break
+        else:
+            raise RuntimeError('Getting free huge pages information failed.')
+        return huge_free
+
+    def _get_huge_page_total(self, huge_size):
+        """Get total number of huge pages in system.
+
+        :param huge_size: Size of hugepages.
+        :type huge_size: int
+        :returns: Total number of huge pages in system.
+        :rtype: int
+        :raises: RuntimeError if reading failed for three times.
+        """
+        # TODO: add numa aware option
+        # TODO: remove to dedicated library
+        cmd_huge_total = 'cat /sys/kernel/mm/hugepages/hugepages-{0}kB/'\
+            'nr_hugepages'.format(huge_size)
+        for _ in range(3):
+            (ret, out, _) = self._ssh.exec_command_sudo(cmd_huge_total)
+            if ret == 0:
+                try:
+                    huge_total = int(out)
+                except ValueError:
+                    logger.trace('Reading total huge pages information failed')
+                else:
+                    break
+        else:
+            raise RuntimeError('Getting total huge pages information failed.')
+        return huge_total
+
      def qemu_start(self):
          """Start QEMU and wait until VM boot.
  
@@ -395,36 +521,46 @@ class QemuUtils(object):
          # Memory and huge pages
          mem = '-object memory-backend-file,id=mem,size={0}M,mem-path={1},' \
              'share=on -m {0} -numa node,memdev=mem'.format(
-            self._qemu_opt.get('mem_size'), self._qemu_opt.get('huge_mnt'))
+                self._qemu_opt.get('mem_size'), self._qemu_opt.get('huge_mnt'))
  
-        # By default check only if hugepages are availbale.
+        # By default check only if hugepages are available.
          # If 'huge_allocate' is set to true try to allocate as well.
          self._huge_page_check(allocate=self._qemu_opt.get('huge_allocate'))
  
+        # Disk option
+        drive = '-drive file={0},format=raw,cache=none,if=virtio'.format(
+            self._qemu_opt.get('disk_image'))
          # Setup QMP via unix socket
-        qmp = '-qmp unix:{0},server,nowait'.format(self.__QMP_SOCK)
+        qmp = '-qmp unix:{0},server,nowait'.format(self._qmp_sock)
          # Setup serial console
          serial = '-chardev socket,host=127.0.0.1,port={0},id=gnc0,server,' \
              'nowait -device isa-serial,chardev=gnc0'.format(
-            self._qemu_opt.get('serial_port'))
+                self._qemu_opt.get('serial_port'))
          # Setup QGA via chardev (unix socket) and isa-serial channel
-        qga = '-chardev socket,path=/tmp/qga.sock,server,nowait,id=qga0 ' \
-            '-device isa-serial,chardev=qga0'
+        qga = '-chardev socket,path={0},server,nowait,id=qga0 ' \
+            '-device isa-serial,chardev=qga0'.format(self._qga_sock)
          # Graphic setup
          graphic = '-monitor none -display none -vga none'
+        # PID file
+        pid = '-pidfile {}'.format(self._pid_file)
+
          # Run QEMU
-        cmd = '{0} {1} {2} {3} {4} -hda {5} {6} {7} {8} {9}'.format(
-            self.__QEMU_BIN, self._qemu_opt.get('smp'), mem, ssh_fwd,
+        cmd = '{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}'.format(
+            self._qemu_bin, self._qemu_opt.get('smp'), mem, ssh_fwd,
              self._qemu_opt.get('options'),
-            self._qemu_opt.get('disk_image'), qmp, serial, qga, graphic)
-        (ret_code, _, stderr) = self._ssh.exec_command_sudo(cmd, timeout=300)
-        if int(ret_code) != 0:
-            logger.debug('QEMU start failed {0}'.format(stderr))
-            raise RuntimeError('QEMU start failed on {0}'.format(
-                self._node['host']))
-        logger.trace('QEMU running')
-        # Wait until VM boot
-        self._wait_until_vm_boot()
+            drive, qmp, serial, qga, graphic, pid)
+        try:
+            (ret_code, _, _) = self._ssh.exec_command_sudo(cmd, timeout=300)
+            if int(ret_code) != 0:
+                raise RuntimeError('QEMU start failed on {0}'.format(
+                    self._node['host']))
+            # Wait until VM boot
+            self._wait_until_vm_boot()
+        except (RuntimeError, SSHTimeout):
+            self.qemu_kill_all()
+            self.qemu_clear_socks()
+            raise
+        logger.trace('QEMU started successfully.')
          # Update interface names in VM node dict
          self._update_vm_interfaces()
          # Return VM node dict
@@ -443,22 +579,39 @@ class QemuUtils(object):
          out = self._qemu_qmp_exec('system_powerdown')
          err = out.get('error')
          if err is not None:
-            raise RuntimeError('QEMU system powerdown failed on {0}, '
-                'error: {1}'.format(self._node['host'], json.dumps(err)))
+            raise RuntimeError(
+                'QEMU system powerdown failed on {0}, '
+                'error: {1}'.format(self._node['host'], json.dumps(err))
+            )
  
      def qemu_system_reset(self):
          """Reset the system."""
          out = self._qemu_qmp_exec('system_reset')
          err = out.get('error')
          if err is not None:
-            raise RuntimeError('QEMU system reset failed on {0}, '
+            raise RuntimeError(
+                'QEMU system reset failed on {0}, '
                  'error: {1}'.format(self._node['host'], json.dumps(err)))
  
      def qemu_kill(self):
          """Kill qemu process."""
-        # TODO: add PID storage so that we can kill specific PID
          # Note: in QEMU start phase there are 3 QEMU processes because we
          # daemonize QEMU
+        self._ssh.exec_command_sudo('chmod +r {}'.format(self._pid_file))
+        self._ssh.exec_command_sudo('kill -SIGKILL $(cat {})'
+                                    .format(self._pid_file))
+        # Delete PID file
+        cmd = 'rm -f {}'.format(self._pid_file)
+        self._ssh.exec_command_sudo(cmd)
+
+    def qemu_kill_all(self, node=None):
+        """Kill all qemu processes on DUT node if specified.
+
+        :param node: Node to kill all QEMU processes on.
+        :type node: dict
+        """
+        if node:
+            self.qemu_set_node(node)
          self._ssh.exec_command_sudo('pkill -SIGKILL qemu')
  
      def qemu_clear_socks(self):
@@ -501,23 +654,36 @@ class QemuUtils(object):
              return ret.get('status')
          else:
              err = out.get('error')
-            raise RuntimeError('QEMU query-status failed on {0}, '
+            raise RuntimeError(
+                'QEMU query-status failed on {0}, '
                  'error: {1}'.format(self._node['host'], json.dumps(err)))
  
      @staticmethod
-    def build_qemu(node):
+    def build_qemu(node, force_install=False, apply_patch=False):
          """Build QEMU from sources.
  
          :param node: Node to build QEMU on.
+        :param force_install: If True, then remove previous build.
+        :param apply_patch: If True, then apply patches from qemu_patches dir.
          :type node: dict
+        :type force_install: bool
+        :type apply_patch: bool
+        :raises: RuntimeError if building QEMU failed.
          """
          ssh = SSH()
          ssh.connect(node)
  
+        directory = ' --directory={0}'.format(Constants.QEMU_INSTALL_DIR)
+        version = ' --version={0}'.format(Constants.QEMU_INSTALL_VERSION)
+        force = ' --force' if force_install else ''
+        patch = ' --patch' if apply_patch else ''
+
          (ret_code, stdout, stderr) = \
-            ssh.exec_command('sudo -Sn bash {0}/{1}/qemu_build.sh'.format(
-                Constants.REMOTE_FW_DIR, Constants.RESOURCES_LIB_SH), 1000)
-        logger.trace(stdout)
-        if 0 != int(ret_code):
-            logger.debug('QEMU build failed {0}'.format(stderr))
+            ssh.exec_command(
+                "sudo -E sh -c '{0}/{1}/qemu_build.sh{2}{3}{4}{5}'"\
+                .format(Constants.REMOTE_FW_DIR, Constants.RESOURCES_LIB_SH,
+                        version, directory, force, patch), 1000)
+
+        if int(ret_code) != 0:
+            logger.debug('QEMU build failed {0}'.format(stdout + stderr))
              raise RuntimeError('QEMU build failed on {0}'.format(node['host']))