CSIT-1104: Trending: Speed-up plots generation

[csit.git] / resources / tools / presentation / input_data_parser.py
diff --git a/resources/tools/presentation/input_data_parser.py b/resources/tools/presentation/input_data_parser.py

index e12e2fb..beec34c 100644 (file)
--- a/resources/tools/presentation/input_data_parser.py
+++ b/resources/tools/presentation/input_data_parser.py
@@ -18,10 +18,11 @@
  - provide access to the data.
  """
  
  - provide access to the data.
  """
  
+import multiprocessing
+import os
  import re
  import pandas as pd
  import logging
  import re
  import pandas as pd
  import logging
-import xml.etree.ElementTree as ET
  
  from robot.api import ExecutionResult, ResultVisitor
  from robot import errors
  
  from robot.api import ExecutionResult, ResultVisitor
  from robot import errors
@@ -30,6 +31,7 @@ from string import replace
  from os import remove
  
  from input_data_files import download_and_unzip_data_file
  from os import remove
  
  from input_data_files import download_and_unzip_data_file
+from utils import Worker
  
  
  class ExecutionChecker(ResultVisitor):
  
  
  class ExecutionChecker(ResultVisitor):
@@ -251,6 +253,7 @@ class ExecutionChecker(ResultVisitor):
              self._version = str(re.search(self.REGEX_VERSION, msg.message).
                                  group(2))
              self._data["metadata"]["version"] = self._version
              self._version = str(re.search(self.REGEX_VERSION, msg.message).
                                  group(2))
              self._data["metadata"]["version"] = self._version
+            self._data["metadata"]["generated"] = msg.timestamp
              self._msg_type = None
  
      def _get_vat_history(self, msg):
              self._msg_type = None
  
      def _get_vat_history(self, msg):
@@ -697,7 +700,7 @@ class InputData(object):
          self._cfg = spec
  
          # Data store:
          self._cfg = spec
  
          # Data store:
-        self._input_data = None
+        self._input_data = pd.Series()
  
      @property
      def data(self):
  
      @property
      def data(self):
@@ -748,17 +751,16 @@ class InputData(object):
          return self.data[job][build]["tests"]
  
      @staticmethod
          return self.data[job][build]["tests"]
  
      @staticmethod
-    def _parse_tests(job, build, get_timestamp=False):
+    def _parse_tests(job, build, log):
          """Process data from robot output.xml file and return JSON structured
          data.
  
          :param job: The name of job which build output data will be processed.
          :param build: The build which output data will be processed.
          """Process data from robot output.xml file and return JSON structured
          data.
  
          :param job: The name of job which build output data will be processed.
          :param build: The build which output data will be processed.
-        :param get_timestamp: If True, timestamp is read form the xml source
-            file.
+        :param log: List of log messages.
          :type job: str
          :type build: dict
          :type job: str
          :type build: dict
-        :type get_timestamp: bool
+        :type log: list of tuples (severity, msg)
          :returns: JSON data structure.
          :rtype: dict
          """
          :returns: JSON data structure.
          :rtype: dict
          """
@@ -767,68 +769,132 @@ class InputData(object):
              "job": job,
              "build": build
          }
              "job": job,
              "build": build
          }
-        if get_timestamp:
-            tree = ET.parse(build["file-name"])
-            root = tree.getroot()
-            metadata["generated"] = root.attrib["generated"]
  
          with open(build["file-name"], 'r') as data_file:
              try:
                  result = ExecutionResult(data_file)
              except errors.DataError as err:
  
          with open(build["file-name"], 'r') as data_file:
              try:
                  result = ExecutionResult(data_file)
              except errors.DataError as err:
-                logging.error("Error occurred while parsing output.xml: {0}".
-                              format(err))
+                log.append(("ERROR", "Error occurred while parsing output.xml: "
+                                     "{0}".format(err)))
                  return None
          checker = ExecutionChecker(metadata)
          result.visit(checker)
  
          return checker.data
  
                  return None
          checker = ExecutionChecker(metadata)
          result.visit(checker)
  
          return checker.data
  
-    def download_and_parse_data(self, get_timestamp=False):
+    def _download_and_parse_build(self, pid, data_queue, job, build, repeat):
+        """Download and parse the input data file.
+
+        :param pid: PID of the process executing this method.
+        :param data_queue: Shared memory between processes. Queue which keeps
+            the result data. This data is then read by the main process and used
+            in further processing.
+        :param job: Name of the Jenkins job which generated the processed input
+            file.
+        :param build: Information about the Jenkins build which generated the
+            processed input file.
+        :param repeat: Repeat the download the specified number of times if it
+            is not successful.
+        :type pid: int
+        :type data_queue: multiprocessing.Manager().Queue()
+        :type job: str
+        :type build: dict
+        :type repeat: int
+        """
+
+        logs = list()
+
+        logging.info("  Processing the job/build: {0}: {1}".
+                     format(job, build["build"]))
+
+        logs.append(("INFO", "  Processing the job/build: {0}: {1}".
+                     format(job, build["build"])))
+
+        state = "failed"
+        success = False
+        data = None
+        do_repeat = repeat
+        while do_repeat:
+            success = download_and_unzip_data_file(self._cfg, job, build, pid,
+                                                   logs)
+            if success:
+                break
+            do_repeat -= 1
+        if not success:
+            logs.append(("ERROR", "It is not possible to download the input "
+                                  "data file from the job '{job}', build "
+                                  "'{build}', or it is damaged. Skipped.".
+                         format(job=job, build=build["build"])))
+        if success:
+            logs.append(("INFO", "  Processing data from the build '{0}' ...".
+                         format(build["build"])))
+            data = InputData._parse_tests(job, build, logs)
+            if data is None:
+                logs.append(("ERROR", "Input data file from the job '{job}', "
+                                      "build '{build}' is damaged. Skipped.".
+                             format(job=job, build=build["build"])))
+            else:
+                state = "processed"
+
+            try:
+                remove(build["file-name"])
+            except OSError as err:
+                logs.append(("ERROR", "Cannot remove the file '{0}': {1}".
+                             format(build["file-name"], err)))
+        logs.append(("INFO", "  Done."))
+
+        result = {
+            "data": data,
+            "state": state,
+            "job": job,
+            "build": build,
+            "logs": logs
+        }
+        data_queue.put(result)
+
+    def download_and_parse_data(self, repeat=1):
          """Download the input data files, parse input data from input files and
          store in pandas' Series.
  
          """Download the input data files, parse input data from input files and
          store in pandas' Series.
  
-        :param get_timestamp: If True, timestamp is read form the xml source
-            file.
-        :type get_timestamp: bool
+        :param repeat: Repeat the download the specified number of times if it
+            is not successful.
+        :type repeat: int
          """
  
          logging.info("Downloading and parsing input files ...")
  
          """
  
          logging.info("Downloading and parsing input files ...")
  
-        job_data = dict()
+        work_queue = multiprocessing.JoinableQueue()
+        manager = multiprocessing.Manager()
+        data_queue = manager.Queue()
+        cpus = multiprocessing.cpu_count()
+
+        workers = list()
+        for cpu in range(cpus):
+            worker = Worker(work_queue,
+                            data_queue,
+                            self._download_and_parse_build)
+            worker.daemon = True
+            worker.start()
+            workers.append(worker)
+            os.system("taskset -p -c {0} {1} > /dev/null 2>&1".
+                      format(cpu, worker.pid))
+
          for job, builds in self._cfg.builds.items():
          for job, builds in self._cfg.builds.items():
-            logging.info("  Processing data from the job '{0}' ...'".
-                         format(job))
-            builds_data = dict()
              for build in builds:
              for build in builds:
-                logging.info("    Processing the build '{0}'".
-                             format(build["build"]))
-                self._cfg.set_input_state(job, build["build"], "failed")
-                if not download_and_unzip_data_file(self._cfg, job, build):
-                    logging.error("It is not possible to download the input "
-                                  "data file from the job '{job}', build "
-                                  "'{build}', or it is damaged. Skipped.".
-                                  format(job=job, build=build["build"]))
-                    continue
+                work_queue.put((job, build, repeat))
  
  
-                logging.info("      Processing data from the build '{0}' ...".
-                             format(build["build"]))
-                data = InputData._parse_tests(job, build,
-                                              get_timestamp=get_timestamp)
-                if data is None:
-                    logging.error("Input data file from the job '{job}', build "
-                                  "'{build}' is damaged. Skipped.".
-                                  format(job=job, build=build["build"]))
-                    continue
+        work_queue.join()
  
  
-                self._cfg.set_input_state(job, build["build"], "processed")
+        logging.info("Done.")
  
  
-                try:
-                    remove(build["file-name"])
-                except OSError as err:
-                    logging.error("Cannot remove the file '{0}': {1}".
-                                  format(build["file-name"], err))
+        while not data_queue.empty():
+            result = data_queue.get()
  
  
+            job = result["job"]
+            build_nr = result["build"]["build"]
+
+            if result["data"]:
+                data = result["data"]
                  build_data = pd.Series({
                      "metadata": pd.Series(data["metadata"].values(),
                                            index=data["metadata"].keys()),
                  build_data = pd.Series({
                      "metadata": pd.Series(data["metadata"].values(),
                                            index=data["metadata"].keys()),
@@ -836,15 +902,35 @@ class InputData(object):
                                          index=data["suites"].keys()),
                      "tests": pd.Series(data["tests"].values(),
                                         index=data["tests"].keys())})
                                          index=data["suites"].keys()),
                      "tests": pd.Series(data["tests"].values(),
                                         index=data["tests"].keys())})
-                builds_data[str(build["build"])] = build_data
-                build["status"] = "processed"
-                logging.info("    Done.")
  
  
-            job_data[job] = pd.Series(builds_data.values(),
-                                      index=builds_data.keys())
-            logging.info("  Done.")
+                if self._input_data.get(job, None) is None:
+                    self._input_data[job] = pd.Series()
+                self._input_data[job][str(build_nr)] = build_data
+
+                self._cfg.set_input_file_name(job, build_nr,
+                                              result["build"]["file-name"])
+
+            self._cfg.set_input_state(job, build_nr, result["state"])
+
+            for item in result["logs"]:
+                if item[0] == "INFO":
+                    logging.info(item[1])
+                elif item[0] == "ERROR":
+                    logging.error(item[1])
+                elif item[0] == "DEBUG":
+                    logging.debug(item[1])
+                elif item[0] == "CRITICAL":
+                    logging.critical(item[1])
+                elif item[0] == "WARNING":
+                    logging.warning(item[1])
+
+        del data_queue
+
+        # Terminate all workers
+        for worker in workers:
+            worker.terminate()
+            worker.join()
  
  
-        self._input_data = pd.Series(job_data.values(), index=job_data.keys())
          logging.info("Done.")
  
      @staticmethod
          logging.info("Done.")
  
      @staticmethod
@@ -921,9 +1007,6 @@ class InputData(object):
          :rtype pandas.Series
          """
  
          :rtype pandas.Series
          """
  
-        logging.info("    Creating the data set for the {0} '{1}'.".
-                     format(element.get("type", ""), element.get("title", "")))
-
          try:
              if element["filter"] in ("all", "template"):
                  cond = "True"
          try:
              if element["filter"] in ("all", "template"):
                  cond = "True"