tests: avoid test runner hanging on child test process join

[vpp.git] / test / run_tests.py
diff --git a/test/run_tests.py b/test/run_tests.py

index aee05c6..d86356f 100644 (file)
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -11,6 +11,7 @@ import threading
  import signal
  import psutil
  import re
+import multiprocessing
  from multiprocessing import Process, Pipe, cpu_count
  from multiprocessing.queues import Queue
  from multiprocessing.managers import BaseManager
@@ -432,7 +433,17 @@ def run_forked(testcase_suites):
                          results) or stop_run
  
              for finished_testcase in finished_testcase_suites:
-                finished_testcase.child.join()
+                # Somewhat surprisingly, the join below may
+                # time out, even if the client signaled that
+                # it finished - so we log it just in case.
+                join_start = time.time()
+                finished_testcase.child.join(test_finished_join_timeout)
+                join_end = time.time()
+                if join_end - join_start >= test_finished_join_timeout:
+                    finished_testcase.logger.error(
+                        "Timeout joining finished test: %s (pid %d)" %
+                        (finished_testcase.last_test,
+                         finished_testcase.child.pid))
                  finished_testcase.close_pipes()
                  wrapped_testcase_suites.remove(finished_testcase)
                  finished_unread_testcases.add(finished_testcase)
@@ -725,6 +736,8 @@ if __name__ == '__main__':
  
      test_timeout = parse_digit_env("TIMEOUT", 600)  # default = 10 minutes
  
+    test_finished_join_timeout = 15
+
      retries = parse_digit_env("RETRIES", 0)
  
      debug = os.getenv("DEBUG", "n").lower() in ["gdb", "gdbserver"]
@@ -737,13 +750,21 @@ if __name__ == '__main__':
  
      run_interactive = debug or step or force_foreground
  
+    try:
+        num_cpus = len(os.sched_getaffinity(0))
+    except AttributeError:
+        num_cpus = multiprocessing.cpu_count()
+    shm_free = psutil.disk_usage('/dev/shm').free
+
+    print('OS reports %s available cpu(s). Free shm: %s' % (
+        num_cpus, "{:,}MB".format(shm_free / (1024 * 1024))))
+
      test_jobs = os.getenv("TEST_JOBS", "1").lower()  # default = 1 process
      if test_jobs == 'auto':
          if run_interactive:
              concurrent_tests = 1
              print('Interactive mode required, running on one core')
          else:
-            shm_free = psutil.disk_usage('/dev/shm').free
              shm_max_processes = 1
              if shm_free < min_req_shm:
                  raise Exception('Not enough free space in /dev/shm. Required '
@@ -751,14 +772,17 @@ if __name__ == '__main__':
                                  % (min_req_shm >> 20))
              else:
                  extra_shm = shm_free - min_req_shm
-                shm_max_processes += extra_shm / shm_per_process
+                shm_max_processes += extra_shm // shm_per_process
              concurrent_tests = min(cpu_count(), shm_max_processes)
              print('Found enough resources to run tests with %s cores'
                    % concurrent_tests)
      elif test_jobs.isdigit():
          concurrent_tests = int(test_jobs)
+        print("Running on %s core(s) as set by 'TEST_JOBS'." %
+              concurrent_tests)
      else:
          concurrent_tests = 1
+        print('Running on one core.')
  
      if run_interactive and concurrent_tests > 1:
          raise NotImplementedError(
@@ -810,7 +834,7 @@ if __name__ == '__main__':
          # don't fork if requiring interactive terminal
          print('Running tests in foreground in the current process')
          full_suite = unittest.TestSuite()
-        map(full_suite.addTests, suites)
+        full_suite.addTests(suites)
          result = VppTestRunner(verbosity=verbose,
                                 failfast=failfast,
                                 print_summary=True).run(full_suite)