Soak: Avoid a possible deadlock.
[csit.git] / resources / libraries / python / PLRsearch / PLRsearch.py
index e20d293..ec58fbd 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Cisco and/or its affiliates.
+# Copyright (c) 2020 Cisco and/or its affiliates.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at:
@@ -57,6 +57,8 @@ class PLRsearch:
             trial_number_offset=0, timeout=1800.0, trace_enabled=False):
         """Store rate measurer and additional parameters.
 
+        The measurer must never report a negative loss count.
+
         TODO: Copy AbstractMeasurer from MLRsearch.
 
         :param measurer: The measurer to call when searching.
@@ -599,15 +601,17 @@ class PLRsearch:
 
             dilled_function = dill.dumps(value_logweight_func)
             boss_pipe_end, worker_pipe_end = multiprocessing.Pipe()
-            boss_pipe_end.send(
-                (dimension, dilled_function, focus_tracker, max_samples)
-            )
+            # Do not send yet, run the worker first to avoid a deadlock.
+            # See https://stackoverflow.com/a/15716500
             worker = multiprocessing.Process(
                 target=Integrator.try_estimate_nd,
                 args=(worker_pipe_end, 10.0, self.trace_enabled)
             )
             worker.daemon = True
             worker.start()
+            boss_pipe_end.send(
+                (dimension, dilled_function, focus_tracker, max_samples)
+            )
             return boss_pipe_end
 
         erf_pipe = start_computing(self.lfit_erf, erf_focus_tracker)
@@ -633,7 +637,15 @@ class PLRsearch:
                 and number of samples used for this iteration.
             :rtype: _PartialResult
             """
-            pipe.send(None)
+            # If the worker hit an exception, we receive the outcome in the
+            # recv below, but this send will report a broken pipe.
+            # EAFP says we should ignore the error (instead of polling first).
+            # https://devblogs.microsoft.com/python
+            #   /idiomatic-python-eafp-versus-lbyl/
+            try:
+                pipe.send(None)
+            except BrokenPipeError:
+                pass
             if not pipe.poll(10.0):
                 raise RuntimeError(f"Worker {name} did not finish!")
             result_or_traceback = pipe.recv()