FIX: Topology reservation 49/21349/2
author: Peter Mikus <pmikus@cisco.com>
Fri, 16 Aug 2019 06:47:53 +0000 (06:47 +0000)
committer: Peter Mikus <pmikus@cisco.com>
Mon, 19 Aug 2019 05:22:03 +0000 (05:22 +0000)
The last attempt [0] is doing more harm than good. If a testbed is down
but recovers quickly enough (reboot, temporary ssh connectivity
issue, ...), it never gets back into the pool, leaving the other testbeds
overloaded and jobs queued.

This patch is supposed to partially restore the previous behavior until
a proper fix follows.

[0] https://gerrit.fd.io/r/c/csit/+/21148

Signed-off-by: Peter Mikus <pmikus@cisco.com>
Change-Id: I8203946d10d3e7dd51e97519d679246b5dae59e3

resources/libraries/bash/function/common.sh
resources/tools/scripts/topo_reservation.py

index 4352724..7a55d56 100644 (file)
@@ -556,36 +556,6 @@ function installed () {
 }
 
 
-function remove_topo () {
-
-    # Remove the argument from list of available topologies.
-    #
-    # Just a de-duplicated block of code
-    #
-    # Argument:
-    # - ${1} - The topology item to remove. Required.
-    # Variable read and re-written:
-    # - TOPOLOGIES - Array of paths to topologies, with failed cleanups removed.
-
-    set -exuo pipefail
-
-    warn "Testbed ${topo} seems unsuitable, removing from the list."
-
-    # Build new topology array.
-    #   TOPOLOGIES=("${TOPOLOGIES[@]/$topo}")
-    # does not really work, see:
-    # https://stackoverflow.com/questions/16860877/remove-an-element-from-a-bash-array
-
-    new_topologies=()
-    for item in "${TOPOLOGIES[@]}"; do
-        if [[ "${item}" != "${1}" ]]; then
-            new_topologies+=("${item}")
-        fi
-    done
-    TOPOLOGIES=("${new_topologies[@]}")
-}
-
-
 function reserve_and_cleanup_testbed () {
 
     # Reserve physical testbed, perform cleanup, register trap to unreserve.
@@ -638,11 +608,6 @@ function reserve_and_cleanup_testbed () {
                 fi
                 warn "Testbed cleanup failed: ${topo}"
                 untrap_and_unreserve_testbed "Fail of unreserve after cleanup."
-                # WORKING_TOPOLOGY is now empty again.
-                remove_topo "${topo}"
-            elif [[ "${result}" != "2" ]]; then
-                # 1 or unexpected return code, testbed is probably unusable.
-                remove_topo "${topo}"
             fi
             # Else testbed is accessible but currently reserved, moving on.
         done
index 77d84ef..e7e1ff6 100755 (executable)
@@ -28,6 +28,7 @@ from resources.libraries.python.ssh import exec_cmd
 
 
 RESERVATION_DIR = "/tmp/reservation_dir"
+RESERVATION_NODE = "TG"
 
 
 def diag_cmd(node, cmd):
@@ -38,9 +39,9 @@ def diag_cmd(node, cmd):
     :type ssh: dict
     :type cmd: str
     """
-    print "+", cmd
+    print('+ {cmd}'.format(cmd=cmd))
     _, stdout, _ = exec_cmd(node, cmd)
-    print stdout
+    print(stdout)
 
 
 def main():
@@ -88,37 +89,39 @@ def main():
     # we are using it, because testing shows SSH access to DUT
     # during test affects its performance (bursts of lost packets).
     try:
-        tgn = topology["TG"]
+        node = topology[RESERVATION_NODE]
     except KeyError:
-        print "Topology file does not contain 'TG' node"
+        print("Topology file does not contain '{node}' node".
+              format(node=RESERVATION_NODE))
         return 1
 
     # For system reservation we use mkdir it is an atomic operation and we can
     # store additional data (time, client_ID, ..) within reservation directory.
     if args.cancel:
-        ret, _, err = exec_cmd(tgn, "rm -r {}".format(RESERVATION_DIR))
+        ret, _, err = exec_cmd(node, "rm -r {dir}".format(dir=RESERVATION_DIR))
         if ret:
-            print "Cancellation unsuccessful:\n{}".format(err)
+            print("Cancellation unsuccessful:\n{err}".format(err=err))
         return ret
     # Before critical section, output can be outdated already.
     print("Diagnostic commands:")
     # -d and * are to supress "total <size>", see https://askubuntu.com/a/61190
-    diag_cmd(tgn, "ls --full-time -cd '{dir}'/*".format(dir=RESERVATION_DIR))
-    print("Attempting reservation.")
+    diag_cmd(node, "ls --full-time -cd '{dir}'/*".format(dir=RESERVATION_DIR))
+    print("Attempting testbed reservation.")
     # Entering critical section.
-    ret, _, err = exec_cmd(tgn, "mkdir '{dir}'".format(dir=RESERVATION_DIR))
+    ret, _, _ = exec_cmd(node, "mkdir '{dir}'".format(dir=RESERVATION_DIR))
     # Critical section is over.
     if ret:
-        print("Already reserved by another job:\n{}".format(err))
+        _, stdo, _ = exec_cmd(node, "ls '{dir}'/*".format(dir=RESERVATION_DIR))
+        print("Testbed already reserved by:\n{stdo}".format(stdo=stdo))
         return 2
     # Here the script knows it is the only owner of the testbed.
-    print("Success, writing test run info to reservation dir.")
-    ret2, _, err = exec_cmd(
-        tgn, "touch '{dir}/{runtag}'"\
+    print("Reservation success, writing additional info to reservation dir.")
+    ret, _, err = exec_cmd(
+        node, "touch '{dir}/{runtag}'"\
         .format(dir=RESERVATION_DIR, runtag=args.runtag))
-    if ret2:
-        print("Writing test run info failed, but continuing anyway:\n{}".format(
-            err))
+    if ret:
+        print("Writing test run info failed, but continuing anyway:\n{err}".
+              format(err=err))
     return 0