Skip bad testbeds in reservation 48/21148/6
authorVratko Polak <vrpolak@cisco.com>
Thu, 8 Aug 2019 13:11:38 +0000 (15:11 +0200)
committerVratko Polak <vrpolak@cisco.com>
Fri, 9 Aug 2019 14:36:09 +0000 (14:36 +0000)
Previously, ssh-inaccesible testbeds
were handled the same way as reserved,
never ending the wait loop.

With this, if no testbed is accessible,
the job fails early.

Change-Id: I01bc79094fe0232a47d795e53e3daa52e8742bac
Signed-off-by: Vratko Polak <vrpolak@cisco.com>
resources/libraries/bash/function/common.sh
resources/tools/scripts/topo_reservation.py

index a03e904..4a560ff 100644 (file)
@@ -553,6 +553,36 @@ function installed () {
 }
 
 
+function remove_topo () {
+
+    # Remove the argument from list of available topologies.
+    #
+    # Just a de-duplicated block of code
+    #
+    # Argument:
+    # - ${1} - The topology item to remove. Required.
+    # Variable read and re-written:
+    # - TOPOLOGIES - Array of paths to topologies, with failed cleanups removed.
+
+    set -exuo pipefail
+
+    warn "Testbed ${topo} seems unsuitable, removing from the list."
+
+    # Build new topology array.
+    #   TOPOLOGIES=("${TOPOLOGIES[@]/$topo}")
+    # does not really work, see:
+    # https://stackoverflow.com/questions/16860877/remove-an-element-from-a-bash-array
+
+    new_topologies=()
+    for item in "${TOPOLOGIES[@]}"; do
+        if [[ "${item}" != "${1}" ]]; then
+            new_topologies+=("${item}")
+        fi
+    done
+    TOPOLOGIES=("${new_topologies[@]}")
+}
+
+
 function reserve_and_cleanup_testbed () {
 
     # Reserve physical testbed, perform cleanup, register trap to unreserve.
@@ -576,7 +606,7 @@ function reserve_and_cleanup_testbed () {
 
     set -exuo pipefail
 
-    while [[ ${TOPOLOGIES[@]} ]]; do
+    while true; do
         for topo in "${TOPOLOGIES[@]}"; do
             set +e
             scrpt="${PYTHON_SCRIPTS_DIR}/topo_reservation.py"
@@ -609,38 +639,31 @@ function reserve_and_cleanup_testbed () {
                 warn "Testbed cleanup failed: ${topo}"
                 untrap_and_unreserve_testbed "Fail of unreserve after cleanup."
                 # WORKING_TOPOLOGY is now empty again.
-                # Build new topology array.
-                #   TOPOLOGIES=("${TOPOLOGIES[@]/$topo}")
-                # does not really work, see:
-                # https://stackoverflow.com/questions/16860877/remove-an-element-from-a-bash-array
-                new_topologies=()
-                for item in "${TOPOLOGIES[@]}"; do
-                    if [[ "${item}" != "${topo}" ]]; then
-                        new_topologies+=("${item}")
-                    fi
-                done
-                TOPOLOGIES=("${new_topologies[@]}")
-                break
+                remove_topo "${topo}"
+            elif [[ "${result}" != "2" ]]; then
+                # 1 or unexpected return code, testbed is probably unusable.
+                remove_topo "${topo}"
             fi
+            # Else testbed is accessible but currently reserved, moving on.
         done
 
         if [[ -n "${WORKING_TOPOLOGY-}" ]]; then
             # Exit the infinite while loop if we made a reservation.
+            warn "Reservation and cleanup successful."
             break
         fi
 
+        if [[ "${#TOPOLOGIES[@]}" == "0" ]]; then
+            die "Run out of operational testbeds!"
+        fi
+
         # Wait ~3minutes before next try.
-        sleep_time="$[ ( $RANDOM % 20 ) + 180 ]s" || {
+        sleep_time="$[ ( ${RANDOM} % 20 ) + 180 ]s" || {
             die "Sleep time calculation failed."
         }
         echo "Sleeping ${sleep_time}"
         sleep "${sleep_time}" || die "Sleep failed."
     done
-    if [[ ${TOPOLOGIES[@]} ]]; then
-        echo "Reservation and cleanup successful."
-    else
-        die "Run out of operational testbeds!"
-    fi
 }
 
 
index a04709b..9f26677 100755 (executable)
@@ -74,6 +74,15 @@ def main():
     via a graphical terminal, which does not allow copying of text,
     as they need less keypresses to identify the test run holding the testbed.
     Also, the listing shows timestamps, which is useful for both audiences.
+
+    This all assumes the target system accepts ssh connections.
+    If it does not, the caller probably wants to stop trying
+    to reserve this system. Therefore this script can return 3 different codes.
+    Return code 0 means the reservation was successful.
+    Return code 1 means the system is inaccessible (or similarly unsuitable).
+    Return code 2 means the system is accessible, but already reserved.
+    The reason unsuitable systems return 1 is because that is also the value
+    Python returns on encountering and unexcepted exception.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("-t", "--topo", required=True,
@@ -117,8 +126,8 @@ def main():
     ret, _, err = exec_cmd(tgn, "mkdir '{dir}'".format(dir=RESERVATION_DIR))
     # Critical section is over.
     if ret:
-        print "Reservation unsuccessful:\n{}".format(err)
-        return ret
+        print "Already reserved by another job:\n{}".format(err)
+        return 2
     # Here the script knows it is the only owner of the testbed.
     print "Success, writing test run info to reservation dir."
     # TODO: Add optional argument to exec_cmd_no_error to print message
@@ -129,7 +138,7 @@ def main():
     if ret2:
         print "Writing test run info failed, but continuing anyway:\n{}".format(
             err)
-    return ret
+    return 0
 
 
 if __name__ == "__main__":