From 6f1ff57c83763556450e8bfcb4571a9c017abe70 Mon Sep 17 00:00:00 2001 From: Vratko Polak Date: Thu, 8 Aug 2019 15:11:38 +0200 Subject: [PATCH] Skip bad testbeds in reservation Previously, ssh-inaccesible testbeds were handled the same way as reserved, never ending the wait loop. With this, if no testbed is accessible, the job fails early. Change-Id: I01bc79094fe0232a47d795e53e3daa52e8742bac Signed-off-by: Vratko Polak --- resources/libraries/bash/function/common.sh | 61 ++++++++++++++++++++--------- resources/tools/scripts/topo_reservation.py | 15 +++++-- 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/resources/libraries/bash/function/common.sh b/resources/libraries/bash/function/common.sh index a03e90422c..4a560ff98c 100644 --- a/resources/libraries/bash/function/common.sh +++ b/resources/libraries/bash/function/common.sh @@ -553,6 +553,36 @@ function installed () { } +function remove_topo () { + + # Remove the argument from list of available topologies. + # + # Just a de-duplicated block of code + # + # Argument: + # - ${1} - The topology item to remove. Required. + # Variable read and re-written: + # - TOPOLOGIES - Array of paths to topologies, with failed cleanups removed. + + set -exuo pipefail + + warn "Testbed ${topo} seems unsuitable, removing from the list." + + # Build new topology array. + # TOPOLOGIES=("${TOPOLOGIES[@]/$topo}") + # does not really work, see: + # https://stackoverflow.com/questions/16860877/remove-an-element-from-a-bash-array + + new_topologies=() + for item in "${TOPOLOGIES[@]}"; do + if [[ "${item}" != "${1}" ]]; then + new_topologies+=("${item}") + fi + done + TOPOLOGIES=("${new_topologies[@]}") +} + + function reserve_and_cleanup_testbed () { # Reserve physical testbed, perform cleanup, register trap to unreserve. @@ -576,7 +606,7 @@ function reserve_and_cleanup_testbed () { set -exuo pipefail - while [[ ${TOPOLOGIES[@]} ]]; do + while true; do for topo in "${TOPOLOGIES[@]}"; do set +e scrpt="${PYTHON_SCRIPTS_DIR}/topo_reservation.py" @@ -609,38 +639,31 @@ function reserve_and_cleanup_testbed () { warn "Testbed cleanup failed: ${topo}" untrap_and_unreserve_testbed "Fail of unreserve after cleanup." # WORKING_TOPOLOGY is now empty again. - # Build new topology array. - # TOPOLOGIES=("${TOPOLOGIES[@]/$topo}") - # does not really work, see: - # https://stackoverflow.com/questions/16860877/remove-an-element-from-a-bash-array - new_topologies=() - for item in "${TOPOLOGIES[@]}"; do - if [[ "${item}" != "${topo}" ]]; then - new_topologies+=("${item}") - fi - done - TOPOLOGIES=("${new_topologies[@]}") - break + remove_topo "${topo}" + elif [[ "${result}" != "2" ]]; then + # 1 or unexpected return code, testbed is probably unusable. + remove_topo "${topo}" fi + # Else testbed is accessible but currently reserved, moving on. done if [[ -n "${WORKING_TOPOLOGY-}" ]]; then # Exit the infinite while loop if we made a reservation. + warn "Reservation and cleanup successful." break fi + if [[ "${#TOPOLOGIES[@]}" == "0" ]]; then + die "Run out of operational testbeds!" + fi + # Wait ~3minutes before next try. - sleep_time="$[ ( $RANDOM % 20 ) + 180 ]s" || { + sleep_time="$[ ( ${RANDOM} % 20 ) + 180 ]s" || { die "Sleep time calculation failed." } echo "Sleeping ${sleep_time}" sleep "${sleep_time}" || die "Sleep failed." done - if [[ ${TOPOLOGIES[@]} ]]; then - echo "Reservation and cleanup successful." - else - die "Run out of operational testbeds!" - fi } diff --git a/resources/tools/scripts/topo_reservation.py b/resources/tools/scripts/topo_reservation.py index a04709b2d5..9f26677f42 100755 --- a/resources/tools/scripts/topo_reservation.py +++ b/resources/tools/scripts/topo_reservation.py @@ -74,6 +74,15 @@ def main(): via a graphical terminal, which does not allow copying of text, as they need less keypresses to identify the test run holding the testbed. Also, the listing shows timestamps, which is useful for both audiences. + + This all assumes the target system accepts ssh connections. + If it does not, the caller probably wants to stop trying + to reserve this system. Therefore this script can return 3 different codes. + Return code 0 means the reservation was successful. + Return code 1 means the system is inaccessible (or similarly unsuitable). + Return code 2 means the system is accessible, but already reserved. + The reason unsuitable systems return 1 is because that is also the value + Python returns on encountering and unexcepted exception. """ parser = argparse.ArgumentParser() parser.add_argument("-t", "--topo", required=True, @@ -117,8 +126,8 @@ def main(): ret, _, err = exec_cmd(tgn, "mkdir '{dir}'".format(dir=RESERVATION_DIR)) # Critical section is over. if ret: - print "Reservation unsuccessful:\n{}".format(err) - return ret + print "Already reserved by another job:\n{}".format(err) + return 2 # Here the script knows it is the only owner of the testbed. print "Success, writing test run info to reservation dir." # TODO: Add optional argument to exec_cmd_no_error to print message @@ -129,7 +138,7 @@ def main(): if ret2: print "Writing test run info failed, but continuing anyway:\n{}".format( err) - return ret + return 0 if __name__ == "__main__": -- 2.16.6