From 2b836cf4d1e4e59ca34229a9fdf49d79216da20e Mon Sep 17 00:00:00 2001
From: Dave Barach
Date: Fri, 22 Apr 2016 09:54:22 -0400
Subject: [PATCH] Add a vpp_restart command

vpp_restart solves the problem of restarting vpp and not exiting until
the new vpp instance is sufficiently established to allow immediate
binary API connections. The point is to avoid using arbitrary "sleep N"
commands e.g. in CSIT shell scripts.

We send SIGTERM to the current vpp process, and expect / depend on the
process-monitor du jour to remove the vpp-api shared-memory segment.

vpp_restart exits w/ status 0 if all is well, non-zero upon failure.

In trying to make vpp_restart reliable, we discovered a
recently-introduced heap corruption bug in .../dpdk/init.c, which caused
vpp to crash on startup once every 20 times on a particular rig.

If possible, we should check /var/log/syslog for evidence of unexpected
multiple restarts during regression-testing.

Change-Id: Ic48c74b1a94a4368066ba2e326d4b12d51192088
Signed-off-by: Dave Barach
---
 vpp-api-test/Makefile.am   |   4 +-
 vpp-api-test/vat/restart.c | 234 +++++++++++++++++++++++++++++++++++++++++++++
 vpp/api/gmon.c             |  16 +++-
 vpp/app/version.c          |   1 +
 4 files changed, 250 insertions(+), 5 deletions(-)
 create mode 100644 vpp-api-test/vat/restart.c

diff --git a/vpp-api-test/Makefile.am b/vpp-api-test/Makefile.am
index 5079f8d49b9..d59c7e27f5c 100644
--- a/vpp-api-test/Makefile.am
+++ b/vpp-api-test/Makefile.am
@@ -20,7 +20,7 @@ libvatplugin_la_SOURCES = \
 
 lib_LTLIBRARIES = libvatplugin.la
 
-bin_PROGRAMS = vpp_api_test vpp_json_test
+bin_PROGRAMS = vpp_api_test vpp_json_test vpp_restart
 
 vpp_api_test_SOURCES = vat/api_format.c vat/main.c vat/plugin.c vat/vat.h vat/json_format.h vat/json_format.c
 vpp_json_test_SOURCES = vat/json_format.h vat/json_format.c vat/json_test.c
@@ -31,5 +31,7 @@ vpp_api_test_LDADD = -lvlibmemoryclient -lvlibapi -lsvm \
 vpp_json_test_LDADD = -lvppinfra -lm
 
 nobase_include_HEADERS = vat/vat.h vat/json_format.h
+vpp_restart_SOURCES = 
vat/restart.c
+vpp_restart_LDADD = -lsvmdb -lsvm -lvppinfra -lpthread -lrt
 
 dist_bin_SCRIPTS = scripts/vppctl
diff --git a/vpp-api-test/vat/restart.c b/vpp-api-test/vat/restart.c
new file mode 100644
index 00000000000..d94ddf89505
--- /dev/null
+++ b/vpp-api-test/vat/restart.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE(review): the header names on the #include lines below were lost
+ * when this patch was extracted; restore them from the upstream file.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Sleep for nsec nanoseconds (callers pass less than one second).
+ * If nanosleep() fails, retry with the remaining time it reported.
+ */
+static void
+restart_sleep_nsec (long nsec)
+{
+  struct timespec req, rem;
+
+  req.tv_sec = 0;
+  req.tv_nsec = nsec;
+  while (nanosleep (&req, &rem) < 0)
+    req = rem;
+}
+
+/*
+ * Restart vpp: find the running vpp through the "vpp_pid" variable in
+ * the svm database, send it SIGTERM, then wait for a new
+ * /dev/shm/vpe-api inode and for a new pid to be published.
+ *
+ * Input: "verbose" (or "v") prints progress on stdout;
+ * "chroot <path>" is handed to svmdb_map_chroot().
+ *
+ * Returns 0 on success, non-zero on failure.
+ *
+ * NOTE(review): clib_error() and clib_unix_error() look like the fatal
+ * vppinfra error macros. If they terminate the process, the return
+ * statements that follow them are never reached -- confirm.
+ */
+int restart_main_fn (unformat_input_t * i)
+{
+  int verbose = 0;
+  int old_pid;
+  int wait;
+  u8 * chroot_path = 0;
+  svmdb_client_t * svmdb_client;
+  volatile pid_t *pidp;
+  struct stat statb;
+  ino_t old_inode;
+  int sleeps;
+
+  if (geteuid())
+    clib_error ("vpp_restart: must be root...");
+
+  while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (i, "verbose") || unformat (i, "v"))
+        verbose = 1;
+      else if (unformat (i, "chroot %s", &chroot_path))
+        ;
+      else
+        {
+          clib_error ("unknown input `%U'", format_unformat_error, i);
+          return 1;
+        }
+    }
+
+  /*
+   * Step 1: look up the current VPP pid in the shared-memory database
+   */
+  svmdb_client = svmdb_map_chroot ((char *) chroot_path);
+
+  pidp = svmdb_local_get_variable_reference (svmdb_client,
+                                             SVMDB_NAMESPACE_VEC,
+                                             "vpp_pid");
+  if (pidp == 0)
+    {
+      clib_error ("'vpp_pid' svm variable not found, vpp has never run?");
+      return 2;
+    }
+
+  /*
+   * Poll for up to 10 seconds for vpp to publish its pid. The pid is
+   * checked before each sleep, so there is no delay when it is already set.
+   */
+  old_pid = *pidp;
+  for (wait = 0; old_pid == 0 && wait < 1000; wait++)
+    {
+      restart_sleep_nsec (10000*1000); /* 10 ms */
+      old_pid = *pidp;
+    }
+
+  if (old_pid == 0)
+    {
+      clib_error ("VPP not runnning...");
+      return 3;
+    }
+
+  /*
+   * Step 2: sanity check the pid we discovered
+   */
+  if (verbose)
+    fformat(stdout, "Sanity check current vpp pid %d\n", old_pid);
+
+  if (kill (old_pid, 0) < 0)
+    {
+      svmdb_unmap (svmdb_client);
+      clib_error ("vpp current pid %d not running...", old_pid);
+      return 2;
+    }
+
+  if (verbose)
+    fformat(stdout, "Sanity check vpp pid %d OK\n", old_pid);
+
+  /*
+   * Step 3: figure out the current vpp <--> client shared-VM file
+   * inode number
+   */
+  if (stat("/dev/shm/vpe-api", &statb) < 0)
+    {
+      clib_unix_error ("stat fail");
+      return 4;
+    }
+
+  old_inode = statb.st_ino;
+
+  /* ino_t may be wider than unsigned int: widen and print with %llu */
+  if (verbose)
+    fformat(stdout, "Old inode %llu\n", (unsigned long long) old_inode);
+
+  /* Note: restart wipes out the shared VM database*/
+  svmdb_unmap (svmdb_client);
+
+  /*
+   * Step 4: send SIGTERM to vpp.
+   * systemd et al. will restart vpp after wiping out the shared-VM
+   * database and (crucially) the shared API messaging segment
+   */
+
+  if (kill (old_pid, SIGTERM) < 0)
+    {
+      clib_unix_error ("SIGTERM fail");
+      return 3;
+    }
+
+  sleeps = 0;
+
+  /*
+   * Step 5: wait up to 15 seconds for a new incarnation of
+   * the shared-VM API segment to appear.
+   */
+  for (wait = 0; wait < 150; wait++)
+    {
+      if ((stat("/dev/shm/vpe-api", &statb) < 0)
+          || statb.st_ino == old_inode)
+        {
+          restart_sleep_nsec (100000*1000); /* 100 ms */
+          sleeps++;
+        }
+      else
+        goto new_inode;
+    }
+
+  clib_error ("Timeout waiting for new inode to appear...");
+  return 5;
+
+ new_inode:
+  if (verbose && sleeps > 0)
+    fformat(stdout, "Inode sleeps %d\n", sleeps);
+
+  if (verbose)
+    fformat(stdout, "New inode %llu\n", (unsigned long long) statb.st_ino);
+
+  /*
+   * Step 6: remap the SVM database
+   */
+  svmdb_client = svmdb_map_chroot ((char *) chroot_path);
+
+  pidp = svmdb_local_get_variable_reference (svmdb_client,
+                                             SVMDB_NAMESPACE_VEC,
+                                             "vpp_pid");
+  if (pidp == 0)
+    {
+      clib_error ("post_restart: 'vpp_pid' svm variable not found, "
+                  "vpp did not restart?");
+      return 2;
+    }
+
+  sleeps = 0;
+
+  /*
+   * Step 7: wait for vpp to publish its new PID
+   */
+
+  /* Spin for up to 15 seconds */
+  for (wait = 0; wait < 150; wait++)
+    {
+      if (*pidp && (*pidp != old_pid))
+        goto restarted;
+      restart_sleep_nsec (100000*1000); /* 100 ms */
+      sleeps++;
+    }
+
+  clib_error ("Timeout waiting for vpp to publish pid after restart...");
+  return 4;
+
+ restarted:
+
+  /* Done... */
+
+  if (verbose && sleeps)
+    fformat(stdout, "pid sleeps %d\n", sleeps);
+
+  if (verbose)
+    fformat (stdout, "New PID %d... Restarted...\n", *pidp);
+
+  svmdb_unmap (svmdb_client);
+  return 0;
+}
+
+/*
+ * Entry point: set up the clib heap, parse the command line and run
+ * restart_main_fn(); its return value becomes the exit status.
+ */
+int main (int argc, char **argv)
+{
+  unformat_input_t i;
+  int ret;
+
+  clib_mem_init (0, 64ULL<<20);
+
+  unformat_init_command_line (&i, argv);
+  ret = restart_main_fn (&i);
+  unformat_free (&i);
+  return ret;
+}
diff --git a/vpp/api/gmon.c b/vpp/api/gmon.c
index 32786d63b38..8ab890fceb4 100644
--- a/vpp/api/gmon.c
+++ b/vpp/api/gmon.c
@@ -92,6 +92,18 @@ static u64 get_significant_errors(gmon_main_t * gm)
   return (significant_errors);
 }
 
+static clib_error_t *
+publish_pid (vlib_main_t *vm)
+{
+  gmon_main_t *gm = &gmon_main;
+
+  *gm->vpef_pid_ptr = getpid();
+
+  return 0;
+}
+VLIB_API_INIT_FUNCTION(publish_pid);
+
+
 static uword
 gmon_process (vlib_main_t * vm,
               vlib_node_runtime_t * rt,
@@ -101,12 +113,8 @@ gmon_process (vlib_main_t * vm,
   u64 input_packets, last_input_packets, new_sig_errors;
   f64 last_runtime, dt, now;
   gmon_main_t *gm = &gmon_main;
-  pid_t vpefpid;
   int i;
 
-  vpefpid = getpid();
-  *gm->vpef_pid_ptr = vpefpid;
-
   last_runtime = 0.0;
   last_input_packets = 0;
 
diff --git a/vpp/app/version.c b/vpp/app/version.c
index 5eb1fc5aad1..588aadf29e9 100644
--- a/vpp/app/version.c
+++ b/vpp/app/version.c
@@ -59,6 +59,7 @@ show_vpe_version_command_fn (vlib_main_t * vm,
   _("Compiler", "%s", vpe_compiler);
   _("CPU model name", "%U", format_cpu_model_name);
   _("CPU microarchitecture", "%U", format_cpu_uarch);
+  _("Current PID", "%d", getpid());
 #if DPDK > 0
   _("DPDK Version", "%s", rte_version());
   _("DPDK EAL init args", "%s", dpdk_main.eal_init_args_str);
-- 
2.16.6