CSIT-607 Optimize VIRL job scheduling algorithm
[csit.git] / resources / tools / virl / bin / start-testcase
1 #!/usr/bin/python
2
3 # Copyright (c) 2016 Cisco and/or its affiliates.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at:
7 #
8 #     http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 """This script is handling starting of VIRL simulations."""
17
18 __author__ = 'ckoester@cisco.com'
19
20 import argparse
21 import netifaces
22 import os
23 import paramiko
24 import random
25 import re
26 import shutil
27 import sys
28 import tempfile
29 import time
30
31 import requests
32
33 IPS_PER_SIMULATION = 5
34
def indent(lines, amount, fillchar=' '):
    """Prefix every line of a string with a run of fill characters.

    The text is split on newline characters, so a trailing newline yields a
    final line that consists of the padding only.

    :param lines: Text to indent.
    :param amount: How many times the fill character is repeated per line.
    :param fillchar: Character (or string) used to build the padding.
    :type lines: str
    :type amount: int
    :type fillchar: str
    :returns: Text with the padding prepended to each line.
    :rtype: str
    """
    prefix = fillchar * amount
    return '\n'.join(prefix + row for row in lines.split('\n'))
49
def print_to_stderr(msg, end='\n'):
    """Write a message to standard error, ignoring ValueError.

    The message is converted with str() and the terminator is appended. A
    ValueError raised while converting or writing (for example when writing
    to an already closed stream) is silently dropped so that diagnostics
    can never abort the script.

    :param msg: Message to print.
    :param end: Terminator appended to the message, newline by default.
    :type msg: str
    :type end: str
    """
    try:
        stream = sys.stderr
        stream.write(str(msg) + end)
    except ValueError:
        return
62
def get_assigned_interfaces(args, network="flat"):
    """Fetch the ports currently assigned in an openstack network.

    Issues a GET request to the ports REST endpoint on args.virl_ip,
    authenticating with args.username and args.password.

    :param args: Command line params.
    :param network: Openstack network.
    :type args: ArgumentParser
    :type network: str
    :returns: Decoded JSON body of the response (assigned interfaces).
    :rtype: list
    :raises RuntimeError: If the response status is not 200.
    """
    url = 'http://{}/openstack/rest/ports/{}'.format(args.virl_ip, network)
    response = requests.get(url, auth=(args.username, args.password))
    if response.status_code != 200:
        raise RuntimeError("ERROR: Retrieving ports in use - "
                           "Status other than 200 HTTP OK:\n{}"
                           .format(response.content))
    return response.json()
83
def get_assigned_interfaces_count(args, network="flat"):
    """Return how many interfaces are assigned in an openstack network.

    :param args: Command line params.
    :param network: Openstack network.
    :type args: ArgumentParser
    :type network: str
    :returns: Number of assigned interfaces.
    :rtype: int
    """
    assigned = get_assigned_interfaces(args, network=network)
    return len(assigned)
95
def check_ip_addresses(args):
    """Wait until enough free IP addresses exist for one simulation.

    Up to args.wait_count attempts are made. Each attempt compares the
    number of free addresses (args.quota minus the assigned interface
    count) with IPS_PER_SIMULATION. Between attempts the function sleeps
    for a random 1-3 minutes.

    :param args: Command line params.
    :type args: ArgumentParser
    :raises RuntimeError: If not enough free addresses became available
        within args.wait_count attempts.
    """
    for attempt in range(1, args.wait_count + 1):
        free_ips = args.quota - get_assigned_interfaces_count(args)
        if free_ips >= IPS_PER_SIMULATION:
            return
        if args.verbosity >= 2:
            # Attempts are reported 1-based, like the other wait loops of
            # this script.
            print_to_stderr("DEBUG: - Attempt {} out of {}, waiting for free "
                            "IP addresses".format(attempt, args.wait_count))
        # Sleeping after the last attempt would only delay the failure,
        # because no further check follows it.
        if attempt < args.wait_count:
            # Wait random amount of time within range 1-3 minutes
            time.sleep(random.randint(60, 180))
    raise RuntimeError("ERROR: Not enough IP addresses to run simulation")
114
def check_virl_resources(args):
    """Verify that VIRL has the resources needed to launch a simulation.

    :param args: Command line params.
    :type args: ArgumentParser
    """
    # Free IP addresses are the only resource verified here.
    check_ip_addresses(args)
122
def _build_arg_parser(default_addr):
    """Create the command line parser of this script.

    :param default_addr: Default value for the NFS server IP option.
    :type default_addr: str
    :returns: Configured parser.
    :rtype: ArgumentParser
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("topology", help="the base topology to be started")
    parser.add_argument("packages", help="Path to the VPP .deb(s) or .rpm(s) " +
                        "that is/are to be installed", nargs='+')
    parser.add_argument("-c", "--copy", help="Copy the VPP packages, " +
                        "leaving the originals in place. Default is to " +
                        "move them.", action='store_true')
    parser.add_argument("-k", "--keep", help="Keep (do not delete) the " +
                        "simulation in case of error", action='store_true')
    parser.add_argument("-v", "--verbosity", action="count", default=0)
    parser.add_argument("-nip", "--nfs-server-ip", help="NFS server (our) IP " +
                        "default is derived from routing table: " +
                        "{}".format(default_addr), default=default_addr)
    parser.add_argument("-ns", "--nfs-scratch-directory",
                        help="Server location for NFS scratch directory",
                        default="/nfs/scratch")
    parser.add_argument("-nc", "--nfs-common-directory",
                        help="Server location for NFS common (read-only) " +
                        "directory", default="/nfs/common")
    parser.add_argument("-wc", "--wait-count",
                        help="number of intervals to wait for simulation to " +
                        "be ready", type=int, default=48)
    parser.add_argument("-wt", "--wait-time",
                        help="length of a single interval to wait for " +
                        "simulation to be ready", type=int, default=5)
    parser.add_argument("-vip", "--virl-ip",
                        help="VIRL IP and Port (e.g. 127.0.0.1:19399)",
                        default="127.0.0.1:19399")
    parser.add_argument("-u", "--username", help="VIRL username",
                        default="tb4-virl")
    parser.add_argument("-au", "--admin-username", help="VIRL admin username",
                        default="uwmadmin")
    parser.add_argument("-p", "--password", help="VIRL password",
                        default="Cisco1234")
    parser.add_argument("-su", "--ssh-user", help="SSH username",
                        default="cisco")
    parser.add_argument("-e", "--expiry", help="Simulation expiry",
                        default="120")
    parser.add_argument("-spr", "--ssh-privkey", help="SSH private keyfile",
                        default="/home/jenkins-in/.ssh/id_rsa_virl")
    parser.add_argument("-spu", "--ssh-pubkey", help="SSH public keyfile",
                        default="/home/jenkins-in/.ssh/id_rsa_virl.pub")
    parser.add_argument("-r", "--release", help="VM disk image/release " +
                        "(ex. \"csit-ubuntu-16.04.1_2016-12-19_1.6\")",
                        default="csit-ubuntu-16.04.1_2016-12-19_1.6")
    parser.add_argument("--topology-directory", help="Topology directory",
                        default="/home/jenkins-in/testcase-infra/topologies")
    parser.add_argument("-q", "--quota",
                        help="VIRL quota for max number of allowed IPs",
                        type=int, default=74)
    return parser


def _write_temp_topology(args, topology_virl_filename):
    """Render the VIRL topology template into a temporary file.

    The SSH public key, the NFS export locations and the VM image name are
    substituted for their placeholders line by line.

    :param args: Command line params.
    :param topology_virl_filename: Path to the topology template.
    :type args: ArgumentParser
    :type topology_virl_filename: str
    :returns: Path to the rendered temporary topology file.
    :rtype: str
    """
    # Read the key before creating the temporary file so that a missing key
    # file does not leave a stray temporary file behind.
    with open(args.ssh_pubkey, 'r') as pubkey_file:
        pub_key = pubkey_file.read().replace('\n', '')

    # Applied in this order to every line of the template.
    replacements = (
        ("  - VIRL-USER-SSH-PUBLIC-KEY", "  - " + pub_key),
        ("$$NFS_SERVER_SCRATCH$$",
         args.nfs_server_ip + ":" + args.nfs_scratch_directory),
        ("$$NFS_SERVER_COMMON$$",
         args.nfs_server_ip + ":" + args.nfs_common_directory),
        ("$$VM_IMAGE$$", "server-" + args.release),
    )

    temp_handle, temp_topology = tempfile.mkstemp()
    # Wrap the descriptor returned by mkstemp() instead of opening the path
    # a second time; the with statement closes it.
    with os.fdopen(temp_handle, 'w') as new_file, \
        open(topology_virl_filename, 'r') as old_file:
        for line in old_file:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            new_file.write(line)
    return temp_topology


def _debug_response(args, req, announce_text=False):
    """Dump URL, body and status code of a response at verbosity >= 2.

    :param args: Command line params.
    :param req: Response object returned by requests.
    :param announce_text: Print a header line before the response body.
    :type args: ArgumentParser
    :type req: requests.Response
    :type announce_text: bool
    """
    if args.verbosity < 2:
        return
    print_to_stderr("DEBUG: - Request URL {}".format(req.url))
    if announce_text:
        print_to_stderr("DEBUG: - Request Text")
    print_to_stderr("{}".format(req.text))
    print_to_stderr("DEBUG: - Response Code {}".format(req.status_code))


def _stop_simulation(args, session_id):
    """Ask VIRL to stop a simulation; request failures are only logged.

    :param args: Command line params.
    :param session_id: ID of the simulation to stop.
    :type args: ArgumentParser
    :type session_id: str
    """
    try:
        requests.get('http://' + args.virl_ip +
                     '/simengine/rest/stop/' + session_id,
                     auth=(args.username, args.password))
    except requests.exceptions.RequestException as ex_error:
        # Stopping happens on an error path already; keep going so that
        # the remaining cleanup is still performed.
        print_to_stderr(ex_error)


def _abort_simulation(args, session_id, scratch_directory):
    """Clean up after a failure that happened post-launch and exit(1).

    Unless --keep was given, the simulation is stopped, its scratch
    directory is removed and the session ID is printed to stdout.

    :param args: Command line params.
    :param session_id: ID of the running simulation.
    :param scratch_directory: Scratch directory of the simulation.
    :type args: ArgumentParser
    :type session_id: str
    :type scratch_directory: str
    """
    if not args.keep:
        _stop_simulation(args, session_id)
        try:
            shutil.rmtree(scratch_directory)
        except (shutil.Error, OSError):
            # rmtree() reports failures as OSError; shutil.Error alone
            # would let them escape as a traceback.
            print_to_stderr("ERROR: Removing scratch directory")
        print("{}".format(session_id))
    sys.exit(1)


def _select_vpp_install_command(args):
    """Pick the package install command matching the topology name.

    :param args: Command line params.
    :type args: ArgumentParser
    :returns: Shell command to run on the node, or None if the topology
        name matches no supported OS.
    :rtype: str or None
    """
    if 'centos' in args.topology:
        if args.verbosity >= 1:
            print_to_stderr("DEBUG: Installing RPM packages")
        return 'sudo rpm -ivh /scratch/vpp/*.rpm'
    if 'trusty' in args.topology or 'xenial' in args.topology:
        if args.verbosity >= 1:
            print_to_stderr("DEBUG: Installing DEB packages")
        return 'sudo dpkg -i --force-all /scratch/vpp/*.deb'
    print_to_stderr("ERROR: Unsupported OS requested: {}"
                    .format(args.topology))
    return None


def _upgrade_vpp(args, nodeaddrs, scratch_directory):
    """Install the VPP packages over SSH on every node except type 'tg'.

    :param args: Command line params.
    :param nodeaddrs: Management IPs keyed by node type, then node name.
    :param scratch_directory: Scratch directory; receives the ssh.log file.
    :type args: ArgumentParser
    :type nodeaddrs: dict
    :type scratch_directory: str
    """
    for nodetype in nodeaddrs:
        if nodetype == 'tg':
            continue
        for hostname in nodeaddrs[nodetype]:
            ipaddr = nodeaddrs[nodetype][hostname]
            if args.verbosity >= 2:
                print_to_stderr("DEBUG: Upgrading VPP on node {}"
                                .format(ipaddr))
            paramiko.util.log_to_file(os.path.join(scratch_directory,
                                                   "ssh.log"))
            client = paramiko.SSHClient()
            try:
                client.load_system_host_keys()
                client.load_host_keys("/dev/null")
                client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                client.connect(ipaddr, username=args.ssh_user,
                               key_filename=args.ssh_privkey)
                vpp_install_command = _select_vpp_install_command(args)
                # Nothing is executed for an unsupported OS; the error has
                # been reported by the helper.
                if vpp_install_command is not None:
                    _, stdout, stderr = \
                        client.exec_command(vpp_install_command)
                    c_stdout = stdout.read()
                    c_stderr = stderr.read()
                    if args.verbosity >= 2:
                        print_to_stderr("DEBUG: Command output was:")
                        print_to_stderr(c_stdout)
                        print_to_stderr("DEBUG: Command stderr was:")
                        print_to_stderr(c_stderr)
            finally:
                # Release the SSH connection even if a step above failed.
                client.close()


#
# NOTE: Argument parsing, topology rendering, simulation teardown and the VPP
# upgrade live in the private helpers above. The remaining steps still run as
# one sequence inside main() and could be broken down further.
#
def main():
    """Start a VIRL simulation and print its session ID to stdout.

    Steps: parse the command line, verify the input files, render and
    launch the topology, set the session expiry, populate the scratch
    directory, wait for the nodes to become ACTIVE, write the ansible
    hosts and topology YAML files, wait for the hosts to show up in the
    scratch directory and install the VPP packages.

    Every failure path terminates the script through sys.exit(1).
    """
    #
    # Get our default interface IP address. This will become the default
    # value for the "NFS Server IP" option.
    #
    gws = netifaces.gateways()
    addrs = netifaces.ifaddresses(gws['default'][netifaces.AF_INET][1])
    default_addr = addrs[netifaces.AF_INET][0]['addr']

    #
    # Verify CLI parameters
    #
    args = _build_arg_parser(default_addr).parse_args()

    #
    # Check if topology and template exist
    #
    if args.verbosity >= 2:
        print_to_stderr("DEBUG: Running with topology {}"
                        .format(args.topology))

    topology_virl_filename = os.path.join(args.topology_directory,
                                          args.topology + ".virl")
    topology_yaml_filename = os.path.join(args.topology_directory,
                                          args.topology + ".yaml")

    if not os.path.isfile(topology_virl_filename):
        print_to_stderr("ERROR: Topology VIRL file {} does not exist"
                        .format(topology_virl_filename))
        sys.exit(1)
    if not os.path.isfile(topology_yaml_filename):
        print_to_stderr("ERROR: Topology YAML file {} does not exist"
                        .format(topology_yaml_filename))
        sys.exit(1)

    #
    # Check if VPP package exists
    #
    for package in args.packages:
        if args.verbosity >= 2:
            print_to_stderr("DEBUG: Checking if file {} exists"
                            .format(package))
        if not os.path.isfile(package):
            print_to_stderr("ERROR: Debian package {} does not exist"
                            .format(package))
            sys.exit(1)

    #
    # Start VIRL topology
    #
    if args.verbosity >= 1:
        print_to_stderr("DEBUG: Starting VIRL topology")
    temp_topology = _write_temp_topology(args, topology_virl_filename)

    try:
        check_virl_resources(args)
        # The with statement closes the upload handle once the request is
        # done.
        with open(temp_topology, 'rb') as topology_file:
            req = requests.post('http://' + args.virl_ip +
                                '/simengine/rest/launch',
                                auth=(args.username, args.password),
                                data=topology_file)
        _debug_response(args, req)
        if req.status_code != 200:
            raise RuntimeError("ERROR: Launching VIRL simulation - "
                               "Status other than 200 HTTP OK:\n{}"
                               .format(req.content))
    except (requests.exceptions.RequestException,
            RuntimeError) as ex_error:
        print_to_stderr(ex_error)
        os.remove(temp_topology)
        sys.exit(1)

    # If we got here, we had a good response. The response content is the
    # session ID.
    session_id = req.content
    if args.verbosity >= 1:
        print_to_stderr("DEBUG: VIRL simulation session-id: {}"
                        .format(session_id))

    # Set session expiry to autokill sessions if not done from jenkins
    if not args.keep:
        if args.verbosity >= 1:
            print_to_stderr("DEBUG: Setting expire for session-id: {}"
                            .format(session_id))
        try:
            req = requests.put('http://' + args.virl_ip +
                               '/simengine/rest/admin-update/' + session_id +
                               '/expiry',
                               auth=(args.admin_username, args.password),
                               params={'user': args.username,
                                       'expires': args.expiry})
            _debug_response(args, req)
            if req.status_code != 200:
                raise RuntimeError("ERROR: Setting expiry to simulation - "
                                   "Status other than 200 HTTP OK:\n{}"
                                   .format(req.content))
        except (requests.exceptions.RequestException,
                RuntimeError) as ex_error:
            print_to_stderr(ex_error)
            _stop_simulation(args, session_id)
            os.remove(temp_topology)
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print("{}".format(session_id))
            sys.exit(1)

    #
    # Create simulation scratch directory. Move topology file into that
    # directory. Copy or move debian packages into that directory.
    #
    scratch_directory = os.path.join(args.nfs_scratch_directory, session_id)
    os.mkdir(scratch_directory)
    shutil.move(temp_topology, os.path.join(scratch_directory,
                                            "virl_topology.virl"))
    os.mkdir(os.path.join(scratch_directory, "vpp"))
    transfer = shutil.copy if args.copy else shutil.move
    for package in args.packages:
        transfer(package, os.path.join(scratch_directory, "vpp",
                                       os.path.basename(package)))

    #
    # Wait for simulation to become active
    #
    if args.verbosity >= 1:
        print_to_stderr("DEBUG: Waiting for simulation to become active")

    sim_is_started = False
    nodelist = []
    # Stays None if the loop below never runs (wait count <= 0).
    data = None

    count = args.wait_count
    while (count > 0) and not sim_is_started:
        time.sleep(args.wait_time)
        count -= 1

        req = requests.get('http://' + args.virl_ip + '/simengine/rest/nodes/' +
                           session_id, auth=(args.username, args.password))
        data = req.json()

        # Hosts are the keys of the inner dictionary. The node list is
        # rebuilt on every attempt and the last one is kept. Nodes whose
        # management proxy is "self" are left out.
        nodes = data[session_id]
        nodelist = [key for key in nodes
                    if nodes[key]['management-proxy'] != "self"]
        total = len(nodelist)
        active = sum(1 for key in nodelist
                     if nodes[key]['state'] == "ACTIVE")

        if args.verbosity >= 2:
            print_to_stderr("DEBUG: - Attempt {} out of {}, total {} hosts, "
                            "{} active".format(args.wait_count-count,
                                               args.wait_count, total, active))
        if active == total:
            sim_is_started = True

    if not sim_is_started:
        print_to_stderr("ERROR: Simulation nodes never changed to ACTIVE state")
        print_to_stderr("Last VIRL response:")
        print_to_stderr(data)
        _abort_simulation(args, session_id, scratch_directory)

    if args.verbosity >= 2:
        print_to_stderr("DEBUG: Nodes: {}"
                        .format(", ".join(nodelist)))

    #
    # Fetch simulation's IPs and create files
    # (ansible hosts file, topology YAML file)
    #
    try:
        req = requests.get('http://' + args.virl_ip +
                           '/simengine/rest/interfaces/' + session_id,
                           auth=(args.username, args.password),
                           params={'fetch-state': '1'})
        _debug_response(args, req, announce_text=True)
        if req.status_code != 200:
            raise RuntimeError("ERROR:Fetching IP's of simulation - "
                               "Status other than 200 HTTP OK:\n{}"
                               .format(req.content))
    except (requests.exceptions.RequestException,
            RuntimeError) as ex_error:
        print_to_stderr(ex_error)
        _abort_simulation(args, session_id, scratch_directory)
    data = req.json()

    # Populate node addresses
    nodeaddrs = {}
    topology = {}
    for key in nodelist:
        # Node type is the node name up to its first digit.
        nodetype = re.split('[0-9]', key)[0]
        interfaces = data[session_id][key]
        # Management address without the prefix length part.
        mgmt_ip = re.split('/', interfaces['management']['ip-address'])[0]
        nodeaddrs.setdefault(nodetype, {})[key] = mgmt_ip
        if args.verbosity >= 2:
            print_to_stderr("DEBUG: Node {} is of type {} and has mgmt IP {}"
                            .format(key, nodetype, mgmt_ip))

        topology[key] = {}
        for key2 in interfaces:
            nic = interfaces[key2]
            topology[key]["nic-"+key2] = nic
            if 'ip-address' in nic and nic['ip-address'] is not None:
                nic['ip-addr'] = re.split('/', nic['ip-address'])[0]

    # Write ansible file
    with open(os.path.join(scratch_directory, 'ansible-hosts'), 'w') as \
        ansiblehosts:
        for key1 in nodeaddrs:
            ansiblehosts.write("[{}]\n".format(key1))
            for key2 in nodeaddrs[key1]:
                ansiblehosts.write("{} hostname={}\n"
                                   .format(nodeaddrs[key1][key2], key2))

    # Process topology YAML template
    with open(args.ssh_privkey, 'r') as privkey_file:
        priv_key = indent(privkey_file.read(), 6)

    with open(os.path.join(scratch_directory, "topology.yaml"), 'w') as \
        new_file, open(topology_yaml_filename, 'r') as old_file:
        for line in old_file:
            new_file.write(line.format(priv_key=priv_key, topology=topology))

    #
    # Wait for hosts to become reachable over SSH
    #
    if args.verbosity >= 1:
        print_to_stderr("DEBUG: Waiting for hosts to become reachable over SSH")

    missing = -1
    count = args.wait_count
    while (count > 0) and missing != 0:
        time.sleep(args.wait_time)
        count -= 1

        # A host counts as missing until an entry named after it exists in
        # the scratch directory.
        missing = sum(
            1 for key in nodelist
            if not os.path.exists(os.path.join(scratch_directory, key)))
        if args.verbosity >= 2:
            print_to_stderr("DEBUG: Attempt {} out of {}, waiting for {} hosts"
                            .format(args.wait_count-count, args.wait_count,
                                    missing))

    if missing != 0:
        print_to_stderr("ERROR: Simulation started OK but {} hosts never "
                        "mounted their NFS directory".format(missing))
        _abort_simulation(args, session_id, scratch_directory)

    #
    # Upgrade VPP
    #
    if args.verbosity >= 1:
        print_to_stderr("DEBUG: Upgrading VPP")

    _upgrade_vpp(args, nodeaddrs, scratch_directory)

    #
    # Write a file with timestamp to scratch directory. We can use this to track
    # how long a simulation has been running.
    #
    with open(os.path.join(scratch_directory, 'start_time'), 'a') as \
        timestampfile:
        timestampfile.write('{}\n'.format(int(time.time())))

    #
    # Declare victory
    #
    if args.verbosity >= 1:
        print_to_stderr("SESSION ID: {}".format(session_id))

    print("{}".format(session_id))
555
if __name__ == "__main__":
    # The value returned by main() is handed to sys.exit() as the exit
    # status; the failure paths inside main() call sys.exit(1) themselves.
    sys.exit(main())