e5b26e910ceeb4af55147e5f6d43820443325181
[csit.git] / resources / tools / presentation / input_data_files.py
1 # Copyright (c) 2020 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
5 #
6 #     http://www.apache.org/licenses/LICENSE-2.0
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13
14 """Inputs
15 Download all data.
16 """
17
18 import re
19 import logging
20 import gzip
21
22 from os import rename, mkdir
23 from os.path import join
24 from http.client import responses, HTTPException
25 from zipfile import ZipFile, is_zipfile, BadZipfile
26
27 import requests
28
29 from requests.adapters import HTTPAdapter, Retry
30 from requests.exceptions import RequestException
31 from requests import codes
32
33 from urllib3.exceptions import HTTPError
34
35 from pal_errors import PresentationError
36
37
# Number of bytes requested per chunk while a download is streamed to disk.
CHUNK_SIZE = 512

# Delimiter put between the job name, the build number and the file name when
# the name of a downloaded file is assembled.
SEPARATOR = "__"

# Finds the release in a job name; the second group holds either four digits
# or the word "master".
REGEX_RELEASE = re.compile(r"(\D*)(\d{4}|master)(\D*)")
45
46
def _download_file(url, file_name, arch=False, verify=True, repeat=1):
    """Download a file with input data.

    The URL is requested as given first. If the server does not answer with
    200 OK, the sub-string "_info" is removed from the URL and the request is
    sent once more; when this second request succeeds, "_info" is removed from
    the file name as well. If the second request fails too, the function
    returns immediately without any further attempt.

    The body of the response is written to the file named as file_name with
    ".gz" removed. If arch is True and file_name includes ".gz", the URL is
    requested again and the data read from the raw stream of the response is
    stored in the file file_name.

    :param url: URL to the file to download.
    :param file_name: Name of file to download.
    :param arch: If True, also .gz file is downloaded.
    :param verify: If true, verify the certificate.
    :param repeat: The number of attempts to download the file. With zero or
        a negative number, no attempt is made.
    :type url: str
    :type file_name: str
    :type arch: bool
    :type verify: bool
    :type repeat: int
    :returns: Tuple with two items: True if the download was successful,
        otherwise False; and the name of the file, which differs from
        file_name if "_info" was removed from it.
    :rtype: tuple(bool, str)
    """

    def requests_retry_session(retries=3,
                               backoff_factor=0.3,
                               status_forcelist=(500, 502, 504)):
        """Create a session which retries failed requests.

        :param retries: Total number of retries to allow.
        :param backoff_factor: A backoff factor to apply between attempts after
            the second try.
        :param status_forcelist: A set of integer HTTP status codes that are
            forced to retry.
        :type retries: int
        :type backoff_factor: float
        :type status_forcelist: iterable
        :returns: Session object.
        :rtype: requests.Session
        """

        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount(u"http://", adapter)
        session.mount(u"https://", adapter)
        return session

    def close_all(*closeables):
        """Close every given object which is not None.

        The items are compared with None on purpose: the truth value of
        a response reflects its status, so a plain truth test would skip
        the responses carrying an error code.

        :param closeables: Objects providing the method close(), or None.
        :type closeables: tuple
        """
        for item in closeables:
            if item is not None:
                item.close()

    success = False
    # range() keeps the number of attempts bounded even if repeat is negative.
    for _ in range(repeat):
        session = None
        response = None
        try:
            logging.info(f"    Connecting to {url} ...")
            session = requests_retry_session()
            response = session.get(url, stream=True, verify=verify)
            code = response.status_code
            logging.info(f"    {code}: {responses[code]}")

            if code != codes[u"OK"]:
                # Second chance: the same URL without "_info".
                close_all(response, session)
                response = None
                url = url.replace(u"_info", u"")
                logging.info(f"    Connecting to {url} ...")
                session = requests_retry_session()
                response = session.get(url, stream=True, verify=verify)
                code = response.status_code
                logging.info(f"    {code}: {responses[code]}")
                if code != codes[u"OK"]:
                    # No more attempts; the finally clause below still runs.
                    return False, file_name
                file_name = file_name.replace(u"_info", u"")

            dst_file_name = file_name.replace(u".gz", u"")
            logging.info(f"    Downloading the file {url} to {dst_file_name}")
            with open(dst_file_name, u"wb") as file_handle:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        file_handle.write(chunk)

            if arch and u".gz" in file_name:
                # Keep also the archive itself; it is requested once more and
                # stored as it arrives in the raw stream.
                close_all(response, session)
                response = None
                logging.info(f"    Downloading the file {url} to {file_name}")
                session = requests_retry_session()
                response = session.get(url, stream=True, verify=verify)
                if response.status_code == codes[u"OK"]:
                    with open(file_name, u"wb") as file_handle:
                        file_handle.write(response.raw.read())
                else:
                    # The unpacked file is already stored, so the download as
                    # a whole is still reported as successful.
                    logging.error(
                        f"Not possible to download the file "
                        f"{url} to {file_name}"
                    )

            success = True
            break
        except (HTTPException, HTTPError) as err:
            logging.error(f"Connection broken:\n{repr(err)}")
        except RequestException as err:
            logging.error(f"HTTP Request exception:\n{repr(err)}")
        except (IOError, ValueError, KeyError) as err:
            logging.error(f"Download failed.\n{repr(err)}")
        finally:
            # Release the connection even if the body was not read.
            close_all(response, session)

    logging.info(u"    Download finished.")
    return success, file_name
154
155
def _unzip_file(spec, build, pid):
    """Unzip downloaded source file.

    The data file is extracted to a temporary directory named by pid and
    created in the working data directory. Then it is moved next to the
    archive; its new name consists of the name of the archive without the
    extension, the separator and the base name of the data file. On success,
    build["file-name"] is set to this new name.

    :param spec: Specification read from the specification file.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :type spec: Specification
    :type build: dict
    :type pid: int
    :returns: True if the data file was extracted and renamed, otherwise
        False.
    :rtype: bool
    """

    file_name = build[u"file-name"]
    if u".zip" in file_name:
        data_file = spec.input[u"zip-extract"]
    else:
        data_file = spec.input[u"extract"]

    directory = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    tmp_dir = join(directory, str(pid))
    try:
        mkdir(tmp_dir)
    except OSError:
        # Best effort only: the directory usually exists already. Any real
        # problem with it is reported by the extraction below.
        pass

    # Strip only the last extension. Splitting on every dot would pick a wrong
    # part of the name if the path or the file name includes more dots.
    base_name = file_name.rsplit(u".", 1)[0]
    new_name = f"{base_name}{SEPARATOR}{data_file.split(u'/')[-1]}"

    logging.info(f"    Unzipping: {data_file} from {file_name}.")
    try:
        with ZipFile(file_name, u'r') as zip_file:
            zip_file.extract(data_file, tmp_dir)
        logging.info(
            f"    Renaming the file {join(tmp_dir, data_file)} to {new_name}"
        )
        rename(join(tmp_dir, data_file), new_name)
        build[u"file-name"] = new_name
        return True
    except (BadZipfile, RuntimeError, KeyError) as err:
        # KeyError is raised if the data file is not present in the archive.
        logging.error(f"Failed to unzip the file {file_name}: {repr(err)}.")
        return False
    except OSError as err:
        logging.error(f"Failed to rename the file {data_file}: {repr(err)}.")
        return False
198
199
def download_and_unzip_data_file(spec, job, build, pid):
    """Download and unzip a source file.

    The sources are tried in this order until one of them succeeds:

    1. URL[S3_STORAGE,LOG] - the file spec.input["file-name"],
    2. URL[NEXUS,LOG] - the file spec.input["file-name"],
    3. URL[NEXUS,DOC] - spec.input["file-name"] in the directory of the
       release found in the job name, then spec.input["zip-file-name"] in the
       directory "master",
    4. URL[JENKINS,CSIT] - the file spec.input["zip-file-name"]; only for
       jobs with the name starting with "csit-".

    On success, build["file-name"] is set to the name of the downloaded file
    (without ".gz" for gzipped files); zip files are then processed by
    _unzip_file.

    :param spec: Specification read from the specification file.
    :param job: Name of the Jenkins job.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :type spec: Specification
    :type job: str
    :type build: dict
    :type pid: int
    :returns: True if the download (and the unzipping, if needed) was
        successful, otherwise False.
    :rtype: bool
    :raises PresentationError: If no source listed above provides the file
        and the name of the job does not start with "csit-".
    """

    data_dir = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    build_nr = build[u"build"]
    # Common prefix of all local file names and of the names on docs.fd.io.
    prefix = f"{job}{SEPARATOR}{build_nr}{SEPARATOR}"
    arch = bool(spec.configuration.get(u"archive-inputs", True))

    success = False
    downloaded_name = None

    # Try to download .gz from s3_storage, then from logs.fd.io
    # NOTE(review): the certificate is not verified for the S3 storage -
    # confirm this is intended.
    file_name = spec.input[u"file-name"]
    log_sources = (
        (u"URL[S3_STORAGE,LOG]", False),
        (u"URL[NEXUS,LOG]", True),
    )
    for url_key, verify in log_sources:
        url = u"{0}/{1}".format(
            spec.environment[u"urls"][url_key],
            spec.input[u"download-path"].format(
                job=job, build=build_nr, filename=file_name
            )
        )
        new_name = join(data_dir, f"{prefix}{file_name}")

        logging.info(f"Trying to download {url}")

        success, downloaded_name = _download_file(
            url, new_name, arch=arch, verify=verify, repeat=3
        )
        if success:
            break

    if not success:

        # Try to download .gz or .zip from docs.fd.io
        names = (spec.input[u"file-name"], spec.input[u"zip-file-name"])
        # The .zip file is always looked for in "master"; the .gz file in the
        # release taken from the job name, if there is any.
        candidates = [(u"master", names[1])]
        match = REGEX_RELEASE.search(job)
        if match is None:
            logging.warning(
                f"No release found in the job name {job}, trying only master."
            )
        else:
            candidates.insert(0, (match.group(2), names[0]))

        for rls, candidate in candidates:
            try:
                rls = f"rls{int(rls)}"
            except ValueError:
                # It is master
                pass
            url = (
                f"{spec.environment[u'urls'][u'URL[NEXUS,DOC]']}/"
                f"{rls}/"
                f"{spec.environment[u'urls'][u'DIR[NEXUS,DOC]']}/"
                f"{prefix}{candidate}"
            )

            logging.info(f"Downloading {url}")

            new_name = join(data_dir, f"{prefix}{candidate}")
            success, downloaded_name = _download_file(url, new_name, arch=arch)
            if success:
                file_name = candidate
                if file_name.endswith(u".gz"):
                    # The file stored without ".gz" is expected to be still
                    # compressed here; replace it by its unpacked content.
                    with gzip.open(downloaded_name[:-3], u"rb") as gzip_file:
                        file_content = gzip_file.read()
                    with open(downloaded_name[:-3], u"wb") as xml_file:
                        xml_file.write(file_content)
                break

    if not success:

        # Try to download .zip from jenkins.fd.io
        file_name = spec.input[u"zip-file-name"]
        download_path = spec.input[u"zip-download-path"]
        if job.startswith(u"csit-"):
            url = spec.environment[u"urls"][u"URL[JENKINS,CSIT]"]
        else:
            raise PresentationError(f"No url defined for the job {job}.")

        full_name = download_path.format(
            job=job, build=build_nr, filename=file_name
        )
        url = u"{0}/{1}".format(url, full_name)
        new_name = join(data_dir, f"{prefix}{file_name}")

        logging.info(f"Downloading {url}")

        success, downloaded_name = _download_file(url, new_name)

    if success and downloaded_name.endswith(u".zip"):
        if not is_zipfile(downloaded_name):
            # Report the file which was really checked; its name may differ
            # from the requested one.
            logging.error(f"Zip file {downloaded_name} is corrupted.")
            success = False

    if success:
        build[u"file-name"] = downloaded_name

        if file_name.endswith(u".gz"):
            build[u"file-name"] = downloaded_name[:-3]

        if downloaded_name.endswith(u".zip"):
            success = _unzip_file(spec, build, pid)

    return success