63f91972e526ab75b8843d1818d94da41b296193
[csit.git] / resources / tools / presentation / input_data_files.py
1 # Copyright (c) 2021 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
5 #
6 #     http://www.apache.org/licenses/LICENSE-2.0
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13
14 """Inputs
15 Download all data.
16 """
17
18 import re
19 import logging
20 import gzip
21
22 from os import rename, mkdir
23 from os.path import join
24 from http.client import responses, HTTPException
25 from zipfile import ZipFile, is_zipfile, BadZipfile
26
27 import requests
28
29 from requests.adapters import HTTPAdapter, Retry
30 from requests.exceptions import RequestException
31 from requests import codes
32
33 from urllib3.exceptions import HTTPError
34
35 from pal_errors import PresentationError
36
37
# Chunk size used for file download; it is passed as chunk_size to
# response.iter_content() when the downloaded data is written to disk.
CHUNK_SIZE = 512

# Separator used in file names; it joins the job name, the build number and
# the name of the data file.
SEPARATOR = u"__"

# Pattern applied to the name of a Jenkins job to find the release.
# Group 2 is either four consecutive digits or the string "master", groups 1
# and 3 are the runs of non-digit characters directly around it.
REGEX_RELEASE = re.compile(r'(\D*)(\d{4}|master)(\D*)')
45
46
def _download_file(url, file_name, arch=False, verify=True, repeat=1):
    """Download a file with input data.

    Each attempt first requests the URL as given. If the server does not
    answer with 200 OK, the URL with every "_info" substring removed is
    requested instead and, if that succeeds, "_info" is removed from the
    name of the destination file as well. If neither request is answered
    with 200 OK, the function gives up immediately without further
    attempts.

    The data is written to the destination name with ".gz" removed. If arch
    is True and the destination name includes ".gz", the URL is requested
    once more and the raw response is stored under the full destination
    name.

    :param url: URL to the file to download.
    :param file_name: Name of file to download.
    :param arch: If True, also .gz file is downloaded.
    :param verify: If true, verify the certificate.
    :param repeat: The number of attempts to download the file.
    :type url: str
    :type file_name: str
    :type arch: bool
    :type verify: bool
    :type repeat: int
    :returns: Tuple of two items: True if the download was successful,
        otherwise False; and the name of the destination file (it differs
        from file_name if the download fell back to the URL without
        "_info").
    :rtype: tuple(bool, str)
    """

    def requests_retry_session(retries=3,
                               backoff_factor=0.3,
                               status_forcelist=(500, 502, 504)):
        """Create a session which retries failed requests.

        The same retry policy is mounted for both http:// and https://.

        :param retries: Total number of retries to allow.
        :param backoff_factor: A backoff factor to apply between attempts after
            the second try.
        :param status_forcelist: A set of integer HTTP status codes that are
            forced to retry.
        :type retries: int
        :type backoff_factor: float
        :type status_forcelist: iterable
        :returns: Session object.
        :rtype: requests.Session
        """

        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount(u"http://", adapter)
        session.mount(u"https://", adapter)
        return session

    def log_status(code):
        """Log the HTTP status code with its standard reason phrase.

        :param code: HTTP status code.
        :type code: int
        """
        # A non-standard status code has no entry in http.client.responses;
        # it must not abort the attempt with KeyError.
        logging.info(f"    {code}: {responses.get(code, u'Unknown status')}")

    success = False
    while repeat:
        repeat -= 1
        session = None
        # Every attempt starts from the URL and the name given by the caller
        # so the name of the stored file always corresponds to the URL which
        # was really downloaded, even if a previous attempt failed half-way.
        src_url = url
        dst_name = file_name
        try:
            logging.info(f"    Connecting to {src_url} ...")
            session = requests_retry_session()
            response = session.get(src_url, stream=True, verify=verify)
            code = response.status_code
            log_status(code)

            if code != codes[u"OK"]:
                # Fall back to the URL without "_info".
                session.close()
                src_url = url.replace(u"_info", u"")
                logging.info(f"    Connecting to {src_url} ...")
                session = requests_retry_session()
                response = session.get(src_url, stream=True, verify=verify)
                code = response.status_code
                log_status(code)
                if code != codes[u"OK"]:
                    # Neither URL is available, more attempts are pointless.
                    return False, file_name
                dst_name = file_name.replace(u"_info", u"")

            plain_name = dst_name.replace(u".gz", u"")
            logging.info(
                f"    Downloading the file {src_url} to {plain_name}"
            )
            with open(plain_name, u"wb") as file_handle:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        file_handle.write(chunk)

            if arch and u".gz" in dst_name:
                # Request the file again and keep the raw response under the
                # name including ".gz".
                session.close()
                logging.info(
                    f"    Downloading the file {src_url} to {dst_name}"
                )
                session = requests_retry_session()
                response = session.get(src_url, stream=True, verify=verify)
                if response.status_code == codes[u"OK"]:
                    with open(dst_name, u"wb") as file_handle:
                        file_handle.write(response.raw.read())
                else:
                    # Failure to store the archive does not fail the download.
                    logging.error(
                        f"Not possible to download the file "
                        f"{src_url} to {dst_name}"
                    )

            success = True
            file_name = dst_name
            repeat = 0
        except (HTTPException, HTTPError) as err:
            logging.error(f"Connection broken:\n{repr(err)}")
        except RequestException as err:
            logging.error(f"HTTP Request exception:\n{repr(err)}")
        except (IOError, ValueError, KeyError) as err:
            logging.error(f"Download failed.\n{repr(err)}")
        finally:
            if session is not None:
                session.close()

    logging.info(u"    Download finished.")
    return success, file_name
154
155
def _unzip_file(spec, build, pid):
    """Unzip downloaded source file.

    The data file is extracted from the archive build["file-name"] to a
    directory named by pid inside DIR[WORKING,DATA]. It is then renamed to
    the name of the archive without its extension, followed by the separator
    and the base name of the data file. On success build["file-name"] is set
    to the new name.

    :param spec: Specification read from the specification file.
    :param build: Information about the build.
    :param pid: PID of the process executing this method; it is used as the
        name of the temporary directory.
    :type spec: Specification
    :type build: dict
    :type pid: int
    :returns: True if the file was extracted and renamed, otherwise False.
    :rtype: bool
    """

    file_name = build[u"file-name"]
    if u".zip" in file_name:
        data_file = spec.input[u"zip-extract"]
    else:
        data_file = spec.input[u"extract"]

    directory = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    tmp_dir = join(directory, str(pid))
    try:
        mkdir(tmp_dir)
    except OSError:
        # Best effort: the directory typically exists already. Any other
        # problem shows up as OSError during the extraction below.
        pass

    # Strip only the last extension; a dot anywhere else in the path must not
    # cut the name.
    archive_stem = file_name.rsplit(u'.', 1)[0]
    new_name = f"{archive_stem}{SEPARATOR}{data_file.split(u'/')[-1]}"

    logging.info(f"    Unzipping: {data_file} from {file_name}.")
    try:
        with ZipFile(file_name, u'r') as zip_file:
            zip_file.extract(data_file, tmp_dir)
        logging.info(
            f"    Renaming the file {join(tmp_dir, data_file)} to {new_name}"
        )
        rename(join(tmp_dir, data_file), new_name)
        build[u"file-name"] = new_name
        return True
    except (BadZipfile, RuntimeError, KeyError) as err:
        # KeyError is raised by ZipFile when data_file is not in the archive.
        logging.error(f"Failed to unzip the file {file_name}: {repr(err)}.")
        return False
    except OSError as err:
        logging.error(f"Failed to rename the file {data_file}: {repr(err)}.")
        return False
198
199
def download_and_unzip_data_file(spec, job, build, pid):
    """Download and unzip a source file.

    The sources are tried in this order until one of them succeeds:

    1. URL[S3_STORAGE,LOG] with each item of "download-path" (the certificate
       is not verified),
    2. URL[NEXUS,LOG] with each item of "download-path",
    3. URL[NEXUS,DOC]: "file-name" from the release found in the job name,
       then "zip-file-name" from master.

    On success build["file-name"] is set to the name of the local data file.
    A downloaded .zip file is checked and unzipped.

    :param spec: Specification read from the specification file.
    :param job: Name of the Jenkins job.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :type spec: Specification
    :type job: str
    :type build: dict
    :type pid: int
    :returns: True if the download was successful, otherwise False.
    :rtype: bool
    """

    data_dir = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    urls = spec.environment[u"urls"]
    build_nr = build[u"build"]

    file_name = spec.input[u"file-name"]
    new_name = join(
        data_dir, f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{file_name}"
    )
    arch = bool(spec.configuration.get(u"archive-inputs", True))

    success = False
    downloaded_name = u""

    # Try to download .gz from s3_storage, then from logs.fd.io. Only the
    # base URL and the verification of the certificate differ.
    log_sources = (
        (u"URL[S3_STORAGE,LOG]", False),
        (u"URL[NEXUS,LOG]", True),
    )
    for url_key, verify in log_sources:
        for path in spec.input[u"download-path"]:
            url = u"{0}/{1}".format(
                urls[url_key],
                path.format(job=job, build=build_nr, filename=file_name)
            )
            logging.info(f"Trying to download {url}")
            success, downloaded_name = _download_file(
                url, new_name, arch=arch, verify=verify, repeat=3
            )
            if success:
                break
        if success:
            break

    if not success:
        # Try to download .gz or .zip from docs.fd.io
        # Each candidate is a pair (release, name of the file).
        candidates = list()
        release = REGEX_RELEASE.search(job)
        if release is None:
            # Without a release in the job name only master can be tried.
            logging.error(f"No release found in the name of the job {job}.")
        else:
            candidates.append((release.group(2), spec.input[u"file-name"]))
        candidates.append((u"master", spec.input[u"zip-file-name"]))

        for rls, candidate in candidates:
            try:
                rls = f"rls{int(rls)}"
            except ValueError:
                # It is master
                pass
            remote_name = f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{candidate}"
            url = (
                f"{urls[u'URL[NEXUS,DOC]']}/"
                f"{rls}/"
                f"{urls[u'DIR[NEXUS,DOC]']}/"
                f"{remote_name}"
            )

            logging.info(f"Downloading {url}")

            new_name = join(data_dir, remote_name)
            success, downloaded_name = _download_file(url, new_name, arch=arch)
            if not success:
                continue

            if candidate.endswith(u".gz"):
                # The file stored without ".gz" is still compressed here;
                # replace it with its decompressed content.
                try:
                    with gzip.open(downloaded_name[:-3], u"rb") as gzip_file:
                        file_content = gzip_file.read()
                except (OSError, EOFError) as err:
                    # Treat an unreadable gzip file the same way as
                    # a corrupted zip file: the download failed.
                    logging.error(
                        f"Failed to decompress the file "
                        f"{downloaded_name[:-3]}: {repr(err)}"
                    )
                    success = False
                    continue
                with open(downloaded_name[:-3], u"wb") as xml_file:
                    xml_file.write(file_content)

            file_name = candidate
            break

    # NOTE: Downloading the .zip from jenkins.fd.io is not supported here.

    if success and downloaded_name.endswith(u".zip"):
        if not is_zipfile(downloaded_name):
            logging.error(f"Zip file {downloaded_name} is corrupted.")
            success = False

    if success:
        build[u"file-name"] = downloaded_name

        if file_name.endswith(u".gz"):
            # The data was stored under the name without ".gz".
            build[u"file-name"] = downloaded_name[:-3]

        if downloaded_name.endswith(u".zip"):
            success = _unzip_file(spec, build, pid)

    return success