023d52a0d6de5be49d1eb4b4a5eddecf5bf8af08
[csit.git] / resources / tools / presentation / input_data_files.py
1 # Copyright (c) 2020 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
5 #
6 #     http://www.apache.org/licenses/LICENSE-2.0
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13
14 """Inputs
15 Download all data.
16 """
17
18 import re
19 import logging
20 import gzip
21
22 from os import rename, mkdir
23 from os.path import join
24 from http.client import responses, IncompleteRead
25 from zipfile import ZipFile, is_zipfile, BadZipfile
26
27 import requests
28
29 from requests.adapters import HTTPAdapter, Retry
30 from requests.exceptions import RequestException
31 from requests import codes
32
33 from pal_errors import PresentationError
34
35
# Number of bytes requested per chunk while streaming a download to disk
CHUNK_SIZE = 512

# Delimiter placed between the individual parts of generated file names
SEPARATOR = "__"

# Finds the release in a job name; group 2 is either four digits or "master"
REGEX_RELEASE = re.compile(r"(\D*)(\d{4}|master)(\D*)")
43
44
45 def _download_file(url, file_name, arch=False, verify=True, repeat=1):
46     """Download a file with input data.
47
48     :param url: URL to the file to download.
49     :param file_name: Name of file to download.
50     :param arch: If True, also .gz file is downloaded.
51     :param verify: If true, verify the certificate.
52     :param repeat: The number of attempts to download the file.
53     :type url: str
54     :type file_name: str
55     :type arch: bool
56     :type verify: bool
57     :type repeat: int
58     :returns: True if the download was successful, otherwise False.
59     :rtype: bool
60     """
61
62     def requests_retry_session(retries=3,
63                                backoff_factor=0.3,
64                                status_forcelist=(500, 502, 504)):
65         """
66
67         :param retries: Total number of retries to allow.
68         :param backoff_factor: A backoff factor to apply between attempts after
69             the second try.
70         :param status_forcelist: A set of integer HTTP status codes that are
71             forced to retry.
72         :type retries: int
73         :type backoff_factor: float
74         :type status_forcelist: iterable
75         :returns: Session object.
76         :rtype: requests.Session
77         """
78
79         retry = Retry(
80             total=retries,
81             read=retries,
82             connect=retries,
83             backoff_factor=backoff_factor,
84             status_forcelist=status_forcelist,
85         )
86         adapter = HTTPAdapter(max_retries=retry)
87         session = requests.Session()
88         session.mount(u"http://", adapter)
89         session.mount(u"https://", adapter)
90         return session
91
92     while repeat:
93         repeat -= 1
94         success = False
95         session = None
96         try:
97             logging.info(f"    Connecting to {url} ...")
98             session = requests_retry_session()
99             response = session.get(url, stream=True, verify=verify)
100             code = response.status_code
101             logging.info(f"    {code}: {responses[code]}")
102
103             if code != codes[u"OK"]:
104                 if session:
105                     session.close()
106                 url = url.replace(u"_info", u"")
107                 logging.info(f"    Connecting to {url} ...")
108                 session = requests_retry_session()
109                 response = session.get(url, stream=True, verify=verify)
110                 code = response.status_code
111                 logging.info(f"    {code}: {responses[code]}")
112                 if code != codes[u"OK"]:
113                     return False, file_name
114                 file_name = file_name.replace(u"_info", u"")
115
116             dst_file_name = file_name.replace(u".gz", u"")
117             logging.info(f"    Downloading the file {url} to {dst_file_name}")
118             with open(dst_file_name, u"wb") as file_handle:
119                 for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
120                     if chunk:
121                         file_handle.write(chunk)
122
123             if arch and u".gz" in file_name:
124                 if session:
125                     session.close()
126                 logging.info(f"    Downloading the file {url} to {file_name}")
127                 session = requests_retry_session()
128                 response = session.get(url, stream=True, verify=verify)
129                 if response.status_code == codes[u"OK"]:
130                     with open(file_name, u"wb") as file_handle:
131                         file_handle.write(response.raw.read())
132                 else:
133                     logging.error(
134                         f"Not possible to download the file "
135                         f"{url} to {file_name}"
136                     )
137
138             success = True
139             repeat = 0
140         except IncompleteRead as err:
141             logging.error(f"Connection broken:\n{repr(err)}")
142         except RequestException as err:
143             logging.error(f"HTTP Request exception:\n{repr(err)}")
144         except (IOError, ValueError, KeyError) as err:
145             logging.error(f"Download failed.\n{repr(err)}")
146         finally:
147             if session:
148                 session.close()
149
150     logging.info(u"    Download finished.")
151     return success, file_name
152
153
def _unzip_file(spec, build, pid):
    """Unzip downloaded source file.

    The requested member is extracted to a directory named after the PID and
    then moved next to the archive. On success, build["file-name"] is updated
    to the name of the extracted file.

    :param spec: Specification read form the specification file.
    :param build: Information about the build.
    :param pid: PID of the process executing this method; used as the name of
        the temporary directory the archive member is extracted to.
    :type spec: Specification
    :type build: dict
    :type pid: int
    :returns: True if the file was unzipped and renamed, otherwise False.
    :rtype: bool
    """

    file_name = build[u"file-name"]
    if u".zip" in file_name:
        data_file = spec.input[u"zip-extract"]
    else:
        data_file = spec.input[u"extract"]

    directory = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    tmp_dir = join(directory, str(pid))
    try:
        mkdir(tmp_dir)
    except FileExistsError:
        # Left over from a previous call with the same PID.
        pass
    except OSError as err:
        # Best effort only; a real problem shows up during the extraction.
        logging.warning(
            f"Not possible to create the directory {tmp_dir}: {repr(err)}."
        )

    # Strip only the last extension; dots elsewhere in the path (e.g. "./")
    # must not influence the target name.
    new_name = (
        f"{file_name.rsplit(u'.', 1)[0]}{SEPARATOR}"
        f"{data_file.split(u'/')[-1]}"
    )

    logging.info(f"    Unzipping: {data_file} from {file_name}.")
    try:
        with ZipFile(file_name, u'r') as zip_file:
            zip_file.extract(data_file, tmp_dir)
    except (BadZipfile, RuntimeError, KeyError, OSError) as err:
        # KeyError: the archive does not contain the requested member.
        # OSError: the archive cannot be opened or the member written.
        logging.error(f"Failed to unzip the file {file_name}: {repr(err)}.")
        return False

    extracted = join(tmp_dir, data_file)
    logging.info(f"    Renaming the file {extracted} to {new_name}")
    try:
        rename(extracted, new_name)
    except OSError as err:
        logging.error(f"Failed to rename the file {data_file}: {repr(err)}.")
        return False

    build[u"file-name"] = new_name
    return True
196
197
def download_and_unzip_data_file(spec, job, build, pid):
    """Download and unzip a source file.

    The sources are tried in this order until one of them succeeds:

    1. the S3 storage log URL (certificate not verified, three attempts),
    2. the Nexus log URL,
    3. the Nexus docs URL: the .gz file for the release found in the job
       name, then the .zip file for master,
    4. the Jenkins URL (.zip file, only for jobs starting with "csit-").

    On success, build["file-name"] is set to the name of the local file.

    :param spec: Specification read form the specification file.
    :param job: Name of the Jenkins job.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :type spec: Specification
    :type job: str
    :type build: dict
    :type pid: int
    :returns: True if the download (and the unzipping, in the case of a .zip
        file) was successful, otherwise False.
    :rtype: bool
    :raises PresentationError: If all other sources failed and there is no
        Jenkins URL defined for the job.
    """

    build_nr = build[u"build"]
    urls = spec.environment[u"urls"]
    data_dir = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    arch = bool(spec.configuration.get(u"archive-inputs", True))

    def local_path(name):
        """Return the path the file is stored at in the data directory."""
        return join(
            data_dir, f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{name}"
        )

    def remote_url(base, path_template, name):
        """Return the URL of the file below the given base URL."""
        return u"{0}/{1}".format(
            base,
            path_template.format(job=job, build=build_nr, filename=name)
        )

    # Try to download .gz from s3_storage
    file_name = spec.input[u"file-name"]
    url = remote_url(
        urls[u"URL[S3_STORAGE,LOG]"], spec.input[u"download-path"], file_name
    )
    new_name = local_path(file_name)
    logging.info(f"Trying to download {url}")
    success, downloaded_name = _download_file(
        url, new_name, arch=arch, verify=False, repeat=3
    )

    if not success:
        # Try to download .gz from logs.fd.io
        url = remote_url(
            urls[u"URL[NEXUS,LOG]"], spec.input[u"download-path"], file_name
        )
        logging.info(f"Trying to download {url}")
        success, downloaded_name = _download_file(url, new_name, arch=arch)

    if not success:
        # Try to download .gz or .zip from docs.fd.io
        candidates = list()
        release = REGEX_RELEASE.search(job)
        if release is not None:
            # The release directory is tried only when the job name carries
            # a release; a job name without one must not crash the download.
            candidates.append((release.group(2), spec.input[u"file-name"]))
        candidates.append((u"master", spec.input[u"zip-file-name"]))

        for rls, name in candidates:
            try:
                rls = f"rls{int(rls)}"
            except ValueError:
                # It is master
                pass
            url = (
                f"{urls[u'URL[NEXUS,DOC]']}/"
                f"{rls}/"
                f"{urls[u'DIR[NEXUS,DOC]']}/"
                f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{name}"
            )
            logging.info(f"Downloading {url}")

            new_name = local_path(name)
            success, downloaded_name = _download_file(url, new_name, arch=arch)
            if success:
                file_name = name
                if file_name.endswith(u".gz"):
                    # The file stored without the ".gz" suffix is read as
                    # gzip data and rewritten in place decompressed.
                    unpacked_name = downloaded_name[:-3]
                    with gzip.open(unpacked_name, u"rb") as gzip_file:
                        file_content = gzip_file.read()
                    with open(unpacked_name, u"wb") as xml_file:
                        xml_file.write(file_content)
                break

    if not success:
        # Try to download .zip from jenkins.fd.io
        file_name = spec.input[u"zip-file-name"]
        download_path = spec.input[u"zip-download-path"]
        if not job.startswith(u"csit-"):
            raise PresentationError(f"No url defined for the job {job}.")

        url = remote_url(urls[u"URL[JENKINS,CSIT]"], download_path, file_name)
        new_name = local_path(file_name)
        logging.info(f"Downloading {url}")
        success, downloaded_name = _download_file(url, new_name)

    if success and downloaded_name.endswith(u".zip"):
        if not is_zipfile(downloaded_name):
            # Report the file which was actually checked.
            logging.error(f"Zip file {downloaded_name} is corrupted.")
            success = False

    if success:
        build[u"file-name"] = downloaded_name

        if file_name.endswith(u".gz"):
            # The data is stored locally without the ".gz" suffix.
            build[u"file-name"] = downloaded_name[:-3]

        if downloaded_name.endswith(u".zip"):
            success = _unzip_file(spec, build, pid)

    return success