PAL: Download .gz first
[csit.git] / resources / tools / presentation / input_data_files.py
1 # Copyright (c) 2018 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
5 #
6 #     http://www.apache.org/licenses/LICENSE-2.0
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13
14 """Inputs
15 Download all data.
16 """
17
18 import re
19 import logging
20 import gzip
21
22 from os import rename, mkdir
23 from os.path import join
24 from http.client import responses
25 from zipfile import ZipFile, is_zipfile, BadZipfile
26
27 import requests
28
29 from requests.adapters import HTTPAdapter, Retry
30 from requests.exceptions import RequestException
31 from requests import codes
32
33 from pal_errors import PresentationError
34
35
# Number of bytes requested per iteration while streaming a download to disk.
CHUNK_SIZE = 512

# Token placed between the job name, the build number and the file name when
# the name of a locally stored file is composed.
SEPARATOR = u"__"

# Matches the release token of a job name: group 2 is either a four-digit
# release number or the literal "master"; groups 1 and 3 are the non-digit
# text around it.
REGEX_RELEASE = re.compile(r"(\D*)(\d{4}|master)(\D*)")
43
44
def _download_file(url, file_name, log, arch=False):
    """Download a file with input data.

    The file is requested from ``url``. If the server does not answer with
    200 OK, one more attempt is made with every occurrence of ``_info``
    removed from the URL; when that attempt succeeds, ``_info`` is removed
    from the file name as well.

    The response body is stored under ``file_name`` without its trailing
    ``.gz`` suffix. If ``arch`` is True and ``file_name`` ends with ``.gz``,
    the URL is fetched once more and the raw response stream is stored under
    ``file_name`` itself. A failure of this second (archive) download is only
    logged; it does not change the returned success flag.

    :param url: URL to the file to download.
    :param file_name: Name of file to download.
    :param log: List of log messages.
    :param arch: If True, also .gz file is downloaded
    :type url: str
    :type file_name: str
    :type log: list of tuples (severity, msg)
    :type arch: bool
    :returns: Tuple of two items: True if the download was successful,
        otherwise False; and the file name (possibly with ``_info`` removed).
    :rtype: tuple(bool, str)
    """

    def requests_retry_session(retries=3,
                               backoff_factor=0.3,
                               status_forcelist=(500, 502, 504)):
        """Create a session which retries failed requests.

        :param retries: Total number of retries to allow.
        :param backoff_factor: A backoff factor to apply between attempts after
            the second try.
        :param status_forcelist: A set of integer HTTP status codes that are
            forced to retry.
        :type retries: int
        :type backoff_factor: float
        :type status_forcelist: iterable
        :returns: Session object.
        :rtype: requests.Session
        """

        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount(u"http://", adapter)
        session.mount(u"https://", adapter)
        return session

    gz_suffix = u".gz"
    success = False
    session = None
    try:
        log.append((u"INFO", f"    Connecting to {url} ..."))
        session = requests_retry_session()
        response = session.get(url, stream=True)
        code = response.status_code
        log.append((u"INFO", f"    {code}: {responses[code]}"))

        if code != codes[u"OK"]:
            # Second chance: the same resource without "_info" in its name.
            session.close()
            url = url.replace(u"_info", u"")
            log.append((u"INFO", f"    Connecting to {url} ..."))
            session = requests_retry_session()
            response = session.get(url, stream=True)
            code = response.status_code
            log.append((u"INFO", f"    {code}: {responses[code]}"))
            if code != codes[u"OK"]:
                # The "finally" clause below still closes the session.
                return False, file_name
            file_name = file_name.replace(u"_info", u"")

        # Only a trailing ".gz" is an extension; a ".gz" somewhere else in
        # the path (e.g. in a directory name) must stay untouched. This also
        # agrees with the caller, which strips the suffix by slicing.
        is_gz = file_name.endswith(gz_suffix)
        dst_file_name = file_name[:-len(gz_suffix)] if is_gz else file_name
        log.append(
            (u"INFO", f"    Downloading the file {url} to {dst_file_name} ...")
        )
        # NOTE(review): iter_content() applies the HTTP content decoding, so
        # presumably this file holds decompressed data when the server sends
        # "Content-Encoding: gzip" - confirm against the servers used.
        with open(dst_file_name, u"wb") as file_handle:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    file_handle.write(chunk)

        if arch and is_gz:
            # Keep the archive too: fetch again and store the raw stream.
            session.close()
            log.append(
                (u"INFO", f"    Downloading the file {url} to {file_name} ...")
            )
            session = requests_retry_session()
            response = session.get(url, stream=True)
            if response.status_code == codes[u"OK"]:
                with open(file_name, u"wb") as file_handle:
                    file_handle.write(response.raw.read())
            else:
                # Archiving is best effort, the data file is already stored.
                log.append(
                    (u"ERROR", f"Not possible to download the file {url} to "
                               f"{file_name} ...")
                )

        success = True
    except RequestException as err:
        log.append(
            (u"ERROR", f"HTTP Request exception:\n{repr(err)}")
        )
    except (IOError, ValueError, KeyError) as err:
        # KeyError covers a status code unknown to http.client.responses.
        log.append((u"ERROR", f"Download failed.\n{repr(err)}"))
    finally:
        if session is not None:
            session.close()

    log.append((u"INFO", u"    Download finished."))
    return success, file_name
151
152
def _unzip_file(spec, build, pid, log):
    """Unzip downloaded source file.

    The archive named by ``build["file-name"]`` is opened and a single member
    is extracted into a per-process sub-directory of the working data
    directory. The member is ``spec.input["zip-extract"]`` if the archive
    name contains ".zip", otherwise ``spec.input["extract"]``. The extracted
    file is then renamed to
    ``<archive name without extension><SEPARATOR><member base name>`` and
    ``build["file-name"]`` is updated to that new name.

    :param spec: Specification read form the specification file.
    :param build: Information about the build.
    :param pid: PID of the process executing this method; used as the name of
        the temporary directory the member is extracted to.
    :param log: List of log messages.
    :type spec: Specification
    :type build: dict
    :type pid: int
    :type log: list of tuples (severity, msg)
    :returns: True if the file was extracted and renamed, otherwise False.
    :rtype: bool
    """

    file_name = build[u"file-name"]
    if u".zip" in file_name:
        data_file = spec.input[u"zip-extract"]
    else:
        data_file = spec.input[u"extract"]

    directory = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    tmp_dir = join(directory, str(pid))
    try:
        mkdir(tmp_dir)
    except OSError:
        # Typically the directory already exists. Any other problem shows up
        # (and is reported) when the archive is extracted below.
        pass

    # Drop only the last extension. Splitting on every dot would cut the
    # path at the first dotted directory or job name and yield a wrong target.
    archive_stem = file_name.rsplit(u'.', 1)[0]
    new_name = f"{archive_stem}{SEPARATOR}{data_file.split(u'/')[-1]}"

    log.append((u"INFO", f"    Unzipping: {data_file} from {file_name}."))
    try:
        with ZipFile(file_name, u'r') as zip_file:
            zip_file.extract(data_file, tmp_dir)
        log.append(
            (u"INFO", f"    Renaming the file {join(tmp_dir, data_file)} to "
                      f"{new_name}")
        )
        rename(join(tmp_dir, data_file), new_name)
        build[u"file-name"] = new_name
        return True
    except (BadZipfile, RuntimeError, KeyError) as err:
        # KeyError: the requested member is not present in the archive.
        log.append(
            (u"ERROR", f"Failed to unzip the file {file_name}: {repr(err)}.")
        )
        return False
    except OSError as err:
        log.append(
            (u"ERROR", f"Failed to rename the file {data_file}: {repr(err)}.")
        )
        return False
202
203
def download_and_unzip_data_file(spec, job, build, pid, log):
    """Download and unzip a source file.

    The sources are tried in this order and the first success wins:

    1. ``spec.input["file-name"]`` from ``URL[NEXUS,LOG]``.
    2. ``URL[NEXUS,DOC]``: ``spec.input["file-name"]`` from the directory of
       the job's release, then ``spec.input["zip-file-name"]`` from the
       ``master`` directory. A downloaded ``.gz`` file is decompressed in
       place.
    3. ``spec.input["zip-file-name"]`` from Jenkins (``URL[JENKINS,CSIT]`` or
       ``URL[JENKINS,HC]``, chosen by the job name prefix).

    On success ``build["file-name"]`` is set to the local data file; a zip
    archive is verified and the data file is extracted from it.

    :param spec: Specification read form the specification file.
    :param job: Name of the Jenkins job.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :param log: List of log messages.
    :type spec: Specification
    :type job: str
    :type build: dict
    :type pid: int
    :type log: list of tuples (severity, msg)
    :returns: True if the download was successful, otherwise False.
    :rtype: bool
    :raises PresentationError: If the Jenkins download is needed and the job
        name starts neither with "csit-" nor with "hc2vpp-".
    """

    urls = spec.environment[u"urls"]
    build_nr = build[u"build"]

    def local_path(name):
        """Compose the path under which the file ``name`` is stored locally.

        :param name: Name of the file on the server.
        :type name: str
        :returns: Path in the working data directory.
        :rtype: str
        """
        return join(
            spec.environment[u"paths"][u"DIR[WORKING,DATA]"],
            f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{name}"
        )

    # Try to download .gz from logs.fd.io

    file_name = spec.input[u"file-name"]
    url = u"{0}/{1}".format(
        urls[u"URL[NEXUS,LOG]"],
        spec.input[u"download-path"].format(
            job=job, build=build_nr, filename=file_name
        )
    )

    logging.info(u"Trying to download %s", url)

    arch = bool(spec.configuration.get(u"archive-inputs", True))
    success, downloaded_name = _download_file(
        url, local_path(file_name), log, arch=arch
    )

    if not success:

        # Try to download .gz or .zip from docs.fd.io
        candidates = (spec.input[u"file-name"], spec.input[u"zip-file-name"])
        # A job name without a release token must not crash the download;
        # such a job is treated as a "master" job.
        match = REGEX_RELEASE.search(job)
        release = match.group(2) if match else u"master"
        # The .gz file is looked up in the release directory, the .zip file
        # in the master directory.
        for candidate, rls in zip(candidates, (release, u"master")):
            try:
                rls = f"rls{int(rls)}"
            except ValueError:
                # It is master
                pass
            url = (
                f"{urls[u'URL[NEXUS,DOC]']}/"
                f"{rls}/"
                f"{urls[u'DIR[NEXUS,DOC]']}/"
                f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{candidate}"
            )

            logging.info(u"Downloading %s", url)

            success, downloaded_name = _download_file(
                url, local_path(candidate), log, arch=arch
            )
            if success:
                file_name = candidate
                if file_name.endswith(u".gz"):
                    # _download_file stored the data without the ".gz"
                    # suffix; replace it by its decompressed content.
                    # NOTE(review): assumes this server delivers the file
                    # still gzip-compressed - confirm.
                    unpacked_name = downloaded_name[:-3]
                    with gzip.open(unpacked_name, u"rb") as gzip_file:
                        file_content = gzip_file.read()
                    with open(unpacked_name, u"wb") as xml_file:
                        xml_file.write(file_content)
                break

    if not success:

        # Try to download .zip from jenkins.fd.io
        file_name = spec.input[u"zip-file-name"]
        download_path = spec.input[u"zip-download-path"]
        if job.startswith(u"csit-"):
            url = urls[u"URL[JENKINS,CSIT]"]
        elif job.startswith(u"hc2vpp-"):
            url = urls[u"URL[JENKINS,HC]"]
        else:
            raise PresentationError(f"No url defined for the job {job}.")

        full_name = download_path.format(
            job=job, build=build_nr, filename=file_name
        )
        url = u"{0}/{1}".format(url, full_name)

        logging.info(u"Downloading %s", url)

        success, downloaded_name = _download_file(
            url, local_path(file_name), log
        )

    if success and downloaded_name.endswith(u".zip"):
        if not is_zipfile(downloaded_name):
            # Report the file which was actually checked.
            log.append(
                (u"ERROR", f"Zip file {downloaded_name} is corrupted.")
            )
            success = False

    if success:
        build[u"file-name"] = downloaded_name

        if file_name.endswith(u".gz"):
            # The usable data file is the one without the ".gz" suffix.
            build[u"file-name"] = downloaded_name[:-3]

        if downloaded_name.endswith(u".zip"):
            success = _unzip_file(spec, build, pid, log)

    return success