47249b1323beacde9a469b92f7083e12a5404399
[csit.git] / resources / tools / presentation / input_data_files.py
1 # Copyright (c) 2020 Cisco and/or its affiliates.
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at:
5 #
6 #     http://www.apache.org/licenses/LICENSE-2.0
7 #
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13
14 """Inputs
15 Download all data.
16 """
17
18 import re
19 import logging
20 import gzip
21
22 from os import rename, mkdir
23 from os.path import join
24 from http.client import responses
25 from zipfile import ZipFile, is_zipfile, BadZipfile
26
27 import requests
28
29 from requests.adapters import HTTPAdapter, Retry
30 from requests.exceptions import RequestException
31 from requests import codes
32
33 from pal_errors import PresentationError
34
35
# Chunk size used for file download; it is passed as chunk_size to
# response.iter_content() in _download_file.
CHUNK_SIZE = 512

# Separator used in file names; it is placed between the job name, the build
# number and the file name when local file names are composed.
SEPARATOR = u"__"

# Pattern searched in the name of a Jenkins job. Its second group (four digits
# or the word "master") is used as the release by
# download_and_unzip_data_file.
REGEX_RELEASE = re.compile(r'(\D*)(\d{4}|master)(\D*)')
43
44
def _download_file(url, file_name, arch=False):
    """Download a file with input data.

    The URL is requested as given first. If the response code is not OK, the
    sub-string "_info" is removed from the URL and the request is repeated;
    when the second request succeeds, "_info" is removed from the file name
    too, when it fails, the function returns immediately.

    The content of the response is written to the file named as file_name
    with ".gz" removed. If arch is True and file_name includes ".gz", the URL
    is requested once more and the data read from the raw response is written
    to file_name.

    :param url: URL to the file to download.
    :param file_name: Name of file to download.
    :param arch: If True, also .gz file is downloaded
    :type url: str
    :type file_name: str
    :type arch: bool
    :returns: Tuple of two items: True if the download was successful,
        otherwise False; and the name of the file (possibly with "_info"
        removed).
    :rtype: tuple(bool, str)
    """

    def requests_retry_session(retries=3,
                               backoff_factor=0.3,
                               status_forcelist=(500, 502, 504)):
        """Create a session which retries failed requests.

        :param retries: Total number of retries to allow.
        :param backoff_factor: A backoff factor to apply between attempts after
            the second try.
        :param status_forcelist: A set of integer HTTP status codes that are
            forced to retry.
        :type retries: int
        :type backoff_factor: float
        :type status_forcelist: iterable
        :returns: Session object.
        :rtype: requests.Session
        """

        adapter = HTTPAdapter(
            max_retries=Retry(
                total=retries,
                read=retries,
                connect=retries,
                backoff_factor=backoff_factor,
                status_forcelist=status_forcelist,
            )
        )
        new_session = requests.Session()
        for prefix in (u"http://", u"https://"):
            new_session.mount(prefix, adapter)
        return new_session

    client = None

    def open_stream(address):
        """Request the address as a stream using a newly created session.

        The session is stored in the enclosing scope before the request is
        sent, so it is closed at the end even if the request fails.

        :param address: URL to request.
        :type address: str
        :returns: Response to the request.
        :rtype: requests.Response
        """
        nonlocal client
        client = requests_retry_session()
        return client.get(address, stream=True)

    def report_status(reply):
        """Log the status code of the response together with its description.

        :param reply: Response to inspect.
        :type reply: requests.Response
        :returns: Status code of the response.
        :rtype: int
        """
        status = reply.status_code
        logging.info(f"    {status}: {responses[status]}")
        return status

    succeeded = False
    try:
        logging.info(f"    Connecting to {url} ...")
        reply = open_stream(url)

        if report_status(reply) != codes[u"OK"]:
            # Second attempt: the same location without "_info".
            client.close()
            url = url.replace(u"_info", u"")
            logging.info(f"    Connecting to {url} ...")
            reply = open_stream(url)
            if report_status(reply) != codes[u"OK"]:
                return False, file_name
            file_name = file_name.replace(u"_info", u"")

        target_name = file_name.replace(u".gz", u"")
        logging.info(f"    Downloading the file {url} to {target_name} ...")
        with open(target_name, u"wb") as out_file:
            # Empty chunks are skipped.
            for piece in filter(
                    None, reply.iter_content(chunk_size=CHUNK_SIZE)):
                out_file.write(piece)

        if arch and u".gz" in file_name:
            # Keep also the archive: request the file again and store the
            # data read from the raw response.
            client.close()
            logging.info(f"    Downloading the file {url} to {file_name} ...")
            reply = open_stream(url)
            if reply.status_code != codes[u"OK"]:
                logging.error(
                    f"Not possible to download the file {url} to {file_name}"
                )
            else:
                with open(file_name, u"wb") as out_file:
                    out_file.write(reply.raw.read())

        succeeded = True
    except RequestException as err:
        logging.error(f"HTTP Request exception:\n{err!r}")
    except (IOError, ValueError, KeyError) as err:
        logging.error(f"Download failed.\n{err!r}")
    finally:
        if client is not None:
            client.close()

    logging.info(u"    Download finished.")
    return succeeded, file_name
142
143
def _unzip_file(spec, build, pid):
    """Unzip downloaded source file.

    The data file is extracted from the archive build["file-name"] to
    a temporary directory named by pid, then it is moved next to the archive
    under the name "<archive name without extension><SEPARATOR><data file>".
    On success, build["file-name"] is set to this new name.

    :param spec: Specification read form the specification file.
    :param build: Information about the build.
    :param pid: PID of the process executing this method; used as the name of
        the temporary directory.
    :type spec: Specification
    :type build: dict
    :type pid: int
    :returns: True if the data file was extracted and renamed, otherwise
        False.
    :rtype: bool
    """

    file_name = build[u"file-name"]
    if u".zip" in file_name:
        data_file = spec.input[u"zip-extract"]
    else:
        data_file = spec.input[u"extract"]

    directory = spec.environment[u"paths"][u"DIR[WORKING,DATA]"]
    tmp_dir = join(directory, str(pid))
    try:
        mkdir(tmp_dir)
    except OSError:
        # Best effort: the directory may already exist. If it could not be
        # created at all, the extraction below fails and is reported there.
        pass

    # Remove only the last extension. Splitting on every dot would drop
    # the beginning of the path whenever a directory or the job name includes
    # a dot.
    base_name = file_name.rsplit(u".", 1)[0]
    new_name = f"{base_name}{SEPARATOR}{data_file.split(u'/')[-1]}"

    logging.info(f"    Unzipping: {data_file} from {file_name}.")
    try:
        with ZipFile(file_name, u'r') as zip_file:
            zip_file.extract(data_file, tmp_dir)
        logging.info(
            f"    Renaming the file {join(tmp_dir, data_file)} to {new_name}"
        )
        rename(join(tmp_dir, data_file), new_name)
        build[u"file-name"] = new_name
        return True
    except (BadZipfile, RuntimeError, KeyError) as err:
        # KeyError is raised by ZipFile.extract if the data file is not in
        # the archive.
        logging.error(f"Failed to unzip the file {file_name}: {repr(err)}.")
        return False
    except OSError as err:
        logging.error(f"Failed to rename the file {data_file}: {repr(err)}.")
        return False
186
187
def download_and_unzip_data_file(spec, job, build, pid):
    """Download and unzip a source file.

    The sources are tried in this order until one of them provides the file:

    1. URL[S3_STORAGE,LOG] (.gz file),
    2. URL[NEXUS,LOG] (.gz file),
    3. URL[NEXUS,DOC]: the release found in the job name with the .gz file,
       then "master" with the .zip file,
    4. URL[JENKINS,CSIT] (.zip file).

    On success, build["file-name"] is set to the name of the downloaded file
    (without the trailing ".gz" if the requested file is a .gz file). A .zip
    file is checked and the data file is extracted from it.

    :param spec: Specification read form the specification file.
    :param job: Name of the Jenkins job.
    :param build: Information about the build.
    :param pid: PID of the process executing this method.
    :type spec: Specification
    :type job: str
    :type build: dict
    :type pid: int
    :returns: True if the download was successful, otherwise False.
    :rtype: bool
    :raises PresentationError: If the file must be downloaded from Jenkins and
        the name of the job does not start with "csit-".
    """

    urls = spec.environment[u"urls"]
    build_nr = build[u"build"]
    arch = bool(spec.configuration.get(u"archive-inputs", True))

    def local_name(name):
        """Compose the path of the local copy of the file.

        :param name: Name of the downloaded file.
        :type name: str
        :returns: Path to the file in the working data directory.
        :rtype: str
        """
        return join(
            spec.environment[u"paths"][u"DIR[WORKING,DATA]"],
            f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{name}"
        )

    success = False
    downloaded_name = None

    # Try to download .gz from s3_storage, then from logs.fd.io
    file_name = spec.input[u"file-name"]
    for url_key in (u"URL[S3_STORAGE,LOG]", u"URL[NEXUS,LOG]"):
        url = u"{0}/{1}".format(
            urls[url_key],
            spec.input[u"download-path"].format(
                job=job, build=build_nr, filename=file_name
            )
        )

        logging.info(f"Trying to download {url}")

        success, downloaded_name = _download_file(
            url, local_name(file_name), arch=arch
        )
        if success:
            break

    if not success:

        # Try to download .gz or .zip from docs.fd.io
        candidates = [(u"master", spec.input[u"zip-file-name"])]
        release = REGEX_RELEASE.search(job)
        if release:
            candidates.insert(0, (release.group(2), spec.input[u"file-name"]))
        else:
            # Without a release in the job name, only "master" can be tried.
            logging.warning(f"No release found in the name of the job {job}.")

        for rls, candidate in candidates:
            try:
                rls = f"rls{int(rls)}"
            except ValueError:
                # It is master
                pass
            url = (
                f"{urls[u'URL[NEXUS,DOC]']}/"
                f"{rls}/"
                f"{urls[u'DIR[NEXUS,DOC]']}/"
                f"{job}{SEPARATOR}{build_nr}{SEPARATOR}{candidate}"
            )

            logging.info(f"Downloading {url}")

            success, downloaded_name = _download_file(
                url, local_name(candidate), arch=arch
            )
            if success:
                file_name = candidate
                if file_name.endswith(u".gz"):
                    # The stored file is treated as gzip-compressed; it is
                    # replaced by its decompressed content.
                    with gzip.open(downloaded_name[:-3], u"rb") as gzip_file:
                        file_content = gzip_file.read()
                    with open(downloaded_name[:-3], u"wb") as xml_file:
                        xml_file.write(file_content)
                break

    if not success:

        # Try to download .zip from jenkins.fd.io
        file_name = spec.input[u"zip-file-name"]
        download_path = spec.input[u"zip-download-path"]
        if not job.startswith(u"csit-"):
            raise PresentationError(f"No url defined for the job {job}.")

        full_name = download_path.format(
            job=job, build=build_nr, filename=file_name
        )
        url = u"{0}/{1}".format(urls[u"URL[JENKINS,CSIT]"], full_name)

        logging.info(f"Downloading {url}")

        success, downloaded_name = _download_file(url, local_name(file_name))

    if success and downloaded_name.endswith(u".zip"):
        if not is_zipfile(downloaded_name):
            # Report the file which was really checked.
            logging.error(f"Zip file {downloaded_name} is corrupted.")
            success = False

    if success:
        build[u"file-name"] = downloaded_name

        if file_name.endswith(u".gz"):
            # The data is stored in the file without the ".gz" extension.
            build[u"file-name"] = downloaded_name[:-3]

        if downloaded_name.endswith(u".zip"):
            success = _unzip_file(spec, build, pid)

    return success