Revert "fix(jobspec): Delete ipsec nfv density tests"
[csit.git] / fdio.infra.terraform / terraform-nomad-prometheus / conf / nomad / prometheus.hcl.tftpl
1 job "${job_name}" {
2   # The "region" parameter specifies the region in which to execute the job.
3   # If omitted, this inherits the default region name of "global".
4   # region    = "${region}"
5
6   # The "datacenters" parameter specifies the list of datacenters which should
7   # be considered when placing this task. This must be provided.
8   datacenters = "${datacenters}"
9
10   # The "type" parameter controls the type of job, which impacts the scheduler's
11   # decision on placement.
12   #
13   # https://www.nomadproject.io/docs/jobspec/schedulers
14   #
15   type        = "service"
16
17   update {
18     # The "max_parallel" parameter specifies the maximum number of updates to
19     # perform in parallel.
20     max_parallel      = ${max_parallel}
21
22     health_check      = "checks"
23
24     # The "min_healthy_time" parameter specifies the minimum time the allocation
25     # must be in the healthy state before it is marked as healthy and unblocks
26     # further allocations from being updated.
27     min_healthy_time  = "10s"
28
29     # The "healthy_deadline" parameter specifies the deadline in which the
30     # allocation must be marked as healthy after which the allocation is
31     # automatically transitioned to unhealthy. Transitioning to unhealthy will
32     # fail the deployment and potentially roll back the job if "auto_revert" is
33     # set to true.
34     healthy_deadline  = "3m"
35
36     # The "progress_deadline" parameter specifies the deadline in which an
37     # allocation must be marked as healthy. The deadline begins when the first
38     # allocation for the deployment is created and is reset whenever an allocation
39     # as part of the deployment transitions to a healthy state. If no allocation
40     # transitions to the healthy state before the progress deadline, the
41     # deployment is marked as failed.
42     progress_deadline = "10m"
43
44 %{ if use_canary }
45     # The "canary" parameter specifies that changes to the job that would result
46     # in destructive updates should create the specified number of canaries
47     # without stopping any previous allocations. Once the operator determines the
48     # canaries are healthy, they can be promoted which unblocks a rolling update
49     # of the remaining allocations at a rate of "max_parallel".
50     #
51     # Further, setting "canary" equal to the count of the task group allows
52     # blue/green deployments. When the job is updated, a full set of the new
53     # version is deployed and upon promotion the old version is stopped.
54     canary            = ${canary}
55
56     # Specifies if the job should auto-promote to the canary version when all
57     # canaries become healthy during a deployment. Defaults to false which means
58     # canaries must be manually updated with the nomad deployment promote
59     # command.
60     auto_promote      = ${auto_promote}
61
62     # The "auto_revert" parameter specifies if the job should auto-revert to the
63     # last stable job on deployment failure. A job is marked as stable if all the
64     # allocations as part of its deployment were marked healthy.
65     auto_revert       = ${auto_revert}
66 %{ endif }
67   }
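
  # For illustration only: a blue/green style update, as described above, sets
  # "canary" equal to the group count so a full second set is deployed and the
  # old set is stopped on promotion. The literal values below are hypothetical
  # and not wired to this job's variables:
  #
  # update {
  #   max_parallel = 2
  #   canary       = 2
  #   auto_promote = false
  #   auto_revert  = true
  # }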

  # The "group" stanza defines a series of tasks that should be co-located on
  # the same Nomad client. Any task within a group will be placed on the same
  # client.
  #
  # https://www.nomadproject.io/docs/job-specification/group
  #
  group "${job_name}-group-1" {
    # The "count" parameter specifies the number of instances of the task
    # group that should be running. This value must be non-negative.
    count = ${group_count}

    # The volume stanza allows the group to specify that it requires a given
    # volume from the cluster. The key of the stanza is the name of the volume
    # as it will be exposed to task configuration.
    #
    # https://www.nomadproject.io/docs/job-specification/volume
    #
    %{ if use_host_volume }
    volume "${job_name}-volume-1" {
      type      = "host"
      read_only = false
      source    = "${volume_source}"
    }
    %{ endif }
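
    # For reference, the host volume named above must also be declared in the
    # Nomad client's own configuration. A sketch, with a hypothetical volume
    # name and path (not part of this jobspec):
    #
    # client {
    #   host_volume "prometheus-data" {
    #     path = "/data/prometheus"
    #   }
    # }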

    # The restart stanza configures a task's behavior on task failure. Restarts
    # happen on the client that is running the task.
    #
    # https://www.nomadproject.io/docs/job-specification/restart
    #
    restart {
      interval = "30m"
      attempts = 40
      delay    = "15s"
      mode     = "delay"
    }
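
    # With mode = "delay", exhausting all 40 attempts within the 30 minute
    # interval does not fail the allocation; Nomad instead waits for the
    # remainder of the interval before restarting the task again.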

    # The constraint allows restricting the set of eligible nodes. Constraints
    # may filter on attributes or client metadata.
    #
    # https://www.nomadproject.io/docs/job-specification/constraint
    #
    constraint {
      attribute = "$${attr.cpu.arch}"
      operator  = "!="
      value     = "arm64"
    }
    constraint {
      attribute = "$${node.class}"
      value     = "builder"
    }
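
    # When "operator" is omitted, as in the second constraint above, it
    # defaults to "=". Other comparison operators exist as well; for example
    # (illustrative only, not used by this job) a regular-expression match on
    # the kernel name:
    #
    # constraint {
    #   attribute = "$${attr.kernel.name}"
    #   operator  = "regexp"
    #   value     = "linux"
    # }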

    # The network stanza specifies the networking requirements for the task
    # group, including the network mode and port allocations. When scheduling
    # jobs in Nomad they are provisioned across your fleet of machines along
    # with other jobs and services. Because you don't know in advance what host
    # your job will be provisioned on, Nomad will provide your tasks with
    # network configuration when they start up.
    #
    # https://www.nomadproject.io/docs/job-specification/network
    #
    network {
      port "${service_name}" {
        static = ${port}
        to     = ${port}
      }
    }
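
    # This reserves the same static port on every host the group lands on. A
    # dynamically allocated port (illustrative only, with a hypothetical label
    # and target port) would simply omit "static" and let Nomad pick a free
    # host port:
    #
    # port "http" {
    #   to = 9090
    # }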

    # The "task" stanza creates an individual unit of work, such as a Docker
    # container, web application, or batch processing.
    #
    # https://www.nomadproject.io/docs/job-specification/task
    #
    task "${job_name}-task-1" {
      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "exec"

      %{ if use_host_volume }
      volume_mount {
        volume      = "${job_name}-volume-1"
        destination = "${volume_destination}"
        read_only   = false
      }
      %{ endif }

      %{ if use_vault_provider }
      vault {
        policies = "${vault_kv_policy_name}"
      }
      %{ endif }

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        command = "local/prometheus-${version}.linux-amd64/prometheus"
        args    = [
          "--config.file=secrets/prometheus.yml",
          "--web.config.file=secrets/web-config.yml",
          "--storage.tsdb.path=${volume_destination}prometheus/",
          "--storage.tsdb.retention.time=7d"
        ]
      }
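
      # The relative paths above resolve inside the allocation's task
      # directory: the artifact below unpacks into local/, and the rendered
      # templates land in secrets/.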

      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary. Nomad downloads artifacts
      # using the popular go-getter library, which permits downloading artifacts
      # from a variety of locations using a URL as the input source.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "${artifact_source}"
        options {
          checksum = "sha256:${artifact_source_checksum}"
        }
      }
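
      # go-getter accepts more than plain HTTPS URLs; for example
      # (illustrative only, hypothetical repository), a git source with a
      # subdirectory can be fetched as:
      #
      # artifact {
      #   source = "git::https://example.com/repo.git//configs"
      # }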

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      # https://www.nomadproject.io/docs/job-specification/template
      #
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/cert_file.crt"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN CERTIFICATE-----
MIIFszCCA5ugAwIBAgIUDtmFbbnYaXbXH5ddtHi9l25wM7owDQYJKoZIhvcNAQEL
BQAwaTELMAkGA1UEBhMCU0sxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEiMCAGA1UEAwwZcHJvbWV0aGV1cy5z
ZXJ2aWNlLmNvbnN1bDAeFw0yMjEyMzEyMDMxMDFaFw0yMzAxMzAyMDMxMDFaMGkx
CzAJBgNVBAYTAlNLMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl
cm5ldCBXaWRnaXRzIFB0eSBMdGQxIjAgBgNVBAMMGXByb21ldGhldXMuc2Vydmlj
ZS5jb25zdWwwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCGH4Tyj+9G
wYJNb3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1g
ELQhKab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4
iqxPXHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6
+goeOfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZz
UIGr0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0Wtrsm
Dyq+9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg
0sjYnkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987
m7892ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0
x5DpNLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnx
Lgb4SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTD
JaFrwZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABo1MwUTAdBgNVHQ4EFgQUx1Mi
fylZExNnIz0EkrPRdXYmHmAwHwYDVR0jBBgwFoAUx1MifylZExNnIz0EkrPRdXYm
HmAwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAbvlpMg4YRTNe
0cgqMZky/GpNjvE/zFManUGgYns8TKyZ8U0laBxRQ4XU/fASwAcOBJYtrkG7w8z+
FaOUptaOlNGW1VWsPDJt8ZQ2gAcTwKSW2EsBWCmOUJVNH5F0f6fTSqIUIXyxhP2w
JVniSkfarhb/Y1EDCACdr7Xpu6iF+nQo2o4/HE4Wkto4qwvlrdApYv4dl5J1TWjq
72fO9axDlNnEGVxa3C3xvKOQqWrEUy/HqC9p4it1yCiq6IYVLyve0meVFBY9xNXU
137AN7ks4ouuR1FZQkhLtqFuIekSZ5l4G4alwdv1NB8vohJMuMJyk9DarTLqXcYU
1uypZSmgREn8ByYrj4ochkSpiPw7wgK4H1Aa2cy4KUuzmLLShYu6Mov7hyJDoJSe
JsDVNoEBuhql4jENATqbWT3pIgYwBvBEXuYXqekcNmVZkKiSOlsxKFfSz21HYDgA
lCu4SMtlRYHcm4TuoTuy/FEPxHSjFY3pMciJrnO/qUrv9LlWPe1wjKhZLRPEebTk
r+Oh+aVWpy3ps7shPTjczOrmQykWWBGAjndZjZi4VvZNRxkGZuNwzzZcEkzt0Db7
l83pTRD58mvLHWl2QXoBS3t7IM6sOMwQvPx1Inp7hb7UIpNsJQaUrhhfKqy0sK18
mXs4VRtrxYycXxsLbk0SaZGh+juT53M=
-----END CERTIFICATE-----
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/key_file.key"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN PRIVATE KEY-----
MIIJQQIBADANBgkqhkiG9w0BAQEFAASCCSswggknAgEAAoICAQCGH4Tyj+9GwYJN
b3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1gELQh
Kab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4iqxP
XHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6+goe
OfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZzUIGr
0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0WtrsmDyq+
9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg0sjY
nkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987m789
2ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0x5Dp
NLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnxLgb4
SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTDJaFr
wZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABAoICAA5AQByT3Z07h3BZ5ZzUqpM4
JPYCeNvNeqyHJE+WA11P7fSxHcuKGC0T+dA/Cipf5CcvgHzz4JuJ+tHBPrxcBNFp
J5GUmjUrWPOfKrrLoxkT3DLH56Xizh45d8/ne1eUD0EaW+f7tyBSX7+o+AGBAu/0
IjSFkIRPpIGYD2qxAcHJFHsmc08V7oRJNU1zgSx5JDTmPtz5N3Juye9vQjohG9Xf
o183Pro7xigXIjbe+/NemhyB1waJE2NM6e6YSqRRFbafIgvF/tG+3qBWrlD6ye6U
lSHznuwX6XgYvp43Je5JrBA/Kl1CPdIzrrjMGVQ9F8ui+dV9ggInv2d93q06IGUU
D1o9XsZivYkn1EkLEhFXD5CYj6oR1M+MyvUrBD0bJePQCBUo+WJ2sEDt9PN2AtFL
9j7NKK/xXX5cTdAajeIvSS1PUGAHi7r1OF/c7bn3UFNOuOBEYzLsSZGP34AVglor
NON0ENCTuylmDSFd8vpaKFQpV5SK3M2k8dPRe7VEu2C9UlRvAq0xnabSHNxbwNLU
KuGDMSCKDc2npf3oCeQKU2PngAcePnwWSiapAkf5OqltQ/vMbrEpROpfzXLlRxLZ
76MDMFMQkT7m0hik6aPBHTitcWRalxHhK0ze8GvO0wesIBdyYShPKg+VDNg3qFMm
epVXzoi8xNzW8S6yi9DJAoIBAQC2l90VF5evDsv8nwsWMIa/rFGGLitpw23+oNcZ
xsIDMsGie06GYwzYHNRsd3sqK5TNLtl2vJGaVNbeDcC5T22NAYPRjNas7I5svIki
SnT4K68ICIVVxtfETbh2qoXSu+O3pyWJmHqqcQrvW2DlUvs0nxk/v3GukFjTVbuU
qmXp1KjPAVMNYoWNCJkHLEpq6e3K3q4YhEImGhMbN8suvVR9+fkKx8QvKHcqT2kn
9AlK7t57IPqovbni9KMfMZ+wPqw6HsYTL8lQE5NaqMB5q9Pl3SnzcRR0FSadNAiD
/W9jWyMazE0UsNDn241X81tVlU78Kx9S/IN97m/FSeDA1XudAoIBAQC8CzVeHxTw
U+ts/fi1XEuWOph2cIm6qd4aiyGX/riux0O6GUFuIQkosP5StWJyNPLBohWHC6eq
hPk7b0vPWmxuhttUPLA/+6+CICC0jEMWvnDAd5aJULfT0pTLZyizVu2f/GbVaiL6
pgsqeGyKnuh9cNTW5w7Mc45fXkgyKrB4W5aPfjoHN51n+jUqaDrfrp3CoWFviNDn
n3WNFtgrkj/jzQM8XFixhwxADfjd8+sZVmHT4GYjIDS4pCqs5gtIZYKhXDb0Dydj
fH/HiEXC63z0SuFjGNbomC/Era7kI3+1aK2qs6dyASzZKDN6dHKYoalHReUe/Cxk
prRcyYRWhA6lAoIBAEVrLy5Zrd1sLrl4beqdwF0W0lfFLdQj7Kml1KGEIza8EUoI
vy3wcm2naEtkkXrS3tuzOBIgVurp3lbFu8O4Ito8/TSp6uQLe4pzk19qF1ZSpVTU
iHy4AEgtlDfpVL9tl4G3FlpdkiVCnPmrMAd/qOm0oxDNZBcN4fdW3N4EeoKPyy4I
Pt8T2dpormU/vXswPKuoRWAkyFFcEG+Eosa+TGUoqDolAL09ETEQx9XcvbuzXPpK
64FDwGw8vdeaMi/7Y9ck5AFfZZYAG0GYbrTTUthNYSmgkDoh4HBb2/DyZWrMt2f0
zElVf9bmbbJGXy8GeOT+MAaI4iT6hZvoHn6xqzECggEABoQg6k0LbbSKwPEgEDDN
kbwgEmKd8zD1uFe/50N1ZOEU0LsVUFqmtZlEhtswOSLqkpkqQ868laUb+dpGdz37
6eyUZxvfQ6hWEZ1JZNhDbuNUhubd+Y4pgJaYf1/owiYt/9BAQ/70jVj5pBQeNsOA
7O/fAD9rfNw4P8fFmq9uBA2wbvKB0kQ0GSlLdFe+SogDgX4UIUhNbOlSqnvzK7da
rWsqRIoyrJwwaXvSduZ/7BXZN/1brLXt/cP6kpk6JN0XpL3MTbLEu6bRyrlHKZT9
dH2vx75RnCfB5//YwqEUSNYCxpqJH+M4iaHh/slQO0fG1OhwIx278BTyxRBanKDg
3QKCAQBoVnM3PDqaSAT1g3f3neYiXyZektJganRLj5wmDXYAySM2ag/oDacswmP/
J0BQ9KYK+dSgXldlaXtC05oxdhxY5cawbCFNfbjGDZ6zGwgLDocyFtqOBZf6UXCV
Gtj/9r6iyD2/2wbo/lrS0d3yNcNN0nkZUxoyl+J6uGB1o8bo+cfL+mi4pkALKV8L
Oa/fPazAQtikZBHSWtdQamyUMFSAdMUeYIhaXBfkNUZG4sz9nKD5UGBOmquLMBt6
zBPM+4dv4x/MEAEnSC2ANW8vDGFBgG/5H5+j2F0RM6O1MlkDzrOAIvUTrMJlJDBt
775JbZNCKpaELqxy4BNPfRDEJGBh
-----END PRIVATE KEY-----
EOH
      }

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      # https://www.nomadproject.io/docs/job-specification/template
      #
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/alerts.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
groups:
- name: "Jenkins Job Health Exporter"
  rules:
  - alert: JenkinsJobHealthExporterFailures
    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
      description: "Job: {{ $labels.id }}"
  - alert: JenkinsJobHealthExporterUnstable
    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
      description: "Job: {{ $labels.id }}"
- name: "Consul"
  rules:
  - alert: ConsulServiceHealthcheckFailed
    expr: consul_catalog_service_node_healthy == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
  - alert: ConsulMissingMasterNode
    expr: consul_raft_peers < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul missing master node (instance {{ $labels.instance }})."
      description: "The number of Consul raft peers should be 3 in order to preserve quorum."
  - alert: ConsulAgentUnhealthy
    expr: consul_health_node_status{status="critical"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
      description: "A Consul agent is down."
- name: "Hosts"
  rules:
  - alert: NodeDown
    expr: up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})."
      description: "A Prometheus target has disappeared. An exporter might have crashed."
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})."
      description: "Node memory is filling up (< 10% left)."
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})."
      description: "OOM kill detected."
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
      description: "The node is under heavy memory pressure. High rate of major page faults."
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})."
      description: "Disk is almost full (< 10% left)."
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host RAID disk failure (instance {{ $labels.instance }})."
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host conntrack limit (instance {{ $labels.instance }})."
      description: "The number of conntrack entries is approaching the limit."
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
      description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host systemd service crashed (instance {{ $labels.instance }})."
      description: "A systemd unit is in the failed state."
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- name: "Prometheus"
  rules:
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
      description: "Prometheus configuration reload error."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
      description: "AlertManager configuration reload error."
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
      description: "Prometheus is scraping exporters slowly."
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB compaction failures."
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/prometheus.yml"
        data            = <<EOH
---
global:
  scrape_interval:     5s
  scrape_timeout:      5s
  evaluation_interval: 5s

alerting:
  alertmanagers:
  - consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'alertmanager' ]

rule_files:
  - 'alerts.yml'

scrape_configs:
  - job_name: 'Nomad Cluster'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'nomad-client', 'nomad' ]
    relabel_configs:
    - source_labels: [__meta_consul_tags]
      regex: '(.*)http(.*)'
      action: keep
    metrics_path: /v1/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Consul Cluster'
    static_configs:
      - targets: [ '10.30.51.23:8500' ]
      - targets: [ '10.30.51.24:8500' ]
      - targets: [ '10.30.51.25:8500' ]
      - targets: [ '10.30.51.26:8500' ]
      - targets: [ '10.30.51.27:8500' ]
      - targets: [ '10.30.51.28:8500' ]
      - targets: [ '10.30.51.50:8500' ]
      - targets: [ '10.30.51.51:8500' ]
      - targets: [ '10.30.51.70:8500' ]
      - targets: [ '10.30.51.71:8500' ]
      - targets: [ '10.30.51.91:8500' ]
      - targets: [ '10.30.51.92:8500' ]
    metrics_path: /v1/agent/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Jenkins Job Health Exporter'
    static_configs:
      - targets: [ '10.30.51.22:9186' ]
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        action: replace
        replacement: '$1'
        target_label: id
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        replacement: 'jenkins_job_$2'
        target_label: __name__
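    # Illustrative effect of the two rules above, using a hypothetical series
    # name: a sample 'csit_trending_success' first gains the label
    # id='csit_trending', then is renamed to 'jenkins_job_success'.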

  - job_name: 'Node Exporter'
    static_configs:
      - targets: [ '10.30.51.23:9100' ]
      - targets: [ '10.30.51.24:9100' ]
      - targets: [ '10.30.51.25:9100' ]
      - targets: [ '10.30.51.26:9100' ]
      - targets: [ '10.30.51.27:9100' ]
      - targets: [ '10.30.51.28:9100' ]
      - targets: [ '10.30.51.50:9100' ]
      - targets: [ '10.30.51.51:9100' ]
      - targets: [ '10.30.51.70:9100' ]
      - targets: [ '10.30.51.71:9100' ]
      - targets: [ '10.30.51.91:9100' ]
      - targets: [ '10.30.51.92:9100' ]

  - job_name: 'Alertmanager'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'alertmanager' ]

  - job_name: 'Prometheus'
    honor_timestamps: true
    params:
      format:
      - prometheus
    scheme: https
    follow_redirects: true
    enable_http2: true
    consul_sd_configs:
    - server: '{{ env "CONSUL_HTTP_ADDR" }}'
      services:
      - prometheus
    tls_config:
      cert_file: cert_file.crt
      key_file: key_file.key
      insecure_skip_verify: true
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/web-config.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
tls_server_config:
  cert_file: cert_file.crt
  key_file: key_file.key
EOH
      }
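
      # The certificate and key above are embedded verbatim. As an
      # illustration only (the KV path is hypothetical, not used by this job),
      # a template could instead pull the material from Consul at render time:
      #
      # template {
      #   data        = "{{ key \"prometheus/cert_file\" }}"
      #   destination = "secrets/cert_file.crt"
      # }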

      # The service stanza instructs Nomad to register a service with Consul.
      #
      # https://www.nomadproject.io/docs/job-specification/service
      #
      service {
        name       = "${service_name}"
        port       = "${service_name}"
        tags       = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
        check {
          name            = "Prometheus Check Live"
          type            = "http"
          path            = "/-/healthy"
          protocol        = "https"
          tls_skip_verify = true
          interval        = "10s"
          timeout         = "2s"
        }
      }
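
      # Once registered, the service can be resolved through Consul DNS, e.g.
      # (illustrative query against a local agent, assuming the service is
      # named "prometheus"):
      #
      #   dig @127.0.0.1 -p 8600 prometheus.service.consul SRV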

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu    = ${cpu}
        memory = ${memory}
      }
    }
  }
}