2 # The "region" parameter specifies the region in which to execute the job.
3 # If omitted, this inherits the default region name of "global".
6 # The "datacenters" parameter specifies the list of datacenters which should
7 # be considered when placing this task. This must be provided.
8 datacenters = "${datacenters}"
10 # The "type" parameter controls the type of job, which impacts the scheduler's
11 # decision on placement.
13 # https://www.nomadproject.io/docs/jobspec/schedulers
18 # The "max_parallel" parameter specifies the maximum number of updates to
19 # perform in parallel.
20 max_parallel = ${max_parallel}
22 health_check = "checks"
24 # The "min_healthy_time" parameter specifies the minimum time the allocation
25 # must be in the healthy state before it is marked as healthy and unblocks
26 # further allocations from being updated.
27 min_healthy_time = "10s"
29 # The "healthy_deadline" parameter specifies the deadline in which the
30 # allocation must be marked as healthy after which the allocation is
31 # automatically transitioned to unhealthy. Transitioning to unhealthy will
32 # fail the deployment and potentially roll back the job if "auto_revert" is
34 healthy_deadline = "3m"
36 # The "progress_deadline" parameter specifies the deadline in which an
37 # allocation must be marked as healthy. The deadline begins when the first
38 # allocation for the deployment is created and is reset whenever an allocation
39 # as part of the deployment transitions to a healthy state. If no allocation
40 # transitions to the healthy state before the progress deadline, the
41 # deployment is marked as failed.
42 progress_deadline = "10m"
45 # The "canary" parameter specifies that changes to the job that would result
46 # in destructive updates should create the specified number of canaries
47 # without stopping any previous allocations. Once the operator determines the
48 # canaries are healthy, they can be promoted which unblocks a rolling update
49 # of the remaining allocations at a rate of "max_parallel".
51 # Further, setting "canary" equal to the count of the task group allows
52 # blue/green deployments. When the job is updated, a full set of the new
53 # version is deployed and upon promotion the old version is stopped.
56 # Specifies if the job should auto-promote to the canary version when all
57 # canaries become healthy during a deployment. Defaults to false which means
58 # canaries must be manually updated with the nomad deployment promote
60 auto_promote = ${auto_promote}
62 # The "auto_revert" parameter specifies if the job should auto-revert to the
63 # last stable job on deployment failure. A job is marked as stable if all the
64 # allocations as part of its deployment were marked healthy.
65 auto_revert = ${auto_revert}
69 # The "group" stanza defines a series of tasks that should be co-located on
70 # the same Nomad client. Any task within a group will be placed on the same
73 # https://www.nomadproject.io/docs/job-specification/group
75 group "${job_name}-group-1" {
76 # The "count" parameter specifies the number of the task groups that should
77 # be running under this group. This value must be non-negative.
78 count = ${group_count}
80 # The volume stanza allows the group to specify that it requires a given
81 # volume from the cluster. The key of the stanza is the name of the volume
82 # as it will be exposed to task configuration.
84 # https://www.nomadproject.io/docs/job-specification/volume
86 %{ if use_host_volume }
87 volume "${job_name}-volume-1" {
90 source = "${volume_source}"
# The restart stanza configures a task's behavior on task failure. Restarts
95 # happen on the client that is running the task.
97 # https://www.nomadproject.io/docs/job-specification/restart
106 # The constraint allows restricting the set of eligible nodes. Constraints
107 # may filter on attributes or client metadata.
109 # https://www.nomadproject.io/docs/job-specification/constraint
112 attribute = "$${attr.cpu.arch}"
117 attribute = "$${node.class}"
121 # The network stanza specifies the networking requirements for the task
122 # group, including the network mode and port allocations. When scheduling
123 # jobs in Nomad they are provisioned across your fleet of machines along
124 # with other jobs and services. Because you don't know in advance what host
125 # your job will be provisioned on, Nomad will provide your tasks with
126 # network configuration when they start up.
128 # https://www.nomadproject.io/docs/job-specification/network
131 port "${service_name}" {
137 # The "task" stanza creates an individual unit of work, such as a Docker
138 # container, web application, or batch processing.
140 # https://www.nomadproject.io/docs/job-specification/task
142 task "${job_name}-task-1" {
143 # The "driver" parameter specifies the task driver that should be used to
147 %{ if use_host_volume }
149 volume = "${job_name}-volume-1"
150 destination = "${volume_destination}"
155 %{ if use_vault_provider }
157 policies = "${vault_kv_policy_name}"
161 # The "config" stanza specifies the driver configuration, which is passed
162 # directly to the driver to start the task. The details of configurations
163 # are specific to each driver, so please see specific driver
164 # documentation for more information.
166 command = "local/prometheus-${version}.linux-amd64/prometheus"
168 "--config.file=secrets/prometheus.yml",
169 "--web.config.file=secrets/web-config.yml",
170 "--storage.tsdb.path=${volume_destination}prometheus/",
171 "--storage.tsdb.retention.time=7d"
175 # The artifact stanza instructs Nomad to fetch and unpack a remote
176 # resource, such as a file, tarball, or binary. Nomad downloads artifacts
177 # using the popular go-getter library, which permits downloading artifacts
178 # from a variety of locations using a URL as the input source.
180 # https://www.nomadproject.io/docs/job-specification/artifact
183 source = "${artifact_source}"
185 checksum = "sha256:${artifact_source_checksum}"
189 # The "template" stanza instructs Nomad to manage a template, such as
190 # a configuration file or script. This template can optionally pull data
191 # from Consul or Vault to populate runtime configuration data.
193 # https://www.nomadproject.io/docs/job-specification/template
197 change_signal = "SIGINT"
198 destination = "secrets/cert_file.crt"
199 left_delimiter = "{{{"
200 right_delimiter = "}}}"
202 -----BEGIN CERTIFICATE-----
203 MIIFszCCA5ugAwIBAgIUDtmFbbnYaXbXH5ddtHi9l25wM7owDQYJKoZIhvcNAQEL
204 BQAwaTELMAkGA1UEBhMCU0sxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
205 GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEiMCAGA1UEAwwZcHJvbWV0aGV1cy5z
206 ZXJ2aWNlLmNvbnN1bDAeFw0yMjEyMzEyMDMxMDFaFw0yMzAxMzAyMDMxMDFaMGkx
207 CzAJBgNVBAYTAlNLMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl
208 cm5ldCBXaWRnaXRzIFB0eSBMdGQxIjAgBgNVBAMMGXByb21ldGhldXMuc2Vydmlj
209 ZS5jb25zdWwwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCGH4Tyj+9G
210 wYJNb3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1g
211 ELQhKab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4
212 iqxPXHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6
213 +goeOfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZz
214 UIGr0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0Wtrsm
215 Dyq+9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg
216 0sjYnkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987
217 m7892ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0
218 x5DpNLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnx
219 Lgb4SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTD
220 JaFrwZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABo1MwUTAdBgNVHQ4EFgQUx1Mi
221 fylZExNnIz0EkrPRdXYmHmAwHwYDVR0jBBgwFoAUx1MifylZExNnIz0EkrPRdXYm
222 HmAwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAbvlpMg4YRTNe
223 0cgqMZky/GpNjvE/zFManUGgYns8TKyZ8U0laBxRQ4XU/fASwAcOBJYtrkG7w8z+
224 FaOUptaOlNGW1VWsPDJt8ZQ2gAcTwKSW2EsBWCmOUJVNH5F0f6fTSqIUIXyxhP2w
225 JVniSkfarhb/Y1EDCACdr7Xpu6iF+nQo2o4/HE4Wkto4qwvlrdApYv4dl5J1TWjq
226 72fO9axDlNnEGVxa3C3xvKOQqWrEUy/HqC9p4it1yCiq6IYVLyve0meVFBY9xNXU
227 137AN7ks4ouuR1FZQkhLtqFuIekSZ5l4G4alwdv1NB8vohJMuMJyk9DarTLqXcYU
228 1uypZSmgREn8ByYrj4ochkSpiPw7wgK4H1Aa2cy4KUuzmLLShYu6Mov7hyJDoJSe
229 JsDVNoEBuhql4jENATqbWT3pIgYwBvBEXuYXqekcNmVZkKiSOlsxKFfSz21HYDgA
230 lCu4SMtlRYHcm4TuoTuy/FEPxHSjFY3pMciJrnO/qUrv9LlWPe1wjKhZLRPEebTk
231 r+Oh+aVWpy3ps7shPTjczOrmQykWWBGAjndZjZi4VvZNRxkGZuNwzzZcEkzt0Db7
232 l83pTRD58mvLHWl2QXoBS3t7IM6sOMwQvPx1Inp7hb7UIpNsJQaUrhhfKqy0sK18
233 mXs4VRtrxYycXxsLbk0SaZGh+juT53M=
234 -----END CERTIFICATE-----
240 change_signal = "SIGINT"
241 destination = "secrets/key_file.key"
242 left_delimiter = "{{{"
243 right_delimiter = "}}}"
245 -----BEGIN PRIVATE KEY-----
246 MIIJQQIBADANBgkqhkiG9w0BAQEFAASCCSswggknAgEAAoICAQCGH4Tyj+9GwYJN
247 b3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1gELQh
248 Kab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4iqxP
249 XHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6+goe
250 OfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZzUIGr
251 0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0WtrsmDyq+
252 9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg0sjY
253 nkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987m789
254 2ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0x5Dp
255 NLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnxLgb4
256 SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTDJaFr
257 wZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABAoICAA5AQByT3Z07h3BZ5ZzUqpM4
258 JPYCeNvNeqyHJE+WA11P7fSxHcuKGC0T+dA/Cipf5CcvgHzz4JuJ+tHBPrxcBNFp
259 J5GUmjUrWPOfKrrLoxkT3DLH56Xizh45d8/ne1eUD0EaW+f7tyBSX7+o+AGBAu/0
260 IjSFkIRPpIGYD2qxAcHJFHsmc08V7oRJNU1zgSx5JDTmPtz5N3Juye9vQjohG9Xf
261 o183Pro7xigXIjbe+/NemhyB1waJE2NM6e6YSqRRFbafIgvF/tG+3qBWrlD6ye6U
262 lSHznuwX6XgYvp43Je5JrBA/Kl1CPdIzrrjMGVQ9F8ui+dV9ggInv2d93q06IGUU
263 D1o9XsZivYkn1EkLEhFXD5CYj6oR1M+MyvUrBD0bJePQCBUo+WJ2sEDt9PN2AtFL
264 9j7NKK/xXX5cTdAajeIvSS1PUGAHi7r1OF/c7bn3UFNOuOBEYzLsSZGP34AVglor
265 NON0ENCTuylmDSFd8vpaKFQpV5SK3M2k8dPRe7VEu2C9UlRvAq0xnabSHNxbwNLU
266 KuGDMSCKDc2npf3oCeQKU2PngAcePnwWSiapAkf5OqltQ/vMbrEpROpfzXLlRxLZ
267 76MDMFMQkT7m0hik6aPBHTitcWRalxHhK0ze8GvO0wesIBdyYShPKg+VDNg3qFMm
268 epVXzoi8xNzW8S6yi9DJAoIBAQC2l90VF5evDsv8nwsWMIa/rFGGLitpw23+oNcZ
269 xsIDMsGie06GYwzYHNRsd3sqK5TNLtl2vJGaVNbeDcC5T22NAYPRjNas7I5svIki
270 SnT4K68ICIVVxtfETbh2qoXSu+O3pyWJmHqqcQrvW2DlUvs0nxk/v3GukFjTVbuU
271 qmXp1KjPAVMNYoWNCJkHLEpq6e3K3q4YhEImGhMbN8suvVR9+fkKx8QvKHcqT2kn
272 9AlK7t57IPqovbni9KMfMZ+wPqw6HsYTL8lQE5NaqMB5q9Pl3SnzcRR0FSadNAiD
273 /W9jWyMazE0UsNDn241X81tVlU78Kx9S/IN97m/FSeDA1XudAoIBAQC8CzVeHxTw
274 U+ts/fi1XEuWOph2cIm6qd4aiyGX/riux0O6GUFuIQkosP5StWJyNPLBohWHC6eq
275 hPk7b0vPWmxuhttUPLA/+6+CICC0jEMWvnDAd5aJULfT0pTLZyizVu2f/GbVaiL6
276 pgsqeGyKnuh9cNTW5w7Mc45fXkgyKrB4W5aPfjoHN51n+jUqaDrfrp3CoWFviNDn
277 n3WNFtgrkj/jzQM8XFixhwxADfjd8+sZVmHT4GYjIDS4pCqs5gtIZYKhXDb0Dydj
278 fH/HiEXC63z0SuFjGNbomC/Era7kI3+1aK2qs6dyASzZKDN6dHKYoalHReUe/Cxk
279 prRcyYRWhA6lAoIBAEVrLy5Zrd1sLrl4beqdwF0W0lfFLdQj7Kml1KGEIza8EUoI
280 vy3wcm2naEtkkXrS3tuzOBIgVurp3lbFu8O4Ito8/TSp6uQLe4pzk19qF1ZSpVTU
281 iHy4AEgtlDfpVL9tl4G3FlpdkiVCnPmrMAd/qOm0oxDNZBcN4fdW3N4EeoKPyy4I
282 Pt8T2dpormU/vXswPKuoRWAkyFFcEG+Eosa+TGUoqDolAL09ETEQx9XcvbuzXPpK
283 64FDwGw8vdeaMi/7Y9ck5AFfZZYAG0GYbrTTUthNYSmgkDoh4HBb2/DyZWrMt2f0
284 zElVf9bmbbJGXy8GeOT+MAaI4iT6hZvoHn6xqzECggEABoQg6k0LbbSKwPEgEDDN
285 kbwgEmKd8zD1uFe/50N1ZOEU0LsVUFqmtZlEhtswOSLqkpkqQ868laUb+dpGdz37
286 6eyUZxvfQ6hWEZ1JZNhDbuNUhubd+Y4pgJaYf1/owiYt/9BAQ/70jVj5pBQeNsOA
287 7O/fAD9rfNw4P8fFmq9uBA2wbvKB0kQ0GSlLdFe+SogDgX4UIUhNbOlSqnvzK7da
288 rWsqRIoyrJwwaXvSduZ/7BXZN/1brLXt/cP6kpk6JN0XpL3MTbLEu6bRyrlHKZT9
289 dH2vx75RnCfB5//YwqEUSNYCxpqJH+M4iaHh/slQO0fG1OhwIx278BTyxRBanKDg
290 3QKCAQBoVnM3PDqaSAT1g3f3neYiXyZektJganRLj5wmDXYAySM2ag/oDacswmP/
291 J0BQ9KYK+dSgXldlaXtC05oxdhxY5cawbCFNfbjGDZ6zGwgLDocyFtqOBZf6UXCV
292 Gtj/9r6iyD2/2wbo/lrS0d3yNcNN0nkZUxoyl+J6uGB1o8bo+cfL+mi4pkALKV8L
293 Oa/fPazAQtikZBHSWtdQamyUMFSAdMUeYIhaXBfkNUZG4sz9nKD5UGBOmquLMBt6
294 zBPM+4dv4x/MEAEnSC2ANW8vDGFBgG/5H5+j2F0RM6O1MlkDzrOAIvUTrMJlJDBt
295 775JbZNCKpaELqxy4BNPfRDEJGBh
296 -----END PRIVATE KEY-----
300 # The "template" stanza instructs Nomad to manage a template, such as
301 # a configuration file or script. This template can optionally pull data
302 # from Consul or Vault to populate runtime configuration data.
304 # https://www.nomadproject.io/docs/job-specification/template
308 change_signal = "SIGINT"
309 destination = "secrets/alerts.yml"
310 left_delimiter = "{{{"
311 right_delimiter = "}}}"
315 - name: "Jenkins Job Health Exporter"
317 - alert: JenkinsJobHealthExporterFailures
318 expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
323 summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
324 description: "Job: {{ $labels.id }}"
325 - alert: JenkinsJobHealthExporterUnstable
326 expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
331 summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
332 description: "Job: {{ $labels.id }}"
335 - alert: ConsulServiceHealthcheckFailed
336 expr: consul_catalog_service_node_healthy == 0
341 summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
342 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
343 - alert: ConsulMissingMasterNode
344 expr: consul_raft_peers < 3
349 summary: "Consul missing master node (instance {{ $labels.instance }})."
350 description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
351 - alert: ConsulAgentUnhealthy
352 expr: consul_health_node_status{status="critical"} == 1
357 summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
358 description: "A Consul agent is down."
367 summary: "Prometheus target missing (instance {{ $labels.instance }})."
368 description: "A Prometheus target has disappeared. An exporter might be crashed."
369 - alert: HostOutOfMemory
370 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
375 summary: "Host out of memory (instance {{ $labels.instance }})."
376 description: "Node memory is filling up (< 10% left)."
377 - alert: HostOomKillDetected
378 expr: increase(node_vmstat_oom_kill[1m]) > 0
383 summary: "Host OOM kill detected (instance {{ $labels.instance }})."
384 description: "OOM kill detected."
385 - alert: HostMemoryUnderMemoryPressure
386 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
391 summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
392 description: "The node is under heavy memory pressure. High rate of major page faults."
393 - alert: HostOutOfDiskSpace
394 expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
399 summary: "Host out of disk space (instance {{ $labels.instance }})."
400 description: "Disk is almost full (< 10% left)."
401 - alert: HostRaidDiskFailure
402 expr: node_md_disks{state="failed"} > 0
407 summary: "Host RAID disk failure (instance {{ $labels.instance }})."
408 description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
409 - alert: HostConntrackLimit
410 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
415 summary: "Host conntrack limit (instance {{ $labels.instance }})."
description: "The number of conntrack entries is approaching the limit."
417 - alert: HostNetworkInterfaceSaturated
418 expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
423 summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
424 description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
425 - alert: HostSystemdServiceCrashed
426 expr: node_systemd_unit_state{state="failed"} == 1
431 summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
432 description: "SystemD service crashed."
433 - alert: HostEdacCorrectableErrorsDetected
434 expr: increase(node_edac_correctable_errors_total[1m]) > 0
439 summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
441 - alert: HostEdacUncorrectableErrorsDetected
442 expr: node_edac_uncorrectable_errors_total > 0
447 summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
451 - alert: PrometheusConfigurationReloadFailure
452 expr: prometheus_config_last_reload_successful != 1
457 summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
458 description: "Prometheus configuration reload error."
459 - alert: PrometheusTooManyRestarts
460 expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
465 summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
466 description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
467 - alert: PrometheusAlertmanagerConfigurationReloadFailure
468 expr: alertmanager_config_last_reload_successful != 1
473 summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
474 description: "AlertManager configuration reload error."
475 - alert: PrometheusRuleEvaluationFailures
476 expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
481 summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
482 description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
483 - alert: PrometheusTargetScrapingSlow
484 expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
489 summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
490 description: "Prometheus is scraping exporters slowly."
491 - alert: PrometheusTsdbCompactionsFailed
492 expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
497 summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
498 description: "Prometheus encountered {{ $value }} TSDB compactions failures."
499 - alert: PrometheusTsdbHeadTruncationsFailed
500 expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
505 summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
506 description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
507 - alert: PrometheusTsdbWalCorruptions
508 expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
513 summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
514 description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
515 - alert: PrometheusTsdbWalTruncationsFailed
516 expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
521 summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
522 description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
528 change_signal = "SIGINT"
529 destination = "secrets/prometheus.yml"
535 evaluation_interval: 5s
540 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
541 services: [ 'alertmanager' ]
547 - job_name: 'Nomad Cluster'
549 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
550 services: [ 'nomad-client', 'nomad' ]
552 - source_labels: [__meta_consul_tags]
553 regex: '(.*)http(.*)'
555 metrics_path: /v1/metrics
557 format: [ 'prometheus' ]
559 - job_name: 'Consul Cluster'
561 - targets: [ '10.30.51.23:8500' ]
562 - targets: [ '10.30.51.24:8500' ]
563 - targets: [ '10.30.51.25:8500' ]
564 - targets: [ '10.30.51.26:8500' ]
565 - targets: [ '10.30.51.27:8500' ]
566 - targets: [ '10.30.51.28:8500' ]
567 - targets: [ '10.30.51.50:8500' ]
568 - targets: [ '10.30.51.51:8500' ]
569 - targets: [ '10.30.51.70:8500' ]
570 - targets: [ '10.30.51.71:8500' ]
571 - targets: [ '10.30.51.91:8500' ]
572 - targets: [ '10.30.51.92:8500' ]
573 metrics_path: /v1/agent/metrics
575 format: [ 'prometheus' ]
577 - job_name: 'Jenkins Job Health Exporter'
579 - targets: [ '10.30.51.22:9186' ]
580 metric_relabel_configs:
581 - source_labels: [ __name__ ]
582 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
586 - source_labels: [ __name__ ]
587 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
588 replacement: 'jenkins_job_$2'
589 target_label: __name__
591 - job_name: 'Node Exporter'
593 - targets: [ '10.30.51.23:9100' ]
594 - targets: [ '10.30.51.24:9100' ]
595 - targets: [ '10.30.51.25:9100' ]
596 - targets: [ '10.30.51.26:9100' ]
597 - targets: [ '10.30.51.27:9100' ]
598 - targets: [ '10.30.51.28:9100' ]
599 - targets: [ '10.30.51.50:9100' ]
600 - targets: [ '10.30.51.51:9100' ]
601 - targets: [ '10.30.51.70:9100' ]
602 - targets: [ '10.30.51.71:9100' ]
603 - targets: [ '10.30.51.91:9100' ]
604 - targets: [ '10.30.51.92:9100' ]
606 - job_name: 'Alertmanager'
608 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
609 services: [ 'alertmanager' ]
611 - job_name: 'Prometheus'
612 honor_timestamps: true
617 follow_redirects: true
- server: '{{ env "CONSUL_HTTP_ADDR" }}'
624 cert_file: cert_file.crt
625 key_file: key_file.key
626 insecure_skip_verify: true
632 change_signal = "SIGINT"
633 destination = "secrets/web-config.yml"
634 left_delimiter = "{{{"
635 right_delimiter = "}}}"
639 cert_file: cert_file.crt
640 key_file: key_file.key
644 # The service stanza instructs Nomad to register a service with Consul.
646 # https://www.nomadproject.io/docs/job-specification/service
649 name = "${service_name}"
650 port = "${service_name}"
651 tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
653 name = "Prometheus Check Live"
657 tls_skip_verify = true
663 # The "resources" stanza describes the requirements a task needs to
664 # execute. Resource requirements include memory, network, cpu, and more.
665 # This ensures the task will execute on a machine that contains enough
668 # https://www.nomadproject.io/docs/job-specification/resources