Revert "fix(jobspec): Delete ipsec nfv density tests"
[csit.git] / fdio.infra.terraform / terraform-nomad-prometheus / conf / nomad / prometheus.hcl.tftpl
1 job "${job_name}" {
2   # The "region" parameter specifies the region in which to execute the job.
3   # If omitted, this inherits the default region name of "global".
4   # region    = "${region}"
5
6   # The "datacenters" parameter specifies the list of datacenters which should
7   # be considered when placing this task. This must be provided.
8   datacenters = "${datacenters}"
9
10   # The "type" parameter controls the type of job, which impacts the scheduler's
11   # decision on placement.
12   #
13   # https://www.nomadproject.io/docs/jobspec/schedulers
14   #
15   type        = "service"
16
17   update {
18     # The "max_parallel" parameter specifies the maximum number of updates to
19     # perform in parallel.
20     max_parallel      = ${max_parallel}
21
22     health_check      = "checks"
23
24     # The "min_healthy_time" parameter specifies the minimum time the allocation
25     # must be in the healthy state before it is marked as healthy and unblocks
26     # further allocations from being updated.
27     min_healthy_time  = "10s"
28
29     # The "healthy_deadline" parameter specifies the deadline in which the
30     # allocation must be marked as healthy after which the allocation is
31     # automatically transitioned to unhealthy. Transitioning to unhealthy will
32     # fail the deployment and potentially roll back the job if "auto_revert" is
33     # set to true.
34     healthy_deadline  = "3m"
35
36     # The "progress_deadline" parameter specifies the deadline in which an
37     # allocation must be marked as healthy. The deadline begins when the first
38     # allocation for the deployment is created and is reset whenever an allocation
39     # as part of the deployment transitions to a healthy state. If no allocation
40     # transitions to the healthy state before the progress deadline, the
41     # deployment is marked as failed.
42     progress_deadline = "10m"
43
44 %{ if use_canary }
45     # The "canary" parameter specifies that changes to the job that would result
46     # in destructive updates should create the specified number of canaries
47     # without stopping any previous allocations. Once the operator determines the
48     # canaries are healthy, they can be promoted which unblocks a rolling update
49     # of the remaining allocations at a rate of "max_parallel".
50     #
51     # Further, setting "canary" equal to the count of the task group allows
52     # blue/green deployments. When the job is updated, a full set of the new
53     # version is deployed and upon promotion the old version is stopped.
54     canary            = ${canary}
55
56     # Specifies if the job should auto-promote to the canary version when all
57     # canaries become healthy during a deployment. Defaults to false which means
58     # canaries must be manually updated with the nomad deployment promote
59     # command.
60     auto_promote      = ${auto_promote}
61
62     # The "auto_revert" parameter specifies if the job should auto-revert to the
63     # last stable job on deployment failure. A job is marked as stable if all the
64     # allocations as part of its deployment were marked healthy.
65     auto_revert       = ${auto_revert}
66 %{ endif }
67   }
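
  # For illustration only: a blue/green style update, as described above, sets
  # "canary" equal to the group count so a full second set is deployed and the
  # old set is stopped on promotion. The literal values below are hypothetical
  # and not wired to this job's variables:
  #
  # update {
  #   max_parallel = 2
  #   canary       = 2
  #   auto_promote = false
  #   auto_revert  = true
  # }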

  # The "group" stanza defines a series of tasks that should be co-located on
  # the same Nomad client. Any task within a group will be placed on the same
  # client.
  #
  # https://www.nomadproject.io/docs/job-specification/group
  #
  group "${job_name}-group-1" {
    # The "count" parameter specifies the number of instances of the task
    # group that should be running. This value must be non-negative.
    count = ${group_count}

    # The volume stanza allows the group to specify that it requires a given
    # volume from the cluster. The key of the stanza is the name of the volume
    # as it will be exposed to task configuration.
    #
    # https://www.nomadproject.io/docs/job-specification/volume
    #
    %{ if use_host_volume }
    volume "${job_name}-volume-1" {
      type      = "host"
      read_only = false
      source    = "${volume_source}"
    }
    %{ endif }
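
    # For reference, the host volume named above must also be declared in the
    # Nomad client's own configuration. A sketch, with a hypothetical volume
    # name and path (not part of this jobspec):
    #
    # client {
    #   host_volume "prometheus-data" {
    #     path = "/data/prometheus"
    #   }
    # }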

    # The restart stanza configures a task's behavior on task failure. Restarts
    # happen on the client that is running the task.
    #
    # https://www.nomadproject.io/docs/job-specification/restart
    #
    restart {
      interval = "30m"
      attempts = 40
      delay    = "15s"
      mode     = "delay"
    }
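
    # With mode = "delay", exhausting all 40 attempts within the 30 minute
    # interval does not fail the allocation; Nomad instead waits for the
    # remainder of the interval before restarting the task again.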

    # The constraint allows restricting the set of eligible nodes. Constraints
    # may filter on attributes or client metadata.
    #
    # https://www.nomadproject.io/docs/job-specification/constraint
    #
    constraint {
      attribute = "$${attr.cpu.arch}"
      operator  = "!="
      value     = "arm64"
    }
    constraint {
      attribute = "$${node.class}"
      value     = "builder"
    }
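
    # When "operator" is omitted, as in the second constraint above, it
    # defaults to "=". Other comparison operators exist as well; for example
    # (illustrative only, not used by this job) a regular-expression match on
    # the kernel name:
    #
    # constraint {
    #   attribute = "$${attr.kernel.name}"
    #   operator  = "regexp"
    #   value     = "linux"
    # }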

    # The network stanza specifies the networking requirements for the task
    # group, including the network mode and port allocations. When scheduling
    # jobs in Nomad they are provisioned across your fleet of machines along
    # with other jobs and services. Because you don't know in advance what host
    # your job will be provisioned on, Nomad will provide your tasks with
    # network configuration when they start up.
    #
    # https://www.nomadproject.io/docs/job-specification/network
    #
    network {
      port "${service_name}" {
        static = ${port}
        to     = ${port}
      }
    }
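
    # This reserves the same static port on every host the group lands on. A
    # dynamically allocated port (illustrative only, with a hypothetical label
    # and target port) would simply omit "static" and let Nomad pick a free
    # host port:
    #
    # port "http" {
    #   to = 9090
    # }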

    # The "task" stanza creates an individual unit of work, such as a Docker
    # container, web application, or batch processing.
    #
    # https://www.nomadproject.io/docs/job-specification/task
    #
    task "${job_name}-task-1" {
      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "exec"

      %{ if use_host_volume }
      volume_mount {
        volume      = "${job_name}-volume-1"
        destination = "${volume_destination}"
        read_only   = false
      }
      %{ endif }

      %{ if use_vault_provider }
      vault {
        policies = "${vault_kv_policy_name}"
      }
      %{ endif }

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        command = "local/prometheus-${version}.linux-amd64/prometheus"
        args    = [
          "--config.file=secrets/prometheus.yml",
          "--web.config.file=secrets/web-config.yml",
          "--storage.tsdb.path=${volume_destination}prometheus/",
          "--storage.tsdb.retention.time=7d"
        ]
      }
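
      # The relative paths above resolve inside the allocation's task
      # directory: the artifact below unpacks into local/, and the rendered
      # templates land in secrets/.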

      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary. Nomad downloads artifacts
      # using the popular go-getter library, which permits downloading artifacts
      # from a variety of locations using a URL as the input source.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "${artifact_source}"
        options {
          checksum = "sha256:${artifact_source_checksum}"
        }
      }
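
      # go-getter accepts more than plain HTTPS URLs; for example
      # (illustrative only, hypothetical repository), a git source with a
      # subdirectory can be fetched as:
      #
      # artifact {
      #   source = "git::https://example.com/repo.git//configs"
      # }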

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      # https://www.nomadproject.io/docs/job-specification/template
      #
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/cert_file.crt"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN CERTIFICATE-----
MIIFszCCA5ugAwIBAgIUDtmFbbnYaXbXH5ddtHi9l25wM7owDQYJKoZIhvcNAQEL
BQAwaTELMAkGA1UEBhMCU0sxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEiMCAGA1UEAwwZcHJvbWV0aGV1cy5z
ZXJ2aWNlLmNvbnN1bDAeFw0yMjEyMzEyMDMxMDFaFw0yMzAxMzAyMDMxMDFaMGkx
CzAJBgNVBAYTAlNLMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl
cm5ldCBXaWRnaXRzIFB0eSBMdGQxIjAgBgNVBAMMGXByb21ldGhldXMuc2Vydmlj
ZS5jb25zdWwwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCGH4Tyj+9G
wYJNb3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1g
ELQhKab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4
iqxPXHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6
+goeOfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZz
UIGr0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0Wtrsm
Dyq+9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg
0sjYnkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987
m7892ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0
x5DpNLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnx
Lgb4SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTD
JaFrwZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABo1MwUTAdBgNVHQ4EFgQUx1Mi
fylZExNnIz0EkrPRdXYmHmAwHwYDVR0jBBgwFoAUx1MifylZExNnIz0EkrPRdXYm
HmAwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAbvlpMg4YRTNe
0cgqMZky/GpNjvE/zFManUGgYns8TKyZ8U0laBxRQ4XU/fASwAcOBJYtrkG7w8z+
FaOUptaOlNGW1VWsPDJt8ZQ2gAcTwKSW2EsBWCmOUJVNH5F0f6fTSqIUIXyxhP2w
JVniSkfarhb/Y1EDCACdr7Xpu6iF+nQo2o4/HE4Wkto4qwvlrdApYv4dl5J1TWjq
72fO9axDlNnEGVxa3C3xvKOQqWrEUy/HqC9p4it1yCiq6IYVLyve0meVFBY9xNXU
137AN7ks4ouuR1FZQkhLtqFuIekSZ5l4G4alwdv1NB8vohJMuMJyk9DarTLqXcYU
1uypZSmgREn8ByYrj4ochkSpiPw7wgK4H1Aa2cy4KUuzmLLShYu6Mov7hyJDoJSe
JsDVNoEBuhql4jENATqbWT3pIgYwBvBEXuYXqekcNmVZkKiSOlsxKFfSz21HYDgA
lCu4SMtlRYHcm4TuoTuy/FEPxHSjFY3pMciJrnO/qUrv9LlWPe1wjKhZLRPEebTk
r+Oh+aVWpy3ps7shPTjczOrmQykWWBGAjndZjZi4VvZNRxkGZuNwzzZcEkzt0Db7
l83pTRD58mvLHWl2QXoBS3t7IM6sOMwQvPx1Inp7hb7UIpNsJQaUrhhfKqy0sK18
mXs4VRtrxYycXxsLbk0SaZGh+juT53M=
-----END CERTIFICATE-----
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/key_file.key"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN PRIVATE KEY-----
MIIJQQIBADANBgkqhkiG9w0BAQEFAASCCSswggknAgEAAoICAQCGH4Tyj+9GwYJN
b3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1gELQh
Kab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4iqxP
XHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6+goe
OfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZzUIGr
0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0WtrsmDyq+
9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg0sjY
nkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987m789
2ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0x5Dp
NLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnxLgb4
SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTDJaFr
wZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABAoICAA5AQByT3Z07h3BZ5ZzUqpM4
JPYCeNvNeqyHJE+WA11P7fSxHcuKGC0T+dA/Cipf5CcvgHzz4JuJ+tHBPrxcBNFp
J5GUmjUrWPOfKrrLoxkT3DLH56Xizh45d8/ne1eUD0EaW+f7tyBSX7+o+AGBAu/0
IjSFkIRPpIGYD2qxAcHJFHsmc08V7oRJNU1zgSx5JDTmPtz5N3Juye9vQjohG9Xf
o183Pro7xigXIjbe+/NemhyB1waJE2NM6e6YSqRRFbafIgvF/tG+3qBWrlD6ye6U
lSHznuwX6XgYvp43Je5JrBA/Kl1CPdIzrrjMGVQ9F8ui+dV9ggInv2d93q06IGUU
D1o9XsZivYkn1EkLEhFXD5CYj6oR1M+MyvUrBD0bJePQCBUo+WJ2sEDt9PN2AtFL
9j7NKK/xXX5cTdAajeIvSS1PUGAHi7r1OF/c7bn3UFNOuOBEYzLsSZGP34AVglor
NON0ENCTuylmDSFd8vpaKFQpV5SK3M2k8dPRe7VEu2C9UlRvAq0xnabSHNxbwNLU
KuGDMSCKDc2npf3oCeQKU2PngAcePnwWSiapAkf5OqltQ/vMbrEpROpfzXLlRxLZ
76MDMFMQkT7m0hik6aPBHTitcWRalxHhK0ze8GvO0wesIBdyYShPKg+VDNg3qFMm
epVXzoi8xNzW8S6yi9DJAoIBAQC2l90VF5evDsv8nwsWMIa/rFGGLitpw23+oNcZ
xsIDMsGie06GYwzYHNRsd3sqK5TNLtl2vJGaVNbeDcC5T22NAYPRjNas7I5svIki
SnT4K68ICIVVxtfETbh2qoXSu+O3pyWJmHqqcQrvW2DlUvs0nxk/v3GukFjTVbuU
qmXp1KjPAVMNYoWNCJkHLEpq6e3K3q4YhEImGhMbN8suvVR9+fkKx8QvKHcqT2kn
9AlK7t57IPqovbni9KMfMZ+wPqw6HsYTL8lQE5NaqMB5q9Pl3SnzcRR0FSadNAiD
/W9jWyMazE0UsNDn241X81tVlU78Kx9S/IN97m/FSeDA1XudAoIBAQC8CzVeHxTw
U+ts/fi1XEuWOph2cIm6qd4aiyGX/riux0O6GUFuIQkosP5StWJyNPLBohWHC6eq
hPk7b0vPWmxuhttUPLA/+6+CICC0jEMWvnDAd5aJULfT0pTLZyizVu2f/GbVaiL6
pgsqeGyKnuh9cNTW5w7Mc45fXkgyKrB4W5aPfjoHN51n+jUqaDrfrp3CoWFviNDn
n3WNFtgrkj/jzQM8XFixhwxADfjd8+sZVmHT4GYjIDS4pCqs5gtIZYKhXDb0Dydj
fH/HiEXC63z0SuFjGNbomC/Era7kI3+1aK2qs6dyASzZKDN6dHKYoalHReUe/Cxk
prRcyYRWhA6lAoIBAEVrLy5Zrd1sLrl4beqdwF0W0lfFLdQj7Kml1KGEIza8EUoI
vy3wcm2naEtkkXrS3tuzOBIgVurp3lbFu8O4Ito8/TSp6uQLe4pzk19qF1ZSpVTU
iHy4AEgtlDfpVL9tl4G3FlpdkiVCnPmrMAd/qOm0oxDNZBcN4fdW3N4EeoKPyy4I
Pt8T2dpormU/vXswPKuoRWAkyFFcEG+Eosa+TGUoqDolAL09ETEQx9XcvbuzXPpK
64FDwGw8vdeaMi/7Y9ck5AFfZZYAG0GYbrTTUthNYSmgkDoh4HBb2/DyZWrMt2f0
zElVf9bmbbJGXy8GeOT+MAaI4iT6hZvoHn6xqzECggEABoQg6k0LbbSKwPEgEDDN
kbwgEmKd8zD1uFe/50N1ZOEU0LsVUFqmtZlEhtswOSLqkpkqQ868laUb+dpGdz37
6eyUZxvfQ6hWEZ1JZNhDbuNUhubd+Y4pgJaYf1/owiYt/9BAQ/70jVj5pBQeNsOA
7O/fAD9rfNw4P8fFmq9uBA2wbvKB0kQ0GSlLdFe+SogDgX4UIUhNbOlSqnvzK7da
rWsqRIoyrJwwaXvSduZ/7BXZN/1brLXt/cP6kpk6JN0XpL3MTbLEu6bRyrlHKZT9
dH2vx75RnCfB5//YwqEUSNYCxpqJH+M4iaHh/slQO0fG1OhwIx278BTyxRBanKDg
3QKCAQBoVnM3PDqaSAT1g3f3neYiXyZektJganRLj5wmDXYAySM2ag/oDacswmP/
J0BQ9KYK+dSgXldlaXtC05oxdhxY5cawbCFNfbjGDZ6zGwgLDocyFtqOBZf6UXCV
Gtj/9r6iyD2/2wbo/lrS0d3yNcNN0nkZUxoyl+J6uGB1o8bo+cfL+mi4pkALKV8L
Oa/fPazAQtikZBHSWtdQamyUMFSAdMUeYIhaXBfkNUZG4sz9nKD5UGBOmquLMBt6
zBPM+4dv4x/MEAEnSC2ANW8vDGFBgG/5H5+j2F0RM6O1MlkDzrOAIvUTrMJlJDBt
775JbZNCKpaELqxy4BNPfRDEJGBh
-----END PRIVATE KEY-----
EOH
      }

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      # https://www.nomadproject.io/docs/job-specification/template
      #
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/alerts.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
groups:
- name: "Jenkins Job Health Exporter"
  rules:
  - alert: JenkinsJobHealthExporterFailures
    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
      description: "Job: {{ $labels.id }}"
  - alert: JenkinsJobHealthExporterUnstable
    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
      description: "Job: {{ $labels.id }}"
- name: "Consul"
  rules:
  - alert: ConsulServiceHealthcheckFailed
    expr: consul_catalog_service_node_healthy == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
  - alert: ConsulMissingMasterNode
    expr: consul_raft_peers < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul missing master node (instance {{ $labels.instance }})."
      description: "The number of Consul raft peers should be 3 in order to preserve quorum."
  - alert: ConsulAgentUnhealthy
    expr: consul_health_node_status{status="critical"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
      description: "A Consul agent is down."
- name: "Hosts"
  rules:
  - alert: NodeDown
    expr: up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})."
      description: "A Prometheus target has disappeared. An exporter might have crashed."
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})."
      description: "Node memory is filling up (< 10% left)."
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})."
      description: "OOM kill detected."
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
      description: "The node is under heavy memory pressure. High rate of major page faults."
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})."
      description: "Disk is almost full (< 10% left)."
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host RAID disk failure (instance {{ $labels.instance }})."
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host conntrack limit (instance {{ $labels.instance }})."
      description: "The number of conntrack entries is approaching the limit."
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
      description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host systemd service crashed (instance {{ $labels.instance }})."
      description: "A systemd unit is in the failed state."
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- name: "Prometheus"
  rules:
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
      description: "Prometheus configuration reload error."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
      description: "AlertManager configuration reload error."
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
      description: "Prometheus is scraping exporters slowly."
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB compaction failures."
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/prometheus.yml"
        data            = <<EOH
---
global:
  scrape_interval:     5s
  scrape_timeout:      5s
  evaluation_interval: 5s

alerting:
  alertmanagers:
  - consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'alertmanager' ]

rule_files:
  - 'alerts.yml'

scrape_configs:
  - job_name: 'Nomad Cluster'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'nomad-client', 'nomad' ]
    relabel_configs:
    - source_labels: [__meta_consul_tags]
      regex: '(.*)http(.*)'
      action: keep
    metrics_path: /v1/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Consul Cluster'
    static_configs:
      - targets: [ '10.30.51.23:8500' ]
      - targets: [ '10.30.51.24:8500' ]
      - targets: [ '10.30.51.25:8500' ]
      - targets: [ '10.30.51.26:8500' ]
      - targets: [ '10.30.51.27:8500' ]
      - targets: [ '10.30.51.28:8500' ]
      - targets: [ '10.30.51.50:8500' ]
      - targets: [ '10.30.51.51:8500' ]
      - targets: [ '10.30.51.70:8500' ]
      - targets: [ '10.30.51.71:8500' ]
      - targets: [ '10.30.51.91:8500' ]
      - targets: [ '10.30.51.92:8500' ]
    metrics_path: /v1/agent/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Jenkins Job Health Exporter'
    static_configs:
      - targets: [ '10.30.51.22:9186' ]
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        action: replace
        replacement: '$1'
        target_label: id
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        replacement: 'jenkins_job_$2'
        target_label: __name__
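    # Illustrative effect of the two rules above, using a hypothetical series
    # name: a sample 'csit_trending_success' first gains the label
    # id='csit_trending', then is renamed to 'jenkins_job_success'.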

  - job_name: 'Node Exporter'
    static_configs:
      - targets: [ '10.30.51.23:9100' ]
      - targets: [ '10.30.51.24:9100' ]
      - targets: [ '10.30.51.25:9100' ]
      - targets: [ '10.30.51.26:9100' ]
      - targets: [ '10.30.51.27:9100' ]
      - targets: [ '10.30.51.28:9100' ]
      - targets: [ '10.30.51.50:9100' ]
      - targets: [ '10.30.51.51:9100' ]
      - targets: [ '10.30.51.70:9100' ]
      - targets: [ '10.30.51.71:9100' ]
      - targets: [ '10.30.51.91:9100' ]
      - targets: [ '10.30.51.92:9100' ]

  - job_name: 'Alertmanager'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
      services: [ 'alertmanager' ]

  - job_name: 'Prometheus'
    honor_timestamps: true
    params:
      format:
      - prometheus
    scheme: https
    follow_redirects: true
    enable_http2: true
    consul_sd_configs:
    - server: '{{ env "CONSUL_HTTP_ADDR" }}'
      services:
      - prometheus
    tls_config:
      cert_file: cert_file.crt
      key_file: key_file.key
      insecure_skip_verify: true
EOH
      }

      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/web-config.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
tls_server_config:
  cert_file: cert_file.crt
  key_file: key_file.key
EOH
      }
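
      # The certificate and key above are embedded verbatim. As an
      # illustration only (the KV path is hypothetical, not used by this job),
      # a template could instead pull the material from Consul at render time:
      #
      # template {
      #   data        = "{{ key \"prometheus/cert_file\" }}"
      #   destination = "secrets/cert_file.crt"
      # }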

      # The service stanza instructs Nomad to register a service with Consul.
      #
      # https://www.nomadproject.io/docs/job-specification/service
      #
      service {
        name       = "${service_name}"
        port       = "${service_name}"
        tags       = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
        check {
          name            = "Prometheus Check Live"
          type            = "http"
          path            = "/-/healthy"
          protocol        = "https"
          tls_skip_verify = true
          interval        = "10s"
          timeout         = "2s"
        }
      }
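
      # Once registered, the service can be resolved through Consul DNS, e.g.
      # (illustrative query against a local agent, assuming the service is
      # named "prometheus"):
      #
      #   dig @127.0.0.1 -p 8600 prometheus.service.consul SRV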

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu    = ${cpu}
        memory = ${memory}
      }
    }
  }
}