fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region    = "${region}"
   5
   6   # The "datacenters" parameter is the list of datacenters eligible for
   7   # placement. Nomad expects a list, so the rendered value is wrapped in one.
   8   datacenters = ["${datacenters}"]
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   #     https://www.nomadproject.io/docs/jobspec/schedulers
  16   #
  17   type        = "service"
  18
  19   update {
  20     # The "max_parallel" parameter specifies the maximum number of updates to
  21     # perform in parallel. The value is supplied by the "max_parallel" template
  22     # variable.
  23     max_parallel      = ${max_parallel}
  24
  25     health_check      = "checks"
  26
  27     # The "min_healthy_time" parameter specifies the minimum time the allocation
  28     # must be in the healthy state before it is marked as healthy and unblocks
  29     # further allocations from being updated.
  30     min_healthy_time  = "10s"
  31
  32     # The "healthy_deadline" parameter specifies the deadline in which the
  33     # allocation must be marked as healthy after which the allocation is
  34     # automatically transitioned to unhealthy. Transitioning to unhealthy will
  35     # fail the deployment and potentially roll back the job if "auto_revert" is
  36     # set to true.
  37     healthy_deadline  = "3m"
  38
  39     # The "progress_deadline" parameter specifies the deadline in which an
  40     # allocation must be marked as healthy. The deadline begins when the first
  41     # allocation for the deployment is created and is reset whenever an allocation
  42     # as part of the deployment transitions to a healthy state. If no allocation
  43     # transitions to the healthy state before the progress deadline, the
  44     # deployment is marked as failed.
  45     progress_deadline = "10m"
  46
  47 %{ if use_canary }
  48     # The "canary" parameter specifies that changes to the job that would result
  49     # in destructive updates should create the specified number of canaries
  50     # without stopping any previous allocations. Once the operator determines the
  51     # canaries are healthy, they can be promoted which unblocks a rolling update
  52     # of the remaining allocations at a rate of "max_parallel".
  53     #
  54     # Further, setting "canary" equal to the count of the task group allows
  55     # blue/green deployments. When the job is updated, a full set of the new
  56     # version is deployed and upon promotion the old version is stopped.
  57     canary            = ${canary}
  58
  59     # Specifies if the job should auto-promote to the canary version when all
  60     # canaries become healthy during a deployment. Defaults to false which means
  61     # canaries must be manually promoted with the "nomad deployment promote"
  62     # command.
  63     auto_promote      = ${auto_promote}
  64
  65     # "auto_revert" rolls the job back to the last stable version (one whose
  66     # allocations were all marked healthy) when a deployment fails.
  67     # NOTE(review): rendered only when "use_canary" is true -- confirm intended.
  68     auto_revert       = ${auto_revert}
  69 %{ endif }
  70   }
  71
  72   # The "group" stanza defines a series of tasks that should be co-located on
  73   # the same Nomad client. Any task within a group will be placed on the same
  74   # client.
  75   #
  76   #     https://www.nomadproject.io/docs/job-specification/group
  77   #
  78   group "${job_name}-group-1" {
  79     # The "count" parameter specifies the number of instances of this task group
  80     # that should be running. This value must be non-negative and defaults
  81     # to 1.
  82     count = ${group_count}
  83
  84     # The volume stanza allows the group to specify that it requires a given
  85     # volume from the cluster. The key of the stanza is the name of the volume
  86     # as it will be exposed to task configuration.
  87     #
  88     # https://www.nomadproject.io/docs/job-specification/volume
  89     %{ if use_host_volume }
  90     volume "${job_name}-volume-1" {
  91       source    = "${volume_source}"
  92       type      = "host"
  93       read_only = false
  94     }
  95     %{ endif }
  96
  97     # The restart stanza configures a task's behavior on task failure. Restarts
  98     # happen on the client that is running the task.
  99     #
 100     # https://www.nomadproject.io/docs/job-specification/restart
 101     #
 102     restart {
 103       mode     = "delay"
 104       attempts = 40
 105       interval = "30m"
 106       delay    = "15s"
 107     }
 108
 109     # The constraint allows restricting the set of eligible nodes. Constraints
 110     # may filter on attributes or client metadata.
 111     #
 112     #     https://www.nomadproject.io/docs/job-specification/constraint
 113     #
 114     constraint {
 115       operator  = "!="
 116       attribute = "$${attr.cpu.arch}"
 117       value     = "arm64"
 118     }
 119
 120     constraint {
 121       value     = "builder"
 122       attribute = "$${node.class}"
 123     }
 124
 125     # The network stanza specifies the networking requirements for the task
 126     # group, including the network mode and port allocations. When scheduling
 127     # jobs in Nomad they are provisioned across your fleet of machines along
 128     # with other jobs and services. Because you don't know in advance what host
 129     # your job will be provisioned on, Nomad will provide your tasks with
 130     # network configuration when they start up.
 131     #
 132     #     https://www.nomadproject.io/docs/job-specification/network
 133     #
 134     network {
 135       port "${service_name}" {
 136         to     = ${port}
 137         static = ${port}
 138       }
 139     }
 140
 141     # The "task" stanza creates an individual unit of work, such as a Docker
 142     # container, web application, or batch processing.
 143     #
 144     #     https://www.nomadproject.io/docs/job-specification/task
 145     #
 146     task "${job_name}-task-1" {
 147       # The "driver" parameter specifies the task driver that should be used to
 148       # run the task.
 149       driver = "exec"
 150
 151     %{ if use_host_volume }
 152       volume_mount {
 153         destination = "${volume_destination}"
 154         volume      = "${job_name}-volume-1"
 155         read_only   = false
 156       }
 157     %{ endif }
 158
 159     %{ if use_vault_provider }
 160       vault {
 161         policies = ["${vault_kv_policy_name}"]  # Nomad expects a list of policy names
 162       }
 163     %{ endif }
 164
 165       # The "config" stanza is passed directly to the "exec" driver: it runs the
 166       # Prometheus binary from local/ with the configuration rendered to secrets/.
 167       # The TSDB path appends "prometheus/" directly to "volume_destination", so
 168       # that value is expected to end with "/".
 169       config {
 170         command         = "local/prometheus-${version}.linux-amd64/prometheus"
 171         args            = [
 172           "--config.file=secrets/prometheus.yml",
 173           "--storage.tsdb.path=${volume_destination}prometheus/",
 174           "--storage.tsdb.retention.time=7d"
 175         ]
 176       }
 177
 178       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 179       # such as a file, tarball, or binary, using the go-getter library. The
 180       # download location comes from the "url" template variable; the "config"
 181       # stanza expects it to provide "prometheus-<version>.linux-amd64/" in local/.
 182       #
 183       #     https://www.nomadproject.io/docs/job-specification/artifact
 184       #
 185       artifact {
 186         source = "${url}"
 187       }
 188
 189       # Renders the Prometheus alerting rules to "secrets/alerts.yml". The
 190       # delimiters are switched to "{{{" / "}}}" so the "{{ ... }}" expressions
 191       # in the rules reach Prometheus instead of being expanded by Nomad.
 192       #
 193       #     https://www.nomadproject.io/docs/job-specification/template
 194       #
 195       template {
 196         change_mode     = "noop"
 197         change_signal   = "SIGINT"  # only consulted when change_mode = "signal"
 198         destination     = "secrets/alerts.yml"
 199         left_delimiter  = "{{{"
 200         right_delimiter = "}}}"
 201         data            = <<EOH
 202 ---
 203 groups:
 204 - name: "Jenkins Job Health Exporter"
 205   rules:
 206   - alert: JenkinsJobHealthExporterFailures
 207     expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
 208     for: 0m
 209     labels:
 210       severity: critical
 211     annotations:
 212       summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
 213       description: "Job: {{ $labels.id }}"
 214   - alert: JenkinsJobHealthExporterUnstable
 215     expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
 216     for: 0m
 217     labels:
 218       severity: warning
 219     annotations:
 220       summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
 221       description: "Job: {{ $labels.id }}"
 222 - name: "Consul"
 223   rules:
 224   - alert: ConsulServiceHealthcheckFailed
 225     expr: consul_catalog_service_node_healthy == 0
 226     for: 0m
 227     labels:
 228       severity: critical
 229     annotations:
 230       summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
 231       description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
 232   - alert: ConsulMissingMasterNode
 233     expr: consul_raft_peers < 3
 234     for: 0m
 235     labels:
 236       severity: critical
 237     annotations:
 238       summary: "Consul missing master node (instance {{ $labels.instance }})."
 239       description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
 240   - alert: ConsulAgentUnhealthy
 241     expr: consul_health_node_status{status="critical"} == 1
 242     for: 0m
 243     labels:
 244       severity: critical
 245     annotations:
 246       summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
 247       description: "A Consul agent is down."
 248 - name: "Hosts"
 249   rules:
 250   - alert: NodeDown
 251     expr: up == 0
 252     for: 0m
 253     labels:
 254       severity: critical
 255     annotations:
 256       summary: "Prometheus target missing (instance {{ $labels.instance }})."
 257       description: "A Prometheus target has disappeared. An exporter might be crashed."
 258   - alert: HostOutOfMemory
 259     expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
 260     for: 2m
 261     labels:
 262       severity: warning
 263     annotations:
 264       summary: "Host out of memory (instance {{ $labels.instance }})."
 265       description: "Node memory is filling up (< 10% left)."
 266   - alert: HostOomKillDetected
 267     expr: increase(node_vmstat_oom_kill[1m]) > 0
 268     for: 0m
 269     labels:
 270       severity: warning
 271     annotations:
 272       summary: "Host OOM kill detected (instance {{ $labels.instance }})."
 273       description: "OOM kill detected."
 274   - alert: HostMemoryUnderMemoryPressure
 275     expr: rate(node_vmstat_pgmajfault[1m]) > 1000
 276     for: 2m
 277     labels:
 278       severity: warning
 279     annotations:
 280       summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
 281       description: "The node is under heavy memory pressure. High rate of major page faults."
 282   - alert: HostOutOfDiskSpace
 283     expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
 284     for: 2m
 285     labels:
 286       severity: warning
 287     annotations:
 288       summary: "Host out of disk space (instance {{ $labels.instance }})."
 289       description: "Disk is almost full (< 10% left)."
 290   - alert: HostRaidDiskFailure
 291     expr: node_md_disks{state="failed"} > 0
 292     for: 2m
 293     labels:
 294       severity: warning
 295     annotations:
 296       summary: "Host RAID disk failure (instance {{ $labels.instance }})."
 297       description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
 298   - alert: HostConntrackLimit
 299     expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
 300     for: 5m
 301     labels:
 302       severity: warning
 303     annotations:
 304       summary: "Host conntrack limit (instance {{ $labels.instance }})."
 305       description: "The number of conntrack is approaching limit."
 306   - alert: HostNetworkInterfaceSaturated
 307     expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
 308     for: 1m
 309     labels:
 310       severity: warning
 311     annotations:
 312       summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
 313       description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
 314   - alert: HostSystemdServiceCrashed
 315     expr: node_systemd_unit_state{state="failed"} == 1
 316     for: 0m
 317     labels:
 318       severity: warning
 319     annotations:
 320       summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
 321       description: "SystemD service crashed."
 322   - alert: HostEdacCorrectableErrorsDetected
 323     expr: increase(node_edac_correctable_errors_total[1m]) > 0
 324     for: 0m
 325     labels:
 326       severity: info
 327     annotations:
 328       summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
 329       description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
 330   - alert: HostEdacUncorrectableErrorsDetected
 331     expr: node_edac_uncorrectable_errors_total > 0
 332     for: 0m
 333     labels:
 334       severity: warning
 335     annotations:
 336       summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
 337       description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
 338 - name: "Min.io"
 339   rules:
 340   - alert: MinioDiskOffline
 341     expr: minio_offline_disks > 0
 342     for: 0m
 343     labels:
 344       severity: critical
 345     annotations:
 346       summary: "Minio disk offline (instance {{ $labels.instance }})"
 347       description: "Minio disk is offline."
 348   - alert: MinioStorageSpaceExhausted
 349     expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
 350     for: 2m
 351     labels:
 352       severity: warning
 353     annotations:
 354       summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
 355       description: "Minio storage space is low (< 10 GB)."
 356 - name: "Prometheus"
 357   rules:
 358   - alert: PrometheusConfigurationReloadFailure
 359     expr: prometheus_config_last_reload_successful != 1
 360     for: 0m
 361     labels:
 362       severity: warning
 363     annotations:
 364       summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
 365       description: "Prometheus configuration reload error."
 366   - alert: PrometheusTooManyRestarts
 367     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
 368     for: 0m
 369     labels:
 370       severity: warning
 371     annotations:
 372       summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
 373       description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
 374   - alert: PrometheusAlertmanagerConfigurationReloadFailure
 375     expr: alertmanager_config_last_reload_successful != 1
 376     for: 0m
 377     labels:
 378       severity: warning
 379     annotations:
 380       summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
 381       description: "AlertManager configuration reload error."
 382   - alert: PrometheusRuleEvaluationFailures
 383     expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 384     for: 0m
 385     labels:
 386       severity: critical
 387     annotations:
 388       summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
 389       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
 390   - alert: PrometheusTargetScrapingSlow
 391     expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
 392     for: 5m
 393     labels:
 394       severity: warning
 395     annotations:
 396       summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
 397       description: "Prometheus is scraping exporters slowly."
 398   - alert: PrometheusTsdbCompactionsFailed
 399     expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
 400     for: 0m
 401     labels:
 402       severity: critical
 403     annotations:
 404       summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
 405       description: "Prometheus encountered {{ $value }} TSDB compactions failures."
 406   - alert: PrometheusTsdbHeadTruncationsFailed
 407     expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
 408     for: 0m
 409     labels:
 410       severity: critical
 411     annotations:
 412       summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
 413       description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
 414   - alert: PrometheusTsdbWalCorruptions
 415     expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
 416     for: 0m
 417     labels:
 418       severity: critical
 419     annotations:
 420       summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
 421       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
 422   - alert: PrometheusTsdbWalTruncationsFailed
 423     expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
 424     for: 0m
 425     labels:
 426       severity: critical
 427     annotations:
 428       summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
 429       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
 430 EOH
 431       }
 432
 433       template {
 434         change_mode     = "noop"
 435         change_signal   = "SIGINT"  # only consulted when change_mode = "signal"
 436         destination     = "secrets/prometheus.yml"  # loaded through --config.file
 437         data            = <<EOH
 438 ---
 439 global:
 440   scrape_interval:     5s
 441   scrape_timeout:      5s
 442   evaluation_interval: 5s
 443
 444 alerting:
 445   alertmanagers:
 446   - consul_sd_configs:
 447     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 448       services: [ 'alertmanager' ]
 449
 450 rule_files:
 451   - 'alerts.yml'
 452
 453 scrape_configs:
 454
 455   - job_name: 'Nomad Cluster'
 456     consul_sd_configs:
 457     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 458       services: [ 'nomad-client', 'nomad' ]
 459     relabel_configs:
 460     - source_labels: [__meta_consul_tags]
 461       regex: '(.*)http(.*)'
 462       action: keep
 463     metrics_path: /v1/metrics
 464     params:
 465       format: [ 'prometheus' ]
 466
 467   - job_name: 'Consul Cluster'
 468     static_configs:
 469       - targets: [ '10.30.51.16:8500' ]
 470       - targets: [ '10.30.51.17:8500' ]
 471       - targets: [ '10.30.51.18:8500' ]
 472       - targets: [ '10.30.51.19:8500' ]
 473       - targets: [ '10.30.51.20:8500' ]
 474       - targets: [ '10.30.51.21:8500' ]
 475       - targets: [ '10.30.51.22:8500' ]
 476       - targets: [ '10.30.51.23:8500' ]
 477       - targets: [ '10.30.51.24:8500' ]
 478       - targets: [ '10.30.51.25:8500' ]
 479       - targets: [ '10.30.51.26:8500' ]
 480       - targets: [ '10.30.51.50:8500' ]
 481       - targets: [ '10.30.51.51:8500' ]
 482       - targets: [ '10.30.51.70:8500' ]
 483       - targets: [ '10.30.51.71:8500' ]
 484       - targets: [ '10.30.51.91:8500' ]
 485       - targets: [ '10.30.51.92:8500' ]
 486     metrics_path: /v1/agent/metrics
 487     params:
 488       format: [ 'prometheus' ]
 489
 490   - job_name: 'Jenkins Job Health Exporter'
 491     static_configs:
 492       - targets: [ '10.30.51.22:9186' ]
 493     metric_relabel_configs:
 494       - source_labels: [ __name__ ]
 495         regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
 496         action: replace
 497         replacement: '$1'
 498         target_label: id
 499       - source_labels: [ __name__ ]
 500         regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
 501         replacement: 'jenkins_job_$2'
 502         target_label: __name__
 503
 504   - job_name: 'Node Exporter'
 505     static_configs:
 506       - targets: [ '10.30.51.16:9100' ]
 507       - targets: [ '10.30.51.17:9100' ]
 508       - targets: [ '10.30.51.18:9100' ]
 509       - targets: [ '10.30.51.19:9100' ]
 510       - targets: [ '10.30.51.20:9100' ]
 511       - targets: [ '10.30.51.21:9100' ]
 512       - targets: [ '10.30.51.22:9100' ]
 513       - targets: [ '10.30.51.23:9100' ]
 514       - targets: [ '10.30.51.24:9100' ]
 515       - targets: [ '10.30.51.25:9100' ]
 516       - targets: [ '10.30.51.26:9100' ]
 517       - targets: [ '10.30.51.50:9100' ]
 518       - targets: [ '10.30.51.51:9100' ]
 519       - targets: [ '10.30.51.70:9100' ]
 520       - targets: [ '10.30.51.71:9100' ]
 521       - targets: [ '10.30.51.91:9100' ]
 522       - targets: [ '10.30.51.92:9100' ]
 523
 524   - job_name: 'Alertmanager'
 525     consul_sd_configs:
 526     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 527       services: [ 'alertmanager' ]
 528
 529   - job_name: 'Grafana'
 530     consul_sd_configs:
 531     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 532       services: [ 'grafana' ]
 533
 534   - job_name: 'Prometheus'
 535     consul_sd_configs:
 536     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 537       services: [ 'prometheus' ]
 538
 539   - job_name: 'Minio'
 540     bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg  # NOTE(review): hard-coded credential; move to Vault or a template variable
 541     consul_sd_configs:
 542     - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
 543       services: [ 'storage' ]
 544     metrics_path: /minio/prometheus/metrics
 545 EOH
 546       }
 547
 548       # The service stanza instructs Nomad to register a service with Consul.
 549       #
 550       #     https://www.nomadproject.io/docs/job-specification/service
 551       #
 552       service {
 553         name = "${service_name}"
 554         tags = ["${service_name}$${NOMAD_ALLOC_INDEX}"]
 555         port = "${service_name}"
 556         check {
 557           type     = "http"
 558           name     = "Prometheus Check Live"
 559           path     = "/-/healthy"
 560           timeout  = "2s"
 561           interval = "10s"
 562         }
 563       }
 564
 565       # The "resources" stanza describes the requirements a task needs to
 566       # execute. Resource requirements include memory, network, cpu, and more.
 567       # This ensures the task will execute on a machine that contains enough
 568       # resource capacity.
 569       #
 570       #     https://www.nomadproject.io/docs/job-specification/resources
 571       #
 572       resources {
 573         memory = ${memory}
 574         cpu    = ${cpu}
 575       }
 576     }
 577   }
 578 }