2 # The "region" parameter specifies the region in which to execute the job.
3 # If omitted, this inherits the default region name of "global".
6 # The "datacenters" parameter specifies the list of datacenters which should
7 # be considered when placing this task. This must be provided.
8 datacenters = "${datacenters}"
10 # The "type" parameter controls the type of job, which impacts the scheduler's
11 # decision on placement. This configuration is optional and defaults to
12 # "service". For a full list of job types and their differences, please see
13 # the online documentation.
15 # For more information, please see the online documentation at:
17 # https://www.nomadproject.io/docs/jobspec/schedulers
22 # The "max_parallel" parameter specifies the maximum number of updates to
23 # perform in parallel. In this case, this specifies to update a single task
27 health_check = "checks"
29 # The "min_healthy_time" parameter specifies the minimum time the allocation
30 # must be in the healthy state before it is marked as healthy and unblocks
31 # further allocations from being updated.
32 min_healthy_time = "10s"
34 # The "healthy_deadline" parameter specifies the deadline in which the
35 # allocation must be marked as healthy after which the allocation is
36 # automatically transitioned to unhealthy. Transitioning to unhealthy will
37 # fail the deployment and potentially roll back the job if "auto_revert" is
39 healthy_deadline = "3m"
41 # The "progress_deadline" parameter specifies the deadline in which an
42 # allocation must be marked as healthy. The deadline begins when the first
43 # allocation for the deployment is created and is reset whenever an allocation
44 # as part of the deployment transitions to a healthy state. If no allocation
45 # transitions to the healthy state before the progress deadline, the
46 # deployment is marked as failed.
47 progress_deadline = "10m"
50 # The "canary" parameter specifies that changes to the job that would result
51 # in destructive updates should create the specified number of canaries
52 # without stopping any previous allocations. Once the operator determines the
53 # canaries are healthy, they can be promoted which unblocks a rolling update
54 # of the remaining allocations at a rate of "max_parallel".
56 # Further, setting "canary" equal to the count of the task group allows
57 # blue/green deployments. When the job is updated, a full set of the new
58 # version is deployed and upon promotion the old version is stopped.
61 # Specifies if the job should auto-promote to the canary version when all
62 # canaries become healthy during a deployment. Defaults to false which means
63 # canaries must be manually updated with the nomad deployment promote
67 # The "auto_revert" parameter specifies if the job should auto-revert to the
68 # last stable job on deployment failure. A job is marked as stable if all the
69 # allocations as part of its deployment were marked healthy.
74 # The reschedule stanza specifies the group's rescheduling strategy. If
75 # specified at the job level, the configuration will apply to all groups
76 # within the job. If the reschedule stanza is present on both the job and the
77 # group, they are merged with the group stanza taking the highest precedence
81 delay_function = "constant"
85 # The "group" stanza defines a series of tasks that should be co-located on
86 # the same Nomad client. Any task within a group will be placed on the same
89 # For more information and examples on the "group" stanza, please see
90 # the online documentation at:
92 # https://www.nomadproject.io/docs/job-specification/group
94 group "prod-group1-${service_name}" {
95 # The "count" parameter specifies the number of the task groups that should
96 # be running under this group. This value must be non-negative and defaults
98 count = ${group_count}
100 # The restart stanza configures a task's behavior on task failure. Restarts
101 # happen on the client that is running the task.
103 # https://www.nomadproject.io/docs/job-specification/restart
112 # The volume stanza allows the group to specify that it requires a given
113 # volume from the cluster.
115 # For more information and examples on the "volume" stanza, please see
116 # the online documentation at:
118 # https://www.nomadproject.io/docs/job-specification/volume
120 %{ if use_host_volume }
121 volume "prod-volume1-${service_name}" {
124 source = "${host_volume}"
128 # The constraint allows restricting the set of eligible nodes. Constraints
129 # may filter on attributes or client metadata.
131 # For more information and examples on the "constraint" stanza, please see
132 # the online documentation at:
134 # https://www.nomadproject.io/docs/job-specification/constraint
137 attribute = "$${attr.cpu.arch}"
142 # The "task" stanza creates an individual unit of work, such as a Docker
143 # container, web application, or batch processing.
145 # For more information and examples on the "task" stanza, please see
146 # the online documentation at:
148 # https://www.nomadproject.io/docs/job-specification/task
150 task "prod-task1-${service_name}" {
151 # The "driver" parameter specifies the task driver that should be used to
155 %{ if use_host_volume }
157 volume = "prod-volume1-${service_name}"
158 destination = "${data_dir}"
163 %{ if use_vault_provider }
165 policies = "${vault_kv_policy_name}"
169 # The "config" stanza specifies the driver configuration, which is passed
170 # directly to the driver to start the task. The details of configurations
171 # are specific to each driver, so please see specific driver
172 # documentation for more information.
174 command = "local/prometheus-${version}.linux-amd64/prometheus"
176 "--config.file=secrets/prometheus.yml",
177 "--storage.tsdb.path=${data_dir}prometheus/",
178 "--storage.tsdb.retention.time=7d"
182 # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
183 # such as a file, tarball, or binary. Nomad downloads artifacts using the
184 # popular go-getter library, which permits downloading artifacts from a
185 # variety of locations using a URL as the input source.
187 # For more information and examples on the "artifact" stanza, please see
188 # the online documentation at:
190 # https://www.nomadproject.io/docs/job-specification/artifact
196 # The "template" stanza instructs Nomad to manage a template, such as
197 # a configuration file or script. This template can optionally pull data
198 # from Consul or Vault to populate runtime configuration data.
200 # For more information and examples on the "template" stanza, please see
201 # the online documentation at:
203 # https://www.nomadproject.io/docs/job-specification/template
207 change_signal = "SIGINT"
208 destination = "secrets/alerts.yml"
209 left_delimiter = "{{{"
210 right_delimiter = "}}}"
214 - name: "Jenkins Job Health Exporter"
216 - alert: JenkinsJobHealthExporterFailures
217 expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
222 summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
223 description: "Job: {{ $labels.id }}"
224 - alert: JenkinsJobHealthExporterUnstable
225 expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
230 summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
231 description: "Job: {{ $labels.id }}"
234 - alert: ConsulServiceHealthcheckFailed
235 expr: consul_catalog_service_node_healthy == 0
240 summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
241 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
242 - alert: ConsulMissingMasterNode
243 expr: consul_raft_peers < 3
248 summary: "Consul missing master node (instance {{ $labels.instance }})."
249 description: "The number of Consul raft peers should be 3, in order to preserve quorum."
250 - alert: ConsulAgentUnhealthy
251 expr: consul_health_node_status{status="critical"} == 1
256 summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
257 description: "A Consul agent is down."
266 summary: "Prometheus target missing (instance {{ $labels.instance }})."
267 description: "A Prometheus target has disappeared. An exporter might have crashed."
268 - alert: HostOutOfMemory
269 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
274 summary: "Host out of memory (instance {{ $labels.instance }})."
275 description: "Node memory is filling up (< 10% left)."
276 - alert: HostOomKillDetected
277 expr: increase(node_vmstat_oom_kill[1m]) > 0
282 summary: "Host OOM kill detected (instance {{ $labels.instance }})."
283 description: "OOM kill detected."
284 - alert: HostMemoryUnderMemoryPressure
285 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
290 summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
291 description: "The node is under heavy memory pressure. High rate of major page faults."
292 - alert: HostOutOfDiskSpace
293 expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
298 summary: "Host out of disk space (instance {{ $labels.instance }})."
299 description: "Disk is almost full (< 10% left)."
300 - alert: HostRaidDiskFailure
301 expr: node_md_disks{state="failed"} > 0
306 summary: "Host RAID disk failure (instance {{ $labels.instance }})."
307 description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
308 - alert: HostConntrackLimit
309 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
314 summary: "Host conntrack limit (instance {{ $labels.instance }})."
315 description: "The number of conntrack entries is approaching the limit."
316 - alert: HostNetworkInterfaceSaturated
317 expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
322 summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
323 description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
324 - alert: HostSystemdServiceCrashed
325 expr: node_systemd_unit_state{state="failed"} == 1
330 summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
331 description: "SystemD service crashed."
332 - alert: HostEdacCorrectableErrorsDetected
333 expr: increase(node_edac_correctable_errors_total[1m]) > 0
338 summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
339 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 1 minute.'
340 - alert: HostEdacUncorrectableErrorsDetected
341 expr: node_edac_uncorrectable_errors_total > 0
346 summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
347 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
350 - alert: MinioDiskOffline
351 expr: minio_offline_disks > 0
356 summary: "Minio disk offline (instance {{ $labels.instance }})"
357 description: "Minio disk is offline."
358 - alert: MinioStorageSpaceExhausted
359 expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
364 summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
365 description: "Minio storage space is low (< 10 GB)."
368 - alert: PrometheusConfigurationReloadFailure
369 expr: prometheus_config_last_reload_successful != 1
374 summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
375 description: "Prometheus configuration reload error."
376 - alert: PrometheusTooManyRestarts
377 expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
382 summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
383 description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
384 - alert: PrometheusAlertmanagerConfigurationReloadFailure
385 expr: alertmanager_config_last_reload_successful != 1
390 summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
391 description: "AlertManager configuration reload error."
392 - alert: PrometheusRuleEvaluationFailures
393 expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
398 summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
399 description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
400 - alert: PrometheusTargetScrapingSlow
401 expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
406 summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
407 description: "Prometheus is scraping exporters slowly."
408 - alert: PrometheusTsdbCompactionsFailed
409 expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
414 summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
415 description: "Prometheus encountered {{ $value }} TSDB compactions failures."
416 - alert: PrometheusTsdbHeadTruncationsFailed
417 expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
422 summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
423 description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
424 - alert: PrometheusTsdbWalCorruptions
425 expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
430 summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
431 description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
432 - alert: PrometheusTsdbWalTruncationsFailed
433 expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
438 summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
439 description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
445 change_signal = "SIGINT"
446 destination = "secrets/prometheus.yml"
452 evaluation_interval: 5s
457 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
458 services: [ 'alertmanager' ]
465 - job_name: 'Nomad Cluster'
467 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
468 services: [ 'nomad-client', 'nomad' ]
470 - source_labels: [__meta_consul_tags]
471 regex: '(.*)http(.*)'
473 metrics_path: /v1/metrics
475 format: [ 'prometheus' ]
477 - job_name: 'Consul Cluster'
479 - targets: [ '10.30.51.28:8500' ]
480 - targets: [ '10.30.51.29:8500' ]
481 - targets: [ '10.30.51.30:8500' ]
482 - targets: [ '10.30.51.32:8500' ]
483 - targets: [ '10.30.51.33:8500' ]
484 - targets: [ '10.30.51.34:8500' ]
485 - targets: [ '10.30.51.35:8500' ]
486 - targets: [ '10.30.51.39:8500' ]
487 - targets: [ '10.30.51.40:8500' ]
488 - targets: [ '10.30.51.50:8500' ]
489 - targets: [ '10.30.51.51:8500' ]
490 - targets: [ '10.30.51.65:8500' ]
491 - targets: [ '10.30.51.66:8500' ]
492 - targets: [ '10.30.51.67:8500' ]
493 - targets: [ '10.30.51.68:8500' ]
494 - targets: [ '10.30.51.70:8500' ]
495 - targets: [ '10.30.51.71:8500' ]
496 - targets: [ '10.32.8.14:8500' ]
497 - targets: [ '10.32.8.15:8500' ]
498 - targets: [ '10.32.8.16:8500' ]
499 - targets: [ '10.32.8.17:8500' ]
500 metrics_path: /v1/agent/metrics
502 format: [ 'prometheus' ]
504 - job_name: 'Blackbox Exporter (icmp)'
506 - targets: [ 'gerrit.fd.io' ]
507 - targets: [ 'jenkins.fd.io' ]
508 - targets: [ '10.30.51.32' ]
510 module: [ 'icmp_v4' ]
512 - source_labels: [__address__]
513 target_label: __param_target
514 - source_labels: [__param_target]
515 target_label: instance
516 - target_label: __address__
517 replacement: localhost:9115
520 - job_name: 'Blackbox Exporter (http)'
522 - targets: [ 'gerrit.fd.io' ]
523 - targets: [ 'jenkins.fd.io' ]
525 module: [ 'http_2xx' ]
527 - source_labels: [__address__]
528 target_label: __param_target
529 - source_labels: [__param_target]
530 target_label: instance
531 - target_label: __address__
532 replacement: localhost:9115
535 - job_name: 'Jenkins Job Health Exporter'
537 - targets: [ '10.30.51.32:9186' ]
538 metric_relabel_configs:
539 - source_labels: [ __name__ ]
540 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
544 - source_labels: [ __name__ ]
545 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
546 replacement: 'jenkins_job_$2'
547 target_label: __name__
549 - job_name: 'Node Exporter'
551 - targets: [ '10.30.51.28:9100' ]
552 - targets: [ '10.30.51.29:9100' ]
553 - targets: [ '10.30.51.30:9100' ]
554 - targets: [ '10.30.51.32:9100' ]
555 - targets: [ '10.30.51.33:9100' ]
556 - targets: [ '10.30.51.34:9100' ]
557 - targets: [ '10.30.51.35:9100' ]
558 - targets: [ '10.30.51.39:9100' ]
559 - targets: [ '10.30.51.40:9100' ]
560 - targets: [ '10.30.51.50:9100' ]
561 - targets: [ '10.30.51.51:9100' ]
562 - targets: [ '10.30.51.65:9100' ]
563 - targets: [ '10.30.51.66:9100' ]
564 - targets: [ '10.30.51.67:9100' ]
565 - targets: [ '10.30.51.68:9100' ]
566 - targets: [ '10.30.51.70:9100' ]
567 - targets: [ '10.30.51.71:9100' ]
568 - targets: [ '10.32.8.14:9100' ]
569 - targets: [ '10.32.8.15:9100' ]
570 - targets: [ '10.32.8.16:9100' ]
571 - targets: [ '10.32.8.17:9100' ]
573 - job_name: 'Alertmanager'
575 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
576 services: [ 'alertmanager' ]
578 - job_name: 'Grafana'
580 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
581 services: [ 'grafana' ]
583 - job_name: 'Prometheus'
585 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
586 services: [ 'prometheus' ]
589 bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
591 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
592 services: [ 'storage' ]
593 metrics_path: /minio/prometheus/metrics
597 # The service stanza instructs Nomad to register a service with Consul.
599 # For more information and examples on the "service" stanza, please see
600 # the online documentation at:
602 # https://www.nomadproject.io/docs/job-specification/service
605 name = "${service_name}"
606 port = "${service_name}"
607 tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
609 name = "Prometheus Check Live"
617 # The "resources" stanza describes the requirements a task needs to
618 # execute. Resource requirements include memory, network, cpu, and more.
619 # This ensures the task will execute on a machine that contains enough
622 # For more information and examples on the "resources" stanza, please see
623 # the online documentation at:
625 # https://www.nomadproject.io/docs/job-specification/resources
630 # The network stanza specifies the networking requirements for the task
631 # group, including the network mode and port allocations. When scheduling
632 # jobs in Nomad they are provisioned across your fleet of machines along
633 # with other jobs and services. Because you don't know in advance what host
634 # your job will be provisioned on, Nomad will provide your tasks with
635 # network configuration when they start up.
637 # For more information and examples on the "network" stanza, please see
638 # the online documentation at:
640 # https://www.nomadproject.io/docs/job-specification/network
643 port "${service_name}" {