# The "region" parameter specifies the region in which to execute the job.
# If omitted, this inherits the default region name of "global".

# The "datacenters" parameter specifies the list of datacenters which should
# be considered when placing this task. This must be provided.
datacenters = "${datacenters}"
# The "type" parameter controls the type of job, which impacts the scheduler's
# decision on placement. This configuration is optional and defaults to
# "service". For a full list of job types and their differences, please see
# the online documentation.
#
# For more information, please see the online documentation at:
#
#   https://www.nomadproject.io/docs/jobspec/schedulers
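# (Illustrative only, not taken from this excerpt: a long-lived monitoring
# service such as this one would typically be declared as a service job.)
# type = "service"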
# The "max_parallel" parameter specifies the maximum number of updates to
# perform in parallel. In this case, this specifies to update a single task
# group at a time.
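# (Illustrative sketch matching the single-task-group rollout described above;
# the value is an assumption, not taken from this template.)
# max_parallel = 1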
health_check = "checks"
# The "min_healthy_time" parameter specifies the minimum time the allocation
# must be in the healthy state before it is marked as healthy and unblocks
# further allocations from being updated.
min_healthy_time = "10s"
# The "healthy_deadline" parameter specifies the deadline by which the
# allocation must be marked as healthy, after which the allocation is
# automatically transitioned to unhealthy. Transitioning to unhealthy will
# fail the deployment and potentially roll back the job if "auto_revert" is
# set to true.
healthy_deadline = "3m"
# The "progress_deadline" parameter specifies the deadline by which an
# allocation must be marked as healthy. The deadline begins when the first
# allocation for the deployment is created and is reset whenever an allocation
# that is part of the deployment transitions to a healthy state. If no
# allocation transitions to the healthy state before the progress deadline,
# the deployment is marked as failed.
progress_deadline = "10m"
# The "canary" parameter specifies that changes to the job that would result
# in destructive updates should create the specified number of canaries
# without stopping any previous allocations. Once the operator determines the
# canaries are healthy, they can be promoted which unblocks a rolling update
# of the remaining allocations at a rate of "max_parallel".
#
# Further, setting "canary" equal to the count of the task group allows
# blue/green deployments. When the job is updated, a full set of the new
# version is deployed and upon promotion the old version is stopped.
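# (Illustrative only; the actual value is not shown in this excerpt. A single
# canary per task group is a common starting point.)
# canary = 1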
# Specifies if the job should auto-promote to the canary version when all
# canaries become healthy during a deployment. Defaults to false which means
# canaries must be manually promoted with the "nomad deployment promote"
# command.
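# (Illustrative only; this template's actual setting is not shown here.)
# auto_promote = true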
# The "auto_revert" parameter specifies if the job should auto-revert to the
# last stable job on deployment failure. A job is marked as stable if all the
# allocations as part of its deployment were marked healthy.
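# (Illustrative only; this template's actual setting is not shown here.)
# auto_revert = true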
# The reschedule stanza specifies the group's rescheduling strategy. If
# specified at the job level, the configuration will apply to all groups
# within the job. If the reschedule stanza is present on both the job and the
# group, they are merged with the group stanza taking the highest precedence
# and then the job.
delay_function = "constant"
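# (Illustrative sketch of the remaining reschedule parameters; the values are
# assumptions, not taken from this excerpt.)
# delay     = "30s"
# unlimited = true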
# The "group" stanza defines a series of tasks that should be co-located on
# the same Nomad client. Any task within a group will be placed on the same
# client.
#
# For more information and examples on the "group" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/group
group "prod-group1-${service_name}" {
# The "count" parameter specifies the number of the task groups that should
# be running under this group. This value must be non-negative and defaults
# to 1.
count = ${group_count}
# The restart stanza configures a task's behavior on task failure. Restarts
# happen on the client that is running the task.
#
#   https://www.nomadproject.io/docs/job-specification/restart
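# (Illustrative sketch of a restart policy; the values below are assumptions,
# not the ones used by this template.)
# restart {
#   interval = "30m"
#   attempts = 2
#   delay    = "15s"
#   mode     = "fail"
# }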
# The volume stanza allows the group to specify that it requires a given
# volume from the cluster.
#
# For more information and examples on the "volume" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/volume
%{ if use_host_volume }
volume "prod-volume1-${service_name}" {
source = "${host_volume}"
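# (Illustrative only; the "type" and "read_only" settings of this volume are
# not shown in this excerpt.)
# type      = "host"
# read_only = false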
# The constraint allows restricting the set of eligible nodes. Constraints
# may filter on attributes or client metadata.
#
# For more information and examples on the "constraint" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/constraint
attribute = "$${attr.cpu.arch}"

attribute = "$${node.class}"
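# (Illustrative sketch of a complete constraint block built around the
# attributes above; the operator and value are assumptions, not taken from
# this template.)
# constraint {
#   attribute = "$${node.class}"
#   operator  = "=="
#   value     = "monitoring"
# }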
# The "task" stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
#
# For more information and examples on the "task" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/task
task "prod-task1-${service_name}" {
# The "driver" parameter specifies the task driver that should be used to
# run the task.
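# (Assumption for illustration only: a statically linked Prometheus binary
# fetched via "artifact" is typically run with the exec driver; the actual
# value is not shown in this excerpt.)
# driver = "exec"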
%{ if use_host_volume }
volume = "prod-volume1-${service_name}"
destination = "${data_dir}"

%{ if use_vault_provider }
policies = "${vault_kv_policy_name}"
# The "config" stanza specifies the driver configuration, which is passed
# directly to the driver to start the task. The details of configurations
# are specific to each driver, so please see specific driver
# documentation for more information.
command = "local/prometheus-${version}.linux-amd64/prometheus"
181 "--config.file=secrets/prometheus.yml",
182 "--storage.tsdb.path=${data_dir}prometheus/",
183 "--storage.tsdb.retention.time=7d"
# The artifact stanza instructs Nomad to fetch and unpack a remote resource,
# such as a file, tarball, or binary. Nomad downloads artifacts using the
# popular go-getter library, which permits downloading artifacts from a
# variety of locations using a URL as the input source.
#
# For more information and examples on the "artifact" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/artifact
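# (Illustrative sketch only; the source URL below is a placeholder and not
# the one rendered by this template.)
# artifact {
#   source = "https://example.org/prometheus-${version}.linux-amd64.tar.gz"
# }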
# The "template" stanza instructs Nomad to manage a template, such as
# a configuration file or script. This template can optionally pull data
# from Consul or Vault to populate runtime configuration data.
#
# For more information and examples on the "template" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/template
change_signal   = "SIGINT"
destination     = "secrets/alerts.yml"
left_delimiter  = "{{{"
right_delimiter = "}}}"
- name: "Jenkins Job Health Exporter"
- alert: JenkinsJobHealthExporterFailures
expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: JenkinsJobHealthExporterUnstable
expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: ConsulServiceHealthcheckFailed
expr: consul_catalog_service_node_healthy == 0
summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
- alert: ConsulMissingMasterNode
expr: consul_raft_peers < 3
summary: "Consul missing master node (instance {{ $labels.instance }})."
description: "The number of Consul raft peers should be 3 in order to preserve quorum."
- alert: ConsulAgentUnhealthy
expr: consul_health_node_status{status="critical"} == 1
summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
description: "A Consul agent is down."
summary: "Prometheus target missing (instance {{ $labels.instance }})."
description: "A Prometheus target has disappeared. An exporter might have crashed."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
summary: "Host out of memory (instance {{ $labels.instance }})."
description: "Node memory is filling up (< 10% left)."
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
summary: "Host OOM kill detected (instance {{ $labels.instance }})."
description: "OOM kill detected."
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
description: "The node is under heavy memory pressure. High rate of major page faults."
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
summary: "Host out of disk space (instance {{ $labels.instance }})."
description: "Disk is almost full (< 10% left)."
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
summary: "Host RAID disk failure (instance {{ $labels.instance }})."
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
summary: "Host conntrack limit (instance {{ $labels.instance }})."
description: "The number of conntrack entries is approaching the limit."
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
description: "SystemD service crashed."
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- alert: MinioDiskOffline
expr: minio_offline_disks > 0
summary: "Minio disk offline (instance {{ $labels.instance }})."
description: "Minio disk is offline."
- alert: MinioStorageSpaceExhausted
expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
description: "Minio storage space is low (< 10 GB)."
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
description: "Prometheus configuration reload error."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
description: "AlertManager configuration reload error."
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
description: "Prometheus is scraping exporters slowly."
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB compaction failures."
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
change_signal = "SIGINT"
destination   = "secrets/prometheus.yml"

evaluation_interval: 5s
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Nomad Cluster'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'nomad-client', 'nomad' ]
- source_labels: [__meta_consul_tags]
regex: '(.*)http(.*)'
metrics_path: /v1/metrics
format: [ 'prometheus' ]
- job_name: 'Consul Cluster'
- targets: [ '10.30.51.22:8500' ]
- targets: [ '10.30.51.24:8500' ]
- targets: [ '10.30.51.25:8500' ]
- targets: [ '10.30.51.26:8500' ]
- targets: [ '10.30.51.28:8500' ]
- targets: [ '10.30.51.29:8500' ]
- targets: [ '10.30.51.30:8500' ]
- targets: [ '10.30.51.39:8500' ]
- targets: [ '10.30.51.40:8500' ]
- targets: [ '10.30.51.50:8500' ]
- targets: [ '10.30.51.51:8500' ]
- targets: [ '10.30.51.65:8500' ]
- targets: [ '10.30.51.66:8500' ]
- targets: [ '10.30.51.67:8500' ]
- targets: [ '10.30.51.68:8500' ]
- targets: [ '10.30.51.70:8500' ]
- targets: [ '10.30.51.71:8500' ]
- targets: [ '10.32.8.14:8500' ]
- targets: [ '10.32.8.15:8500' ]
- targets: [ '10.32.8.16:8500' ]
- targets: [ '10.32.8.17:8500' ]
metrics_path: /v1/agent/metrics
format: [ 'prometheus' ]
- job_name: 'Blackbox Exporter (icmp)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
- targets: [ '10.32.8.17' ]
module: [ 'icmp_v4' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Blackbox Exporter (http)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
module: [ 'http_2xx' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Jenkins Job Health Exporter'
- targets: [ '10.30.51.22:9186' ]
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
replacement: 'jenkins_job_$2'
target_label: __name__
- job_name: 'Node Exporter'
- targets: [ '10.30.51.22:9100' ]
- targets: [ '10.30.51.24:9100' ]
- targets: [ '10.30.51.25:9100' ]
- targets: [ '10.30.51.26:9100' ]
- targets: [ '10.30.51.28:9100' ]
- targets: [ '10.30.51.29:9100' ]
- targets: [ '10.30.51.30:9100' ]
- targets: [ '10.30.51.39:9100' ]
- targets: [ '10.30.51.40:9100' ]
- targets: [ '10.30.51.50:9100' ]
- targets: [ '10.30.51.51:9100' ]
- targets: [ '10.30.51.65:9100' ]
- targets: [ '10.30.51.66:9100' ]
- targets: [ '10.30.51.67:9100' ]
- targets: [ '10.30.51.68:9100' ]
- targets: [ '10.30.51.70:9100' ]
- targets: [ '10.30.51.71:9100' ]
- targets: [ '10.32.8.14:9100' ]
- targets: [ '10.32.8.15:9100' ]
- targets: [ '10.32.8.16:9100' ]
- targets: [ '10.32.8.17:9100' ]
- job_name: 'Alertmanager'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Grafana'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'grafana' ]
- job_name: 'Prometheus'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'prometheus' ]
bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'storage' ]
metrics_path: /minio/prometheus/metrics
# The service stanza instructs Nomad to register a service with Consul.
#
# For more information and examples on the "service" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/service
name = "${service_name}"
port = "${service_name}"
tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]

name = "Prometheus Check Live"
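# (Illustrative sketch of the rest of the health check; the type, path, and
# timing values are assumptions, not taken from this excerpt.)
# type     = "http"
# path     = "/-/healthy"
# interval = "10s"
# timeout  = "2s"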
# The "resources" stanza describes the requirements a task needs to
# execute. Resource requirements include memory, network, cpu, and more.
# This ensures the task will execute on a machine that contains enough
# resource capacity.
#
# For more information and examples on the "resources" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/resources
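# (Illustrative only; the cpu and memory figures below are assumptions, not
# the values rendered by this template.)
# resources {
#   cpu    = 2000
#   memory = 8192
# }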
# The network stanza specifies the networking requirements for the task
# group, including the network mode and port allocations. When scheduling
# jobs in Nomad they are provisioned across your fleet of machines along
# with other jobs and services. Because you don't know in advance what host
# your job will be provisioned on, Nomad will provide your tasks with
# network configuration when they start up.
#
# For more information and examples on the "network" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/network

port "${service_name}" {
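# (Illustrative only; the static port value is an assumption, 9090 being the
# conventional Prometheus port, and is not taken from this excerpt.)
# static = 9090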