terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region = "global"
   5   #
   6   # The "datacenters" parameter specifies the list of datacenters which should
   7   # be considered when placing this task. This must be provided.
   8   datacenters         = "${datacenters}"
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   # For more information, please see the online documentation at:
  16   #
  17   #     https://www.nomadproject.io/docs/jobspec/schedulers
  18   #
  19   type                = "service"
  20
  21   update {
  22     # The "max_parallel" parameter specifies the maximum number of updates to
  23     # perform in parallel. In this case, this specifies to update a single task
  24     # at a time.
  25     max_parallel      = 1
  26
  27     health_check      = "checks"
  28
  29     # The "min_healthy_time" parameter specifies the minimum time the allocation
  30     # must be in the healthy state before it is marked as healthy and unblocks
  31     # further allocations from being updated.
  32     min_healthy_time  = "10s"
  33
  34     # The "healthy_deadline" parameter specifies the deadline in which the
  35     # allocation must be marked as healthy after which the allocation is
  36     # automatically transitioned to unhealthy. Transitioning to unhealthy will
  37     # fail the deployment and potentially roll back the job if "auto_revert" is
  38     # set to true.
  39     healthy_deadline  = "3m"
  40
  41     # The "progress_deadline" parameter specifies the deadline in which an
  42     # allocation must be marked as healthy. The deadline begins when the first
  43     # allocation for the deployment is created and is reset whenever an allocation
  44     # as part of the deployment transitions to a healthy state. If no allocation
  45     # transitions to the healthy state before the progress deadline, the
  46     # deployment is marked as failed.
  47     progress_deadline = "10m"
  48
  49 %{ if use_canary }
  50     # The "canary" parameter specifies that changes to the job that would result
  51     # in destructive updates should create the specified number of canaries
  52     # without stopping any previous allocations. Once the operator determines the
  53     # canaries are healthy, they can be promoted which unblocks a rolling update
  54     # of the remaining allocations at a rate of "max_parallel".
  55     #
  56     # Further, setting "canary" equal to the count of the task group allows
  57     # blue/green deployments. When the job is updated, a full set of the new
  58     # version is deployed and upon promotion the old version is stopped.
  59     canary            = 1
  60
  61     # Specifies if the job should auto-promote to the canary version when all
  62     # canaries become healthy during a deployment. Defaults to false, which
  63     # means canaries must be manually promoted with the
  64     # "nomad deployment promote" command.
  65     auto_promote      = true
  66
  67     # The "auto_revert" parameter specifies if the job should auto-revert to the
  68     # last stable job on deployment failure. A job is marked as stable if all the
  69     # allocations as part of its deployment were marked healthy.
  70     auto_revert       = true
  71 %{ endif }
  72   }
  73
  74   # The "group" stanza defines a series of tasks that should be co-located on
  75   # the same Nomad client. Any task within a group will be placed on the same
  76   # client.
  77   #
  78   # For more information and examples on the "group" stanza, please see
  79   # the online documentation at:
  80   #
  81   #     https://www.nomadproject.io/docs/job-specification/group
  82   #
  83   group "prod-group1-${service_name}" {
  84     # The "count" parameter specifies the number of instances of this task
  85     # group that should be running. This value must be non-negative and
  86     # defaults to 1.
  87     count               = ${group_count}
  88
  89     # The volume stanza allows the group to specify that it requires a given
  90     # volume from the cluster.
  91     #
  92     # For more information and examples on the "volume" stanza, please see
  93     # the online documentation at:
  94     #
  95     #     https://www.nomadproject.io/docs/job-specification/volume
  96     #
  97     %{ if use_host_volume }
  98     volume "prod-volume1-${service_name}" {
  99       type              = "host"
 100       read_only         = false
 101       source            = "${host_volume}"
 102     }
 103     %{ endif }
 104
 105     # The constraint allows restricting the set of eligible nodes. Constraints
 106     # may filter on attributes or client metadata.
 107     #
 108     # For more information and examples on the "constraint" stanza, please see
 109     # the online documentation at:
 110     #
 111     #     https://www.nomadproject.io/docs/job-specification/constraint
 112     #
 113     constraint {
 114       operator  = "!="  # exclude nodes whose CPU architecture equals the value below
 115       attribute = "$${attr.cpu.arch}"
 116       value     = "arm64"
 117     }
 118
 119     # The "task" stanza creates an individual unit of work, such as a Docker
 120     # container, web application, or batch processing.
 121     #
 122     # For more information and examples on the "task" stanza, please see
 123     # the online documentation at:
 124     #
 125     #     https://www.nomadproject.io/docs/job-specification/task
 126     #
 127     task "prod-task1-${service_name}" {
 128       # The "driver" parameter specifies the task driver that should be used to
 129       # run the task.
 130       driver            = "exec"
 131
 132       %{ if use_host_volume }
 133       volume_mount {
 134         volume          = "prod-volume1-${service_name}"
 135         destination     = "${data_dir}"
 136         read_only       = false
 137       }
 138       %{ endif }
 139
 140       %{ if use_vault_provider }
 141       vault {
 142         policies        = "${vault_kv_policy_name}"
 143       }
 144       %{ endif }
 145
 146       # The "config" stanza specifies the driver configuration, which is passed
 147       # directly to the driver to start the task. The details of configurations
 148       # are specific to each driver, so please see specific driver
 149       # documentation for more information.
 150       config {
 151         command         = "local/prometheus-${version}.linux-amd64/prometheus"
 152         args            = [
 153           "--config.file=secrets/prometheus.yml",
 154           "--storage.tsdb.path=${data_dir}prometheus/",
 155           "--storage.tsdb.retention.time=15d"
 156         ]
 157       }
 158
 159       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 160       # such as a file, tarball, or binary. Nomad downloads artifacts using the
 161       # popular go-getter library, which permits downloading artifacts from a
 162       # variety of locations using a URL as the input source.
 163       #
 164       # For more information and examples on the "artifact" stanza, please see
 165       # the online documentation at:
 166       #
 167       #     https://www.nomadproject.io/docs/job-specification/artifact
 168       #
 169       artifact {
 170         source          = "${url}"
 171       }
 172
 173       # The "template" stanza instructs Nomad to manage a template, such as
 174       # a configuration file or script. This template can optionally pull data
 175       # from Consul or Vault to populate runtime configuration data.
 176       #
 177       # For more information and examples on the "template" stanza, please see
 178       # the online documentation at:
 179       #
 180       #     https://www.nomadproject.io/docs/job-specification/template
 181       #
 182       template {
 183         change_mode     = "noop"
 184         change_signal   = "SIGINT"
 185         destination     = "secrets/alerts.yml"
 186         left_delimiter  = "{{{"
 187         right_delimiter = "}}}"
 188         data            = <<EOH
 189 ---
 190 groups:
 191 - name: "Jenkins Job Health Exporter"
 192   rules:
 193   - alert: JenkinsJobHealthExporterFailures
 194     expr: jenkins_job_failure{id=~".*"} >= 10
 195     for: 0m
 196     labels:
 197       severity: critical
 198     annotations:
 199       summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
 200       description: "Job: {{ $labels.id }}"
 201   - alert: JenkinsJobHealthExporterUnstable
 202     expr: jenkins_job_unstable{id=~".*"} >= 10
 203     for: 0m
 204     labels:
 205       severity: warning
 206     annotations:
 207       summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
 208       description: "Job: {{ $labels.id }}"
 209 - name: "Consul"
 210   rules:
 211   - alert: ConsulServiceHealthcheckFailed
 212     expr: consul_catalog_service_node_healthy == 0
 213     for: 0m
 214     labels:
 215       severity: critical
 216     annotations:
 217       summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
 218       description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
 219   - alert: ConsulMissingMasterNode
 220     expr: consul_raft_peers < 3
 221     for: 0m
 222     labels:
 223       severity: critical
 224     annotations:
 225       summary: "Consul missing master node (instance {{ $labels.instance }})."
 226       description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
 227   - alert: ConsulAgentUnhealthy
 228     expr: consul_health_node_status{status="critical"} == 1
 229     for: 0m
 230     labels:
 231       severity: critical
 232     annotations:
 233       summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
 234       description: "A Consul agent is down."
 235 - name: "Hosts"
 236   rules:
 237   - alert: NodeDown
 238     expr: up == 0
 239     for: 0m
 240     labels:
 241       severity: critical
 242     annotations:
 243       summary: "Prometheus target missing (instance {{ $labels.instance }})."
 244       description: "A Prometheus target has disappeared. An exporter might be crashed."
 245   - alert: HostHighCpuLoad
 246     expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
 247     for: 0m
 248     labels:
 249       severity: warning
 250     annotations:
 251       summary: "Host high CPU load (instance {{ $labels.instance }})."
 252       description: "CPU load is > 95%."
 253   - alert: HostOutOfMemory
 254     expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
 255     for: 2m
 256     labels:
 257       severity: warning
 258     annotations:
 259       summary: "Host out of memory (instance {{ $labels.instance }})."
 260       description: "Node memory is filling up (< 10% left)."
 261   - alert: HostOomKillDetected
 262     expr: increase(node_vmstat_oom_kill[1m]) > 0
 263     for: 0m
 264     labels:
 265       severity: warning
 266     annotations:
 267       summary: "Host OOM kill detected (instance {{ $labels.instance }})."
 268       description: "OOM kill detected."
 269   - alert: HostMemoryUnderMemoryPressure
 270     expr: rate(node_vmstat_pgmajfault[1m]) > 1000
 271     for: 2m
 272     labels:
 273       severity: warning
 274     annotations:
 275       summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
 276       description: "The node is under heavy memory pressure. High rate of major page faults."
 277   - alert: HostOutOfDiskSpace
 278     expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
 279     for: 2m
 280     labels:
 281       severity: warning
 282     annotations:
 283       summary: "Host out of disk space (instance {{ $labels.instance }})."
 284       description: "Disk is almost full (< 10% left)."
 285   - alert: HostRaidDiskFailure
 286     expr: node_md_disks{state="failed"} > 0
 287     for: 2m
 288     labels:
 289       severity: warning
 290     annotations:
 291       summary: "Host RAID disk failure (instance {{ $labels.instance }})."
 292       description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
 293   - alert: HostConntrackLimit  # conntrack entries above 0.8 of the configured limit
 294     expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
 295     for: 5m
 296     labels:
 297       severity: warning
 298     annotations:
 299       summary: "Host conntrack limit (instance {{ $labels.instance }})."
 300       description: "The number of conntrack entries is approaching the limit."
 301   - alert: HostNetworkInterfaceSaturated  # rx + tx rate above 0.8 of link speed; tap devices are ignored
 302     expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
 303     for: 1m
 304     labels:
 305       severity: warning
 306     annotations:
 307       summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
 308       description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
 309   - alert: HostSystemdServiceCrashed
 310     expr: node_systemd_unit_state{state="failed"} == 1
 311     for: 0m
 312     labels:
 313       severity: warning
 314     annotations:
 315       summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
 316       description: "SystemD service crashed."
 317   - alert: HostEdacCorrectableErrorsDetected  # any increase of the correctable error counter within 1m
 318     expr: increase(node_edac_correctable_errors_total[1m]) > 0
 319     for: 0m
 320     labels:
 321       severity: info
 322     annotations:
 323       summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
 324       description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
 325   - alert: HostEdacUncorrectableErrorsDetected  # compares the raw metric value, no time window is applied
 326     expr: node_edac_uncorrectable_errors_total > 0
 327     for: 0m
 328     labels:
 329       severity: warning
 330     annotations:
 331       summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
 332       description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
 333 - name: "Min.io"
 334   rules:
 335   - alert: MinioDiskOffline
 336     expr: minio_offline_disks > 0
 337     for: 0m
 338     labels:
 339       severity: critical
 340     annotations:
 341       summary: "Minio disk offline (instance {{ $labels.instance }})"
 342       description: "Minio disk is offline."
 343   - alert: MinioStorageSpaceExhausted
 344     expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
 345     for: 2m
 346     labels:
 347       severity: warning
 348     annotations:
 349       summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
 350       description: "Minio storage space is low (< 10 GB)."
 351 - name: "Prometheus"
 352   rules:
 353   - alert: PrometheusConfigurationReloadFailure
 354     expr: prometheus_config_last_reload_successful != 1
 355     for: 0m
 356     labels:
 357       severity: warning
 358     annotations:
 359       summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
 360       description: "Prometheus configuration reload error."
 361   - alert: PrometheusTooManyRestarts
 362     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
 363     for: 0m
 364     labels:
 365       severity: warning
 366     annotations:
 367       summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
 368       description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
 369   - alert: PrometheusAlertmanagerConfigurationReloadFailure
 370     expr: alertmanager_config_last_reload_successful != 1
 371     for: 0m
 372     labels:
 373       severity: warning
 374     annotations:
 375       summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
 376       description: "AlertManager configuration reload error."
 377   - alert: PrometheusRuleEvaluationFailures
 378     expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 379     for: 0m
 380     labels:
 381       severity: critical
 382     annotations:
 383       summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
 384       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
 385   - alert: PrometheusTargetScrapingSlow
 386     expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
 387     for: 5m
 388     labels:
 389       severity: warning
 390     annotations:
 391       summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
 392       description: "Prometheus is scraping exporters slowly."
 393   - alert: PrometheusTsdbCompactionsFailed
 394     expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
 395     for: 0m
 396     labels:
 397       severity: critical
 398     annotations:
 399       summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
 400       description: "Prometheus encountered {{ $value }} TSDB compactions failures."
 401   - alert: PrometheusTsdbHeadTruncationsFailed
 402     expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
 403     for: 0m
 404     labels:
 405       severity: critical
 406     annotations:
 407       summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
 408       description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
 409   - alert: PrometheusTsdbWalCorruptions
 410     expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
 411     for: 0m
 412     labels:
 413       severity: critical
 414     annotations:
 415       summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
 416       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
 417   - alert: PrometheusTsdbWalTruncationsFailed
 418     expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
 419     for: 0m
 420     labels:
 421       severity: critical
 422     annotations:
 423       summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
 424       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
 425 EOH
 426       }
 427
 428       template {
 429         change_mode     = "noop"
 430         change_signal   = "SIGINT"
 431         destination     = "secrets/prometheus.yml"
 432         data            = <<EOH
 433 ---
 434 global:
 435   scrape_interval:     5s
 436   scrape_timeout:      5s
 437   evaluation_interval: 5s
 438
 439 alerting:
 440   alertmanagers:
 441   - consul_sd_configs:
 442     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 443       services: [ 'alertmanager' ]
 444
 445 rule_files:
 446   - 'alerts.yml'
 447
 448 scrape_configs:
 449
 450   - job_name: 'Nomad Cluster'
 451     consul_sd_configs:
 452     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 453       services: [ 'nomad-client', 'nomad' ]
 454     relabel_configs:
 455     - source_labels: [__meta_consul_tags]
 456       regex: '(.*)http(.*)'
 457       action: keep
 458     metrics_path: /v1/metrics
 459     params:
 460       format: [ 'prometheus' ]
 461
 462   - job_name: 'Consul Cluster'
 463     static_configs:
 464       - targets: [ '10.30.51.28:8500' ]
 465       - targets: [ '10.30.51.29:8500' ]
 466       - targets: [ '10.30.51.30:8500' ]
 467       - targets: [ '10.30.51.32:8500' ]
 468       - targets: [ '10.30.51.33:8500' ]
 469       - targets: [ '10.30.51.34:8500' ]
 470       - targets: [ '10.30.51.35:8500' ]
 471       - targets: [ '10.30.51.39:8500' ]
 472       - targets: [ '10.30.51.40:8500' ]
 473       - targets: [ '10.30.51.50:8500' ]
 474       - targets: [ '10.30.51.51:8500' ]
 475       - targets: [ '10.30.51.65:8500' ]
 476       - targets: [ '10.30.51.66:8500' ]
 477       - targets: [ '10.30.51.67:8500' ]
 478       - targets: [ '10.30.51.68:8500' ]
 479       - targets: [ '10.30.51.70:8500' ]
 480       - targets: [ '10.30.51.71:8500' ]
 481       - targets: [ '10.32.8.14:8500' ]
 482       - targets: [ '10.32.8.15:8500' ]
 483       - targets: [ '10.32.8.16:8500' ]
 484       - targets: [ '10.32.8.17:8500' ]
 485     metrics_path: /v1/agent/metrics
 486     params:
 487       format: [ 'prometheus' ]
 488
 489   - job_name: 'Blackbox Exporter (icmp)'
 490     static_configs:
 491       - targets: [ 'gerrit.fd.io' ]
 492       - targets: [ 'jenkins.fd.io' ]
 493       - targets: [ '10.30.51.32' ]
 494     params:
 495       module: [ 'icmp_v4' ]
 496     relabel_configs:
 497       - source_labels: [__address__]
 498         target_label: __param_target
 499       - source_labels: [__param_target]
 500         target_label: instance
 501       - target_label: __address__
 502         replacement: localhost:9115
 503     metrics_path: /probe
 504
 505   - job_name: 'Blackbox Exporter (http)'
 506     static_configs:
 507       - targets: [ 'gerrit.fd.io' ]
 508       - targets: [ 'jenkins.fd.io' ]
 509     params:
 510       module: [ 'http_2xx' ]
 511     relabel_configs:
 512       - source_labels: [__address__]
 513         target_label: __param_target
 514       - source_labels: [__param_target]
 515         target_label: instance
 516       - target_label: __address__
 517         replacement: localhost:9115
 518     metrics_path: /probe
 519
 520   - job_name: 'cAdvisor Exporter'
 521     static_configs:
 522       - targets: [ '10.30.51.28:8080' ]
 523       - targets: [ '10.30.51.29:8080' ]
 524       - targets: [ '10.30.51.30:8080' ]
 525       #- targets: [ '10.30.51.32:8080' ]
 526       - targets: [ '10.30.51.33:8080' ]
 527       - targets: [ '10.30.51.34:8080' ]
 528       - targets: [ '10.30.51.35:8080' ]
 529       - targets: [ '10.30.51.39:8080' ]
 530       - targets: [ '10.30.51.40:8080' ]
 531       - targets: [ '10.30.51.50:8080' ]
 532       - targets: [ '10.30.51.51:8080' ]
 533       - targets: [ '10.30.51.65:8080' ]
 534       - targets: [ '10.30.51.66:8080' ]
 535       - targets: [ '10.30.51.67:8080' ]
 536       - targets: [ '10.30.51.68:8080' ]
 537       - targets: [ '10.30.51.70:8080' ]
 538       - targets: [ '10.30.51.71:8080' ]
 539       - targets: [ '10.32.8.14:8080' ]
 540       - targets: [ '10.32.8.15:8080' ]
 541       - targets: [ '10.32.8.16:8080' ]
 542       - targets: [ '10.32.8.17:8080' ]
 543
 544   - job_name: 'Jenkins Job Health Exporter'
 545     static_configs:
 546       - targets: [ '10.30.51.32:9186' ]
 547     metric_relabel_configs:
 548       - source_labels: [ __name__ ]
 549         regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
 550         action: replace
 551         replacement: '$1'
 552         target_label: id
 553       - source_labels: [ __name__ ]
 554         regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
 555         replacement: 'jenkins_job_$2'
 556         target_label: __name__
 557
 558   - job_name: 'Node Exporter'
 559     static_configs:
 560       - targets: [ '10.30.51.28:9100' ]
 561       - targets: [ '10.30.51.29:9100' ]
 562       - targets: [ '10.30.51.30:9100' ]
 563       - targets: [ '10.30.51.32:9100' ]
 564       - targets: [ '10.30.51.33:9100' ]
 565       - targets: [ '10.30.51.34:9100' ]
 566       - targets: [ '10.30.51.35:9100' ]
 567       - targets: [ '10.30.51.39:9100' ]
 568       - targets: [ '10.30.51.40:9100' ]
 569       - targets: [ '10.30.51.50:9100' ]
 570       - targets: [ '10.30.51.51:9100' ]
 571       - targets: [ '10.30.51.65:9100' ]
 572       - targets: [ '10.30.51.66:9100' ]
 573       - targets: [ '10.30.51.67:9100' ]
 574       - targets: [ '10.30.51.68:9100' ]
 575       - targets: [ '10.30.51.70:9100' ]
 576       - targets: [ '10.30.51.71:9100' ]
 577       - targets: [ '10.32.8.14:9100' ]
 578       - targets: [ '10.32.8.15:9100' ]
 579       - targets: [ '10.32.8.16:9100' ]
 580       - targets: [ '10.32.8.17:9100' ]
 581
 582   - job_name: 'Alertmanager'
 583     consul_sd_configs:
 584     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 585       services: [ 'alertmanager' ]
 586
 587   - job_name: 'Grafana'
 588     consul_sd_configs:
 589     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 590       services: [ 'grafana' ]
 591
 592   - job_name: 'Prometheus'
 593     consul_sd_configs:
 594     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 595       services: [ 'prometheus' ]
 596
 597   - job_name: 'Minio'  # NOTE(review): the bearer token below is a credential committed in plain text - consider rendering it from Vault instead
 598     bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
 599     consul_sd_configs:
 600     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
 601       services: [ 'storage' ]
 602     metrics_path: /minio/prometheus/metrics
 603 EOH
 604       }
 605
 606       # The service stanza instructs Nomad to register a service with Consul.
 607       #
 608       # For more information and examples on the "service" stanza, please see
 609       # the online documentation at:
 610       #
 611       #     https://www.nomadproject.io/docs/job-specification/service
 612       #
 613       service {
 614         name = "${service_name}"
 615         tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
 616         port = "${service_name}"  # same label as the port declared in the network stanza
 617         check {
 618           name     = "Prometheus Check Live"
 619           path     = "/-/healthy"
 620           type     = "http"
 621           timeout  = "2s"
 622           interval = "10s"
 623         }
 624       }
 625
 626       # The "resources" stanza describes the requirements a task needs to
 627       # execute. Resource requirements include memory, network, cpu, and more.
 628       # This ensures the task will execute on a machine that contains enough
 629       # resource capacity.
 630       #
 631       # For more information and examples on the "resources" stanza, please see
 632       # the online documentation at:
 633       #
 634       #     https://www.nomadproject.io/docs/job-specification/resources
 635       #
 636       resources {
 637         cpu             = ${cpu}
 638         memory          = ${mem}
 639         # The network stanza specifies the networking requirements for the task
 640         # group, including the network mode and port allocations. When scheduling
 641         # jobs in Nomad they are provisioned across your fleet of machines along
 642         # with other jobs and services. Because you don't know in advance what host
 643         # your job will be provisioned on, Nomad will provide your tasks with
 644         # network configuration when they start up.
 645         #
 646         # For more information and examples on the "network" stanza, please see
 647         # the online documentation at:
 648         #
 649         #     https://www.nomadproject.io/docs/job-specification/network
 650         #
 651         network {
 652           port "${service_name}" {
 653             static      = ${port}
 654           }
 655         }
 656       }
 657     }
 658   }
 659 }