# The "region" parameter specifies the region in which to execute the job.
# If omitted, this inherits the default region name of "global".
# The "datacenters" parameter specifies the list of datacenters which should
# be considered when placing this task. This must be provided.
datacenters = "${datacenters}"
# The "type" parameter controls the type of job, which impacts the scheduler's
# decision on placement. This configuration is optional and defaults to
# "service". For a full list of job types and their differences, please see
# the online documentation.
# For more information, please see the online documentation at:
# https://www.nomadproject.io/docs/jobspec/schedulers
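# As an illustrative sketch only (not a value set by this job), a one-shot
# workload would override the default like this:
#
# type = "batch"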
# The "max_parallel" parameter specifies the maximum number of updates to
# perform in parallel. In this case, a single task group is updated at a time.
health_check = "checks"
# The "min_healthy_time" parameter specifies the minimum time the allocation
# must be in the healthy state before it is marked as healthy and unblocks
# further allocations from being updated.
min_healthy_time = "10s"
# The "healthy_deadline" parameter specifies the deadline by which the
# allocation must be marked as healthy, after which the allocation is
# automatically transitioned to unhealthy. Transitioning to unhealthy will
# fail the deployment and potentially roll back the job if "auto_revert" is
# set to true.
healthy_deadline = "3m"
# The "progress_deadline" parameter specifies the deadline in which an
# allocation must be marked as healthy. The deadline begins when the first
# allocation for the deployment is created and is reset whenever an allocation
# as part of the deployment transitions to a healthy state. If no allocation
# transitions to the healthy state before the progress deadline, the
# deployment is marked as failed.
progress_deadline = "10m"
# The "canary" parameter specifies that changes to the job that would result
# in destructive updates should create the specified number of canaries
# without stopping any previous allocations. Once the operator determines the
# canaries are healthy, they can be promoted which unblocks a rolling update
# of the remaining allocations at a rate of "max_parallel".
# Further, setting "canary" equal to the count of the task group allows
# blue/green deployments. When the job is updated, a full set of the new
# version is deployed and upon promotion the old version is stopped.
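# As an illustrative sketch (the numbers below are assumptions, not this
# job's settings), a task group with count = 2 would get a blue/green
# rollout by also setting:
#
# canary = 2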
# Specifies if the job should auto-promote to the canary version when all
# canaries become healthy during a deployment. Defaults to false which means
# canaries must be manually promoted with the "nomad deployment promote"
# command.
# The "auto_revert" parameter specifies if the job should auto-revert to the
# last stable job on deployment failure. A job is marked as stable if all the
# allocations as part of its deployment were marked healthy.
# The reschedule stanza specifies the group's rescheduling strategy. If
# specified at the job level, the configuration will apply to all groups
# within the job. If the reschedule stanza is present on both the job and the
# group, they are merged with the group stanza taking the highest precedence
# and then the job.
delay_function = "constant"
# The "group" stanza defines a series of tasks that should be co-located on
# the same Nomad client. Any task within a group will be placed on the same
# client.
# For more information and examples on the "group" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/group
group "prod-group1-${service_name}" {
# The "count" parameter specifies the number of task groups that should
# be running under this group. This value must be non-negative and defaults
# to 1.
count = ${group_count}
# The restart stanza configures a task's behavior on task failure. Restarts
# happen on the client that is running the task.
# https://www.nomadproject.io/docs/job-specification/restart
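# A minimal sketch of a restart policy; the values are illustrative
# assumptions, not the ones configured for this job:
#
# restart {
#   interval = "30m"
#   attempts = 2
#   delay    = "15s"
#   mode     = "fail"
# }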
# The volume stanza allows the group to specify that it requires a given
# volume from the cluster.
# For more information and examples on the "volume" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/volume
%{ if use_host_volume }
volume "prod-volume1-${service_name}" {
source = "${host_volume}"
# The constraint allows restricting the set of eligible nodes. Constraints
# may filter on attributes or client metadata.
# For more information and examples on the "constraint" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/constraint
attribute = "$${attr.cpu.arch}"
# The "task" stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
# For more information and examples on the "task" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/task
task "prod-task1-${service_name}" {
# The "driver" parameter specifies the task driver that should be used to
# run the task.
%{ if use_host_volume }
volume = "prod-volume1-${service_name}"
destination = "${data_dir}"
%{ if use_vault_provider }
policies = "${vault_kv_policy_name}"
# The "config" stanza specifies the driver configuration, which is passed
# directly to the driver to start the task. The details of configurations
# are specific to each driver, so please see specific driver
# documentation for more information.
command = "local/prometheus-${version}.linux-amd64/prometheus"
"--config.file=secrets/prometheus.yml",
"--storage.tsdb.path=${data_dir}prometheus/",
"--storage.tsdb.retention.time=15d"
# The artifact stanza instructs Nomad to fetch and unpack a remote resource,
# such as a file, tarball, or binary. Nomad downloads artifacts using the
# popular go-getter library, which permits downloading artifacts from a
# variety of locations using a URL as the input source.
# For more information and examples on the "artifact" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/artifact
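# A sketch of the artifact syntax; the go-getter source below is an assumed
# example of a Prometheus release tarball, not necessarily the URL this job
# fetches:
#
# artifact {
#   source = "https://github.com/prometheus/prometheus/releases/download/v${version}/prometheus-${version}.linux-amd64.tar.gz"
# }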
# The "template" stanza instructs Nomad to manage a template, such as
# a configuration file or script. This template can optionally pull data
# from Consul or Vault to populate runtime configuration data.
# For more information and examples on the "template" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/template
change_signal = "SIGINT"
destination = "secrets/alerts.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
- name: "Jenkins Job Health Exporter"
- alert: JenkinsJobHealthExporterFailures
expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: JenkinsJobHealthExporterUnstable
expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: ConsulServiceHealthcheckFailed
expr: consul_catalog_service_node_healthy == 0
summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
- alert: ConsulMissingMasterNode
expr: consul_raft_peers < 3
summary: "Consul missing master node (instance {{ $labels.instance }})."
description: "The number of Consul raft peers should be 3 in order to preserve quorum."
- alert: ConsulAgentUnhealthy
expr: consul_health_node_status{status="critical"} == 1
summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
description: "A Consul agent is down."
summary: "Prometheus target missing (instance {{ $labels.instance }})."
description: "A Prometheus target has disappeared. An exporter might have crashed."
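# The rule below derives CPU busy percentage: 100 minus the per-instance
# average rate of the "idle" CPU mode, scaled to percent.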
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
summary: "Host high CPU load (instance {{ $labels.instance }})."
description: "CPU load is > 95%."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
summary: "Host out of memory (instance {{ $labels.instance }})."
description: "Node memory is filling up (< 10% left)."
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
summary: "Host OOM kill detected (instance {{ $labels.instance }})."
description: "OOM kill detected."
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
description: "The node is under heavy memory pressure. High rate of major page faults."
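# The rule below computes remaining disk space as a percentage; the
# "and ON (instance, device, mountpoint)" join excludes read-only filesystems.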
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
summary: "Host out of disk space (instance {{ $labels.instance }})."
description: "Disk is almost full (< 10% left)."
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
summary: "Host RAID disk failure (instance {{ $labels.instance }})."
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
summary: "Host conntrack limit (instance {{ $labels.instance }})."
description: "The number of conntrack entries is approaching the limit."
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
description: "SystemD service crashed."
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- alert: MinioDiskOffline
expr: minio_offline_disks > 0
summary: "Minio disk offline (instance {{ $labels.instance }})"
description: "Minio disk is offline."
- alert: MinioStorageSpaceExhausted
expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
description: "Minio storage space is low (< 10 GB)."
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
description: "Prometheus configuration reload error."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
description: "AlertManager configuration reload error."
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
description: "Prometheus is scraping exporters slowly."
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB compaction failures."
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
change_signal = "SIGINT"
destination = "secrets/prometheus.yml"
evaluation_interval: 5s
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Nomad Cluster'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'nomad-client', 'nomad' ]
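# The relabeling below (presumably paired with a "keep" action) limits
# scraping to Consul-registered services tagged "http" and pulls Nomad
# telemetry from /v1/metrics in Prometheus format.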
- source_labels: [__meta_consul_tags]
regex: '(.*)http(.*)'
metrics_path: /v1/metrics
format: [ 'prometheus' ]
- job_name: 'Consul Cluster'
- targets: [ '10.30.51.28:8500' ]
- targets: [ '10.30.51.29:8500' ]
- targets: [ '10.30.51.30:8500' ]
- targets: [ '10.30.51.32:8500' ]
- targets: [ '10.30.51.33:8500' ]
- targets: [ '10.30.51.34:8500' ]
- targets: [ '10.30.51.35:8500' ]
- targets: [ '10.30.51.39:8500' ]
- targets: [ '10.30.51.40:8500' ]
- targets: [ '10.30.51.50:8500' ]
- targets: [ '10.30.51.51:8500' ]
- targets: [ '10.30.51.65:8500' ]
- targets: [ '10.30.51.66:8500' ]
- targets: [ '10.30.51.67:8500' ]
- targets: [ '10.30.51.68:8500' ]
- targets: [ '10.30.51.70:8500' ]
- targets: [ '10.30.51.71:8500' ]
- targets: [ '10.32.8.14:8500' ]
- targets: [ '10.32.8.15:8500' ]
- targets: [ '10.32.8.16:8500' ]
- targets: [ '10.32.8.17:8500' ]
metrics_path: /v1/agent/metrics
format: [ 'prometheus' ]
- job_name: 'Blackbox Exporter (icmp)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
- targets: [ '10.30.51.32' ]
module: [ 'icmp_v4' ]
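# Standard blackbox-exporter relabeling: each target becomes the probe's
# "target" parameter and its "instance" label, while the scrape address is
# rewritten to the local blackbox exporter on port 9115.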
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Blackbox Exporter (http)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
module: [ 'http_2xx' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'cAdvisor Exporter'
- targets: [ '10.30.51.28:8080' ]
- targets: [ '10.30.51.29:8080' ]
- targets: [ '10.30.51.30:8080' ]
#- targets: [ '10.30.51.32:8080' ]
- targets: [ '10.30.51.33:8080' ]
- targets: [ '10.30.51.34:8080' ]
- targets: [ '10.30.51.35:8080' ]
- targets: [ '10.30.51.39:8080' ]
- targets: [ '10.30.51.40:8080' ]
- targets: [ '10.30.51.50:8080' ]
- targets: [ '10.30.51.51:8080' ]
- targets: [ '10.30.51.65:8080' ]
- targets: [ '10.30.51.66:8080' ]
- targets: [ '10.30.51.67:8080' ]
- targets: [ '10.30.51.68:8080' ]
- targets: [ '10.30.51.70:8080' ]
- targets: [ '10.30.51.71:8080' ]
- targets: [ '10.32.8.14:8080' ]
- targets: [ '10.32.8.15:8080' ]
- targets: [ '10.32.8.16:8080' ]
- targets: [ '10.32.8.17:8080' ]
- job_name: 'Jenkins Job Health Exporter'
- targets: [ '10.30.51.32:9186' ]
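# The metric relabeling below filters for the vpp_*/csit_* job metrics and
# renames them to jenkins_job_<suffix> (e.g. jenkins_job_failure), the form
# queried by the alert rules above.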
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
replacement: 'jenkins_job_$2'
target_label: __name__
- job_name: 'Node Exporter'
- targets: [ '10.30.51.28:9100' ]
- targets: [ '10.30.51.29:9100' ]
- targets: [ '10.30.51.30:9100' ]
- targets: [ '10.30.51.32:9100' ]
- targets: [ '10.30.51.33:9100' ]
- targets: [ '10.30.51.34:9100' ]
- targets: [ '10.30.51.35:9100' ]
- targets: [ '10.30.51.39:9100' ]
- targets: [ '10.30.51.40:9100' ]
- targets: [ '10.30.51.50:9100' ]
- targets: [ '10.30.51.51:9100' ]
- targets: [ '10.30.51.65:9100' ]
- targets: [ '10.30.51.66:9100' ]
- targets: [ '10.30.51.67:9100' ]
- targets: [ '10.30.51.68:9100' ]
- targets: [ '10.30.51.70:9100' ]
- targets: [ '10.30.51.71:9100' ]
- targets: [ '10.32.8.14:9100' ]
- targets: [ '10.32.8.15:9100' ]
- targets: [ '10.32.8.16:9100' ]
- targets: [ '10.32.8.17:9100' ]
- job_name: 'Alertmanager'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Grafana'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'grafana' ]
- job_name: 'Prometheus'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'prometheus' ]
bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'storage' ]
metrics_path: /minio/prometheus/metrics
# The service stanza instructs Nomad to register a service with Consul.
# For more information and examples on the "service" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/service
name = "${service_name}"
port = "${service_name}"
tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
name = "Prometheus Check Live"
# The "resources" stanza describes the requirements a task needs to
# execute. Resource requirements include memory, network, cpu, and more.
# This ensures the task will execute on a machine that contains enough
# resource capacity.
# For more information and examples on the "resources" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/resources
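# A minimal sketch of a resources block; the figures are illustrative
# assumptions, not the values reserved by this job:
#
# resources {
#   cpu    = 2000
#   memory = 8192
# }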
# The network stanza specifies the networking requirements for the task
# group, including the network mode and port allocations. When scheduling
# jobs in Nomad they are provisioned across your fleet of machines along
# with other jobs and services. Because you don't know in advance what host
# your job will be provisioned on, Nomad will provide your tasks with
# network configuration when they start up.
# For more information and examples on the "network" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/network
port "${service_name}" {