Infra: JenkinsJobHealthExporter

author pmikus <pmikus@cisco.com>

Fri, 5 Feb 2021 14:51:43 +0000 (14:51 +0000)

committer Peter Mikus <pmikus@cisco.com>

Wed, 10 Feb 2021 09:04:46 +0000 (09:04 +0000)
author pmikus <pmikus@cisco.com>
Fri, 5 Feb 2021 14:51:43 +0000 (14:51 +0000)
committer Peter Mikus <pmikus@cisco.com>
Wed, 10 Feb 2021 09:04:46 +0000 (09:04 +0000)
diff --git a/resources/tools/testbed-setup/ansible/nomad.yaml b/resources/tools/testbed-setup/ansible/nomad.yaml

index 134fffc..db1c396 100644 (file)
--- a/resources/tools/testbed-setup/ansible/nomad.yaml
+++ b/resources/tools/testbed-setup/ansible/nomad.yaml
@@ -18,5 +18,7 @@
        tags: consul
      - role: prometheus_exporter
        tags: prometheus_exporter
+    - role: jenkins_job_health_exporter
+      tags: jenkins_job_health_exporter
      - role: cadvisor
        tags: cadvisor
 \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml

new file mode 100644 (file)

index 0000000..9813d41
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml
@@ -0,0 +1,35 @@
+---
+# file: roles/jenkins_job_health_exporter/defaults/main.yaml
+
+# Conf - Jenkins Job Health Exporter.
+jenkins_host: "jenkins.fd.io"
+poll_interval_sec: 1800
+req_timeout_sec: 30
+bind_to: "0.0.0.0:9186"
+last_builds: 10
+jobs:
+  - "vpp-csit-verify-api-crc-master"
+  - "vpp-beta-verify-master-ubuntu2004-aarch64"
+  - "vpp-verify-master-centos8-aarch64"
+  - "vpp-verify-master-ubuntu1804-aarch64"
+  - "vpp-gcc-verify-master-ubuntu2004-x86_64"
+  - "vpp-verify-master-centos8-x86_64"
+  - "vpp-verify-master-debian10-x86_64"
+  - "vpp-verify-master-ubuntu2004-x86_64"
+  - "vpp-verify-master-ubuntu1804-x86_64"
+  - "vpp-debug-verify-master-ubuntu2004-x86_64"
+  - "vpp-checkstyle-verify-master-ubuntu2004-x86_64"
+  - "vpp-sphinx-docs-verify-master-ubuntu1804-x86_64"
+  - "vpp-docs-verify-master-ubuntu1804-x86_64"
+  - "vpp-make-test-docs-verify-master-ubuntu1804-x86_64"
+  - "vpp-csit-verify-device-master-1n-skx"
+  - "vpp-csit-verify-device-master-1n-tx2"
+
+# Conf - Service.
+jenkins_job_health_exporter_restart_handler_state: "restarted"
+
+# Inst - System paths.
+jenkins_job_health_exporter_target_dir: "/usr/bin"
+jenkins_job_health_exporter_conf_dir: "/etc"
+jenkins_job_health_exporter_url: "https://github.com/ayourtch/jenkins-job-health-exporter/releases/download"
+jenkins_job_health_exporter_version: "v0.0.3"
+\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml

new file mode 100644 (file)

index 0000000..29fee98
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml
@@ -0,0 +1,9 @@
+---
+# file roles/jenkins_job_health_exporter/handlers/main.yaml
+
+- name: Restart Jenkins Job Health Exporter
+  systemd:
+    daemon_reload: true
+    enabled: true
+    name: "jenkins-job-health-exporter"
+    state: "{{ jenkins_job_health_exporter_restart_handler_state }}"
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml

new file mode 100644 (file)

index 0000000..5dbe476
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml
@@ -0,0 +1,38 @@
+---
+# file: roles/jenkins_job_health_exporter/tasks/main.yaml
+
+- name: Conf - Jenkins Job Health Exporter Config
+  template:
+    src: "templates/jenkins-job-health-exporter.j2"
+    dest: "/etc/jenkins-job-health-exporter.json"
+    owner: "root"
+    group: "root"
+    mode: "0644"
+  when:
+    - ansible_hostname == "s42-nomad"
+  tags:
+    - conf-jenkins-job-json
+
+- name: Inst - Jenkins Job Health Exporter Binary
+  get_url:
+    url: "{{ jenkins_job_health_exporter_url }}/{{ jenkins_job_health_exporter_version }}/jenkins-job-health-exporter"
+    dest: "{{ jenkins_job_health_exporter_target_dir }}/jenkins-job-health-exporter"
+    mode: "0755"
+  when:
+    - ansible_hostname == "s42-nomad"
+  tags:
+    - inst-jenkins-job-binary
+
+- name: Inst - Jenkins Job Health Exporter Service
+  template:
+    src: "templates/jenkins-job-health-exporter.service.j2"
+    dest: "/lib/systemd/system/jenkins-job-health-exporter.service"
+    owner: "root"
+    group: "root"
+    mode: "0644"
+  when:
+    - ansible_hostname == "s42-nomad"
+  notify:
+    - "Restart Jenkins Job Health Exporter"
+  tags:
+    - inst-jenkins-job-service
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2 b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2

new file mode 100644 (file)

index 0000000..5942b78
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2
@@ -0,0 +1,16 @@
+{
+  "jenkins_host": "{{ jenkins_host }}",
+  "poll_interval_sec": {{ poll_interval_sec }},
+  "req_timeout_sec": {{ req_timeout_sec }},
+  "bind_to": "{{ bind_to }}",
+  "last_builds": {{ last_builds }},
+  "jobs": [
+{% for item in jobs %}
+    "{{ item }}"
+{%- if not loop.last %},
+{% endif %}
+{% endfor %}
+
+  ],
+  "verbose": 3
+}
+\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2 b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2

new file mode 100644 (file)

index 0000000..38073d0
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=Jenkins Job Health Exporter
+Documentation=https://github.com/ayourtch/jenkins-job-health-exporter
+
+[Service]
+Restart=always
+ExecStart={{ jenkins_job_health_exporter_target_dir }}/jenkins-job-health-exporter {{ jenkins_job_health_exporter_conf_dir }}/jenkins-job-health-exporter.json
+ExecReload=/bin/kill -HUP $MAINPID
+TimeoutStopSec=20s
+SendSIGKILL=no
+
+[Install]
+WantedBy=multi-user.target
+\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/vpp_device.yaml b/resources/tools/testbed-setup/ansible/vpp_device.yaml

index 7dc3614..5281c57 100644 (file)
--- a/resources/tools/testbed-setup/ansible/vpp_device.yaml
+++ b/resources/tools/testbed-setup/ansible/vpp_device.yaml
@@ -20,6 +20,8 @@
        tags: consul
      - role: prometheus_exporter
        tags: prometheus_exporter
+    - role: jenkins_job_health_exporter
+      tags: jenkins_job_health_exporter
      - role: cadvisor
        tags: cadvisor
      - role: vpp_device
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl

index fafe623..40d84e3 100644 (file)
--- a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
+++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
@@ -160,10 +160,6 @@ job "${job_name}" {
          left_delimiter  = "{{{"
          right_delimiter = "}}}"
          data            = <<EOH
-global:
-  # The API URL to use for Slack notifications.
-  slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'
-
  # The directory from which notification templates are read.
  templates:
  - '/etc/alertmanager/template/*.tmpl'
@@ -185,7 +181,7 @@ templates:
  
  # The root route on which each incoming alert enters.
  route:
-  receiver: '${default_receiver}'
+  receiver: '${slack_default_receiver}'
  
    # The labels by which incoming alerts are grouped together. For example,
    # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
@@ -217,18 +213,21 @@ route:
    # overwritten on each.
    # The child route trees.
    routes:
-  # This routes performs a regular expression match on alert labels to
-  # catch alerts that are related to a list of services.
+  - match_re:
+      alertname: JenkinsJob.*
+    receiver: ${slack_jenkins_receiver}
+    routes:
+    - match:
+        severity: critical
+      receiver: '${slack_jenkins_receiver}'
+
    - match_re:
        service: .*
-    receiver: ${default_receiver}
-    # The service has a sub-route for critical alerts, any alerts
-    # that do not match, i.e. severity != critical, fall-back to the
-    # parent node and are sent to 'team-X-mails'
+    receiver: ${slack_default_receiver}
      routes:
      - match:
          severity: critical
-      receiver: '${default_receiver}'
+      receiver: '${slack_default_receiver}'
  
  # Inhibition rules allow to mute a set of alerts given that another alert is
  # firing.
@@ -239,17 +238,42 @@ inhibit_rules:
      severity: 'critical'
    target_match:
      severity: 'warning'
-  # Apply inhibition if the alertname is the same.
-  # CAUTION:
-  #   If all label names listed in `equal` are missing
-  #   from both the source and target alerts,
-  #   the inhibition rule will apply!
-  equal: ['alertname', 'cluster', 'service']
+  equal: ['alertname', 'instance']
  
  receivers:
-- name: '${default_receiver}'
+- name: '${slack_jenkins_receiver}'
+  slack_configs:
+  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
+    channel: '#${slack_jenkins_channel}'
+    send_resolved: true
+    icon_url: https://avatars3.githubusercontent.com/u/3380462
+    title: |-
+     [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+       {{" "}}(
+       {{- with .CommonLabels.Remove .GroupLabels.Names }}
+         {{- range $index, $label := .SortedPairs -}}
+           {{ if $index }}, {{ end }}
+           {{- $label.Name }}="{{ $label.Value -}}"
+         {{- end }}
+       {{- end -}}
+       )
+     {{- end }}
+    text: >-
+     {{ range .Alerts -}}
+     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+     *Description:* {{ .Annotations.description }}
+
+     *Details:*
+       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+       {{ end }}
+     {{ end }}
+
+- name: '${slack_default_receiver}'
    slack_configs:
-  - channel: '#${slack_channel}'
+  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
+    channel: '#${slack_default_channel}'
      send_resolved: true
      icon_url: https://avatars3.githubusercontent.com/u/3380462
      title: |-
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/main.tf b/terraform-ci-infra/1n_nmd/alertmanager/main.tf

index 8307601..9525aab 100644 (file)
--- a/terraform-ci-infra/1n_nmd/alertmanager/main.tf
+++ b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
@@ -14,20 +14,23 @@ locals {
  data "template_file" "nomad_job_alertmanager" {
    template         = file("${path.module}/conf/nomad/alertmanager.hcl")
    vars             = {
-    datacenters        = local.datacenters
-    url                = local.alertmanager_url
-    job_name           = var.alertmanager_job_name
-    use_canary         = var.alertmanager_use_canary
-    group_count        = var.alertmanager_group_count
-    service_name       = var.alertmanager_service_name
-    use_vault_provider = var.alertmanager_vault_secret.use_vault_provider
-    version            = var.alertmanager_version
-    cpu                = var.alertmanager_cpu
-    mem                = var.alertmanager_mem
-    port               = var.alertmanager_port
-    slack_api_key      = var.alertmanager_slack_api_key
-    slack_channel      = var.alertmanager_slack_channel
-    default_receiver   = var.alertmanager_default_receiver
+    datacenters            = local.datacenters
+    url                    = local.alertmanager_url
+    job_name               = var.alertmanager_job_name
+    use_canary             = var.alertmanager_use_canary
+    group_count            = var.alertmanager_group_count
+    service_name           = var.alertmanager_service_name
+    use_vault_provider     = var.alertmanager_vault_secret.use_vault_provider
+    version                = var.alertmanager_version
+    cpu                    = var.alertmanager_cpu
+    mem                    = var.alertmanager_mem
+    port                   = var.alertmanager_port
+    slack_jenkins_api_key  = var.alertmanager_slack_jenkins_api_key
+    slack_jenkins_channel  = var.alertmanager_slack_jenkins_channel
+    slack_jenkins_receiver = var.alertmanager_slack_jenkins_receiver
+    slack_default_api_key  = var.alertmanager_slack_default_api_key
+    slack_default_channel  = var.alertmanager_slack_default_channel
+    slack_default_receiver = var.alertmanager_slack_default_receiver
    }
  }
  
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf

index 152530b..ffedf24 100644 (file)
--- a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
+++ b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
@@ -65,20 +65,38 @@ variable "alertmanager_port" {
    default     = 9093
  }
  
-variable "alertmanager_default_receiver" {
-  description = "Alertmanager default receiver"
+variable "alertmanager_slack_jenkins_api_key" {
+  description = "Alertmanager jenkins slack API key"
    type        = string
-  default     = "default-receiver"
+  default     = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
+}
+
+variable "alertmanager_slack_jenkins_receiver" {
+  description = "Alertmanager jenkins slack receiver"
+  type        = string
+  default     = "jenkins-slack-receiver"
+}
+
+variable "alertmanager_slack_jenkins_channel" {
+  description = "Alertmanager jenkins slack channel"
+  type        = string
+  default     = "jenkins-channel"
  }
  
-variable "alertmanager_slack_api_key" {
-  description = "Alertmanager slack API key"
+variable "alertmanager_slack_default_api_key" {
+  description = "Alertmanager default slack API key"
    type        = string
    default     = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
  }
  
-variable "alertmanager_slack_channel" {
-  description = "Alertmanager slack channel"
+variable "alertmanager_slack_default_receiver" {
+  description = "Alertmanager default slack receiver"
+  type        = string
+  default     = "default-slack-receiver"
+}
+
+variable "alertmanager_slack_default_channel" {
+  description = "Alertmanager default slack channel"
    type        = string
-  default     = "slack-channel"
+  default     = "default-channel"
  }
 \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/main.tf b/terraform-ci-infra/1n_nmd/main.tf

index 985f9ae..b21b76b 100644 (file)
--- a/terraform-ci-infra/1n_nmd/main.tf
+++ b/terraform-ci-infra/1n_nmd/main.tf
@@ -5,31 +5,33 @@
  # and have them automatically associated with the root provider
  # configurations.
  module "alertmanager" {
-  source                         = "./alertmanager"
-  providers                      = {
+  source                             = "./alertmanager"
+  providers                          = {
      nomad = nomad.yul1
    }
  
    # nomad
-  nomad_datacenters              = [ "yul1" ]
+  nomad_datacenters                  = [ "yul1" ]
  
    # alertmanager
-  alertmanager_job_name          = "prod-alertmanager"
-  alertmanager_use_canary        = true
-  alertmanager_group_count       = 1
-  alertmanager_vault_secret      = {
-    use_vault_provider           = false,
-    vault_kv_policy_name         = "kv-secret",
-    vault_kv_path                = "secret/data/prometheus",
-    vault_kv_field_access_key    = "access_key",
-    vault_kv_field_secret_key    = "secret_key"
+  alertmanager_job_name              = "prod-alertmanager"
+  alertmanager_use_canary            = true
+  alertmanager_group_count           = 1
+  alertmanager_vault_secret          = {
+    use_vault_provider               = false,
+    vault_kv_policy_name             = "kv-secret",
+    vault_kv_path                    = "secret/data/prometheus",
+    vault_kv_field_access_key        = "access_key",
+    vault_kv_field_secret_key        = "secret_key"
    }
-  alertmanager_version           = "0.21.0"
-  alertmanager_cpu               = 1000
-  alertmanager_mem               = 1024
-  alertmanager_port              = 9093
-  alertmanager_slack_api_key     = "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6"
-  alertmanager_slack_channel     = "fdio-infra-monitoring"
+  alertmanager_version               = "0.21.0"
+  alertmanager_cpu                   = 1000
+  alertmanager_mem                   = 1024
+  alertmanager_port                  = 9093
+  alertmanager_slack_jenkins_api_key = "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz"
+  alertmanager_slack_jenkins_channel = "fdio-jobs-monitoring"
+  alertmanager_slack_default_api_key = "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6"
+  alertmanager_slack_default_channel = "fdio-infra-monitoring"
  }
  
  module "grafana" {
diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl

index 4918a5f..d851628 100644 (file)
--- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
+++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -188,6 +188,24 @@ job "${job_name}" {
          data            = <<EOH
  ---
  groups:
+- name: "Jenkins Job Health Exporter"
+  rules:
+  - alert: JenkinsJobHealthExporterFailures
+    expr: jenkins_job_failure{id=~".*"} >= 10
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+  - alert: JenkinsJobHealthExporterUnstable
+    expr: jenkins_job_unstable{id=~".*"} >= 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
  - name: "Consul"
    rules:
    - alert: ConsulServiceHealthcheckFailed
@@ -523,6 +541,20 @@ scrape_configs:
        - targets: [ '10.32.8.16:8080' ]
        - targets: [ '10.32.8.17:8080' ]
  
+  - job_name: 'Jenkins Job Health Exporter'
+    static_configs:
+      - targets: [ '10.30.51.32:9186' ]
+    metric_relabel_configs:
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        action: replace
+        replacement: '$1'
+        target_label: id
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        replacement: 'jenkins_job_$2'
+        target_label: __name__
+
    - job_name: 'Node Exporter'
      static_configs:
        - targets: [ '10.30.51.28:9100' ]
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate

index d60f102..576e2aa 100644 (file)
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate
@@ -1,7 +1,7 @@
  {
    "version": 4,
    "terraform_version": "0.14.5",
-  "serial": 1042,
+  "serial": 1112,
    "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
    "outputs": {},
    "resources": [
@@ -16,20 +16,23 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "1ea874db11980359d80aada65912759b6acd9a7399f011d823a72eb7c5b4a329",
-            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: ${default_receiver}\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n  slack_configs:\n  - channel: '#${slack_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033",
+            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${slack_default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: ${slack_jenkins_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_jenkins_receiver}'\n\n  - match_re:\n      service: .*\n    receiver: ${slack_default_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n    channel: '#${slack_jenkins_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: '${slack_default_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n    channel: '#${slack_default_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "1000",
                "datacenters": "yul1",
-              "default_receiver": "default-receiver",
                "group_count": "1",
                "job_name": "prod-alertmanager",
                "mem": "1024",
                "port": "9093",
                "service_name": "alertmanager",
-              "slack_api_key": "XX",
-              "slack_channel": "fdio-infra-monitoring",
+              "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6",
+              "slack_default_channel": "fdio-infra-monitoring",
+              "slack_default_receiver": "default-slack-receiver",
+              "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz",
+              "slack_jenkins_channel": "fdio-jobs-monitoring",
+              "slack_jenkins_receiver": "jenkins-slack-receiver",
                "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
                "use_canary": "true",
                "use_vault_provider": "false",
@@ -51,21 +54,20 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "e0aa0e03-a425-2ec8-06ad-ccf89f0f4ee3",
-              "bf013613-1ac7-6155-69e0-51ff57719549"
+              "c5e80ff9-5f53-8022-0179-5b628bba986f"
              ],
              "datacenters": [
                "yul1"
              ],
-            "deployment_id": "d132f4bd-ef31-5406-97e6-8831ad4a3db5",
+            "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1",
              "deployment_status": "successful",
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-alertmanager",
-            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7202773",
+            "modify_index": "7224418",
              "name": "prod-alertmanager",
              "namespace": "default",
              "policy_override": null,
@@ -268,7 +270,7 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "7e9cea8d-a5e5-47db-7f9c-653c59c93928"
+              "a1bc1a10-95a5-3c33-a46d-849522abe4fe"
              ],
              "datacenters": [
                "yul1"
@@ -281,7 +283,7 @@
              "id": "prod-mc",
              "jobspec": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
              "json": null,
-            "modify_index": "7202809",
+            "modify_index": "7254549",
              "name": "prod-mc",
              "namespace": "default",
              "policy_override": null,
@@ -477,9 +479,9 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "f4fe3d282a60afb0ef6713540a32d855d65713707e68cf8636a1afd38521b604",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "2000",
                "data_dir": "/data/",
@@ -512,23 +514,26 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "81af62a3-09d6-5e29-a63a-e13952f9caec",
-              "1516d874-0001-78b1-f6ec-727067af2f29",
-              "fcf4df05-3083-e96a-9e8b-0856c0fed8f9",
-              "6eaada1d-5bea-7b95-6564-7c0ff31b3053"
+              "fc9ab3e3-7854-65d5-0c82-28283383976a",
+              "5e526c5d-f7b5-f275-357c-48a739a96071",
+              "d816ca29-4cf4-f9be-4d48-edbddac472ae",
+              "794f7531-14f5-c1c3-9bf3-88936ee335e3",
+              "b1edacab-e57f-e014-4d4c-50e563c81ab7",
+              "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954",
+              "922f75f9-39c1-c232-9ef6-81d68a03c719"
              ],
              "datacenters": [
                "yul1"
              ],
-            "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+            "deployment_id": "0a36a612-38de-5e1c-448e-72e29a2fb5f3",
              "deployment_status": "successful",
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7201194",
+            "modify_index": "7254539",
              "name": "prod-prometheus",
              "namespace": "default",
              "policy_override": null,
@@ -610,10 +615,9 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
+              "a51e95c4-8ff0-47d9-6f88-f475942141bc",
                "0239788e-a5eb-b54d-0cf2-2404b3ec7c53",
-              "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
-              "190f5699-0d10-7f85-ac68-96927702070b",
-              "2fbbef45-f00a-1367-08c4-c2239de9e78d"
+              "190f5699-0d10-7f85-ac68-96927702070b"
              ],
              "datacenters": [
                "yul1"
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup

index f40e2c2..ba758f7 100644 (file)
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
@@ -1,7 +1,7 @@
  {
    "version": 4,
    "terraform_version": "0.14.5",
-  "serial": 1037,
+  "serial": 1108,
    "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
    "outputs": {},
    "resources": [
@@ -16,20 +16,23 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "f723f0544ca4fcac1162853257d62928794f50523b9186cdf2c24684721bd058",
-            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname', 'cluster', 'service']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname', 'cluster', 'service']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: ${default_receiver}\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n  slack_configs:\n  - channel: '#${slack_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033",
+            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${slack_default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: ${slack_jenkins_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_jenkins_receiver}'\n\n  - match_re:\n      service: .*\n    receiver: ${slack_default_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n    channel: '#${slack_jenkins_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: '${slack_default_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n    channel: '#${slack_default_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "1000",
                "datacenters": "yul1",
-              "default_receiver": "default-receiver",
                "group_count": "1",
                "job_name": "prod-alertmanager",
                "mem": "1024",
                "port": "9093",
                "service_name": "alertmanager",
-              "slack_api_key": "XX",
-              "slack_channel": "fdio-infra-monitoring",
+              "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6",
+              "slack_default_channel": "fdio-infra-monitoring",
+              "slack_default_receiver": "default-slack-receiver",
+              "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz",
+              "slack_jenkins_channel": "fdio-jobs-monitoring",
+              "slack_jenkins_receiver": "jenkins-slack-receiver",
                "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
                "use_canary": "true",
                "use_vault_provider": "false",
@@ -51,20 +54,20 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "bf013613-1ac7-6155-69e0-51ff57719549"
+              "c5e80ff9-5f53-8022-0179-5b628bba986f"
              ],
              "datacenters": [
                "yul1"
              ],
-            "deployment_id": "36a8a7ba-bd0c-bce9-bcc1-284e352dfd32",
+            "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1",
              "deployment_status": "successful",
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-alertmanager",
-            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname', 'cluster', 'service']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7150816",
+            "modify_index": "7224418",
              "name": "prod-alertmanager",
              "namespace": "default",
              "policy_override": null,
@@ -96,111 +99,6 @@
          }
        ]
      },
-    {
-      "module": "module.exporter",
-      "mode": "data",
-      "type": "template_file",
-      "name": "nomad_job_exporter",
-      "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
-      "instances": [
-        {
-          "schema_version": 0,
-          "attributes": {
-            "filename": null,
-            "id": "04a453cfc12a27f7b6d323a51d4cde3c76479feb5ccbfdfc2b43f6d4556e4f63",
-            "rendered": "job \"prod-exporter\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"system\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-exporter-amd64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-cadvisorexporter-amd64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image         = \"gcr.io/cadvisor/cadvisor:latest\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"cadvisorexporter\"\n        port          = \"cadvisorexporter\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"cadvisorexporter\" {\n            static    = 8080\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-exporter-arm64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"==\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-cadvisorexporter-arm64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        # There is currently no official release for arm yet...using community.\n        image         = \"zcube/cadvisor:latest\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"cadvisorexporter\"\n        port          = \"cadvisorexporter\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"cadvisorexporter\" {\n            static    = 8080\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"system\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-exporter-amd64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-${cadvisor_service_name}-amd64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image         = \"${cadvisor_image}\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"${cadvisor_service_name}\"\n        port          = \"${cadvisor_service_name}\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${cadvisor_service_name}\" {\n            static    = ${cadvisor_port}\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-exporter-arm64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"==\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-${cadvisor_service_name}-arm64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        # There is currently no official release for arm yet...using community.\n        image         = \"zcube/cadvisor:latest\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"${cadvisor_service_name}\"\n        port          = \"${cadvisor_service_name}\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${cadvisor_service_name}\" {\n            static    = ${cadvisor_port}\n          }\n        }\n      }\n    }\n  }\n}",
-            "vars": {
-              "blackbox_port": "9115",
-              "blackbox_service_name": "blackboxexporter",
-              "blackbox_url_amd64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz",
-              "blackbox_url_arm64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz",
-              "blackbox_version": "0.18.0",
-              "cadvisor_image": "gcr.io/cadvisor/cadvisor:latest",
-              "cadvisor_port": "8080",
-              "cadvisor_service_name": "cadvisorexporter",
-              "datacenters": "yul1",
-              "job_name": "prod-exporter",
-              "node_port": "9100",
-              "node_service_name": "nodeexporter",
-              "node_url_amd64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz",
-              "node_url_arm64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz",
-              "node_version": "1.0.1",
-              "use_canary": "false"
-            }
-          },
-          "sensitive_attributes": []
-        }
-      ]
-    },
-    {
-      "module": "module.exporter",
-      "mode": "managed",
-      "type": "nomad_job",
-      "name": "nomad_job_exporter",
-      "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
-      "instances": [
-        {
-          "schema_version": 0,
-          "attributes": {
-            "allocation_ids": null,
-            "datacenters": [
-              "yul1"
-            ],
-            "deployment_id": "",
-            "deployment_status": "",
-            "deregister_on_destroy": true,
-            "deregister_on_id_change": true,
-            "detach": false,
-            "id": "prod-exporter",
-            "jobspec": "job \"prod-exporter\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"system\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-exporter-amd64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-cadvisorexporter-amd64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image         = \"gcr.io/cadvisor/cadvisor:latest\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"cadvisorexporter\"\n        port          = \"cadvisorexporter\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"cadvisorexporter\" {\n            static    = 8080\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-exporter-arm64\" {\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"==\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task3-cadvisorexporter-arm64\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        # There is currently no official release for arm yet...using community.\n        image         = \"zcube/cadvisor:latest\"\n        volumes       = [\n          \"/:/rootfs:ro\",\n          \"/var/run:/var/run:rw\",\n          \"/sys:/sys:ro\",\n          \"/var/lib/docker/:/var/lib/docker:ro\",\n          \"/cgroup:/cgroup:ro\"\n        ]\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name          = \"cadvisorexporter\"\n        port          = \"cadvisorexporter\"\n        check {\n          name        = \"cAdvisor Check Live\"\n          type        = \"http\"\n          path        = \"/metrics\"\n          interval    = \"10s\"\n          timeout     = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu           = 500\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"cadvisorexporter\" {\n            static    = 8080\n          }\n        }\n      }\n    }\n  }\n}",
-            "json": null,
-            "modify_index": "7201184",
-            "name": "prod-exporter",
-            "namespace": "default",
-            "policy_override": null,
-            "purge_on_destroy": null,
-            "region": "global",
-            "task_groups": [
-              {
-                "count": 1,
-                "meta": {},
-                "name": "prod-group1-exporter-amd64",
-                "task": [
-                  {
-                    "driver": "docker",
-                    "meta": {},
-                    "name": "prod-task3-cadvisorexporter-amd64",
-                    "volume_mounts": []
-                  }
-                ],
-                "volumes": []
-              },
-              {
-                "count": 1,
-                "meta": {},
-                "name": "prod-group1-exporter-arm64",
-                "task": [
-                  {
-                    "driver": "docker",
-                    "meta": {},
-                    "name": "prod-task3-cadvisorexporter-arm64",
-                    "volume_mounts": []
-                  }
-                ],
-                "volumes": []
-              }
-            ],
-            "type": "system"
-          },
-          "sensitive_attributes": [],
-          "private": "bnVsbA==",
-          "dependencies": [
-            "module.exporter.data.template_file.nomad_job_exporter"
-          ]
-        }
-      ]
-    },
      {
        "module": "module.grafana",
        "mode": "data",
@@ -369,10 +267,11 @@
        "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
        "instances": [
          {
-          "status": "tainted",
            "schema_version": 0,
            "attributes": {
-            "allocation_ids": null,
+            "allocation_ids": [
+              "631050b2-0973-35f9-30ab-1f80b2b56b28"
+            ],
              "datacenters": [
                "yul1"
              ],
@@ -384,12 +283,12 @@
              "id": "prod-mc",
              "jobspec": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
              "json": null,
-            "modify_index": "7201202",
+            "modify_index": "7254273",
              "name": "prod-mc",
              "namespace": "default",
              "policy_override": null,
              "purge_on_destroy": null,
-            "region": null,
+            "region": "global",
              "task_groups": [
                {
                  "count": 1,
@@ -580,9 +479,9 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "4a4840cd7c13ce07d6bf0743f56a523eabde46a27c583d17865967a03dbf717a",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "2000",
                "data_dir": "/data/",
@@ -614,19 +513,29 @@
          {
            "schema_version": 0,
            "attributes": {
-            "allocation_ids": null,
+            "allocation_ids": [
+              "7722e831-d956-55a4-c375-f0e33485d234",
+              "b1edacab-e57f-e014-4d4c-50e563c81ab7",
+              "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954",
+              "c9c55369-2e27-8087-b156-f1923a57e1a0",
+              "922f75f9-39c1-c232-9ef6-81d68a03c719",
+              "6e23349f-9644-a20b-5c0e-db9d79733b3d",
+              "0337957a-0d04-f4b6-4dfc-3b828a731498",
+              "2705a9d9-913a-f2db-2535-abd85e9a5e6b",
+              "57b9c1f1-0dbe-321e-5cce-861f766c5ee3"
+            ],
              "datacenters": [
                "yul1"
              ],
-            "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+            "deployment_id": "02342e30-fde4-3402-b201-ef649d404a95",
              "deployment_status": "successful",
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7201194",
+            "modify_index": "7254268",
              "name": "prod-prometheus",
              "namespace": "default",
              "policy_override": null,
@@ -708,10 +617,9 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
+              "a51e95c4-8ff0-47d9-6f88-f475942141bc",
                "0239788e-a5eb-b54d-0cf2-2404b3ec7c53",
-              "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
-              "190f5699-0d10-7f85-ac68-96927702070b",
-              "2fbbef45-f00a-1367-08c4-c2239de9e78d"
+              "190f5699-0d10-7f85-ac68-96927702070b"
              ],
              "datacenters": [
                "yul1"
author	pmikus <pmikus@cisco.com>
	Fri, 5 Feb 2021 14:51:43 +0000 (14:51 +0000)
committer	Peter Mikus <pmikus@cisco.com>
	Wed, 10 Feb 2021 09:04:46 +0000 (09:04 +0000)
resources/tools/testbed-setup/ansible/nomad.yaml		patch \| blob \| history
resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml	[new file with mode: 0644]	patch \| blob
resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml	[new file with mode: 0644]	patch \| blob
resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml	[new file with mode: 0644]	patch \| blob
resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2	[new file with mode: 0644]	patch \| blob
resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2	[new file with mode: 0644]	patch \| blob
resources/tools/testbed-setup/ansible/vpp_device.yaml		patch \| blob \| history
terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl		patch \| blob \| history
terraform-ci-infra/1n_nmd/alertmanager/main.tf		patch \| blob \| history
terraform-ci-infra/1n_nmd/alertmanager/variables.tf		patch \| blob \| history
terraform-ci-infra/1n_nmd/main.tf		patch \| blob \| history
terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl		patch \| blob \| history
terraform-ci-infra/1n_nmd/terraform.tfstate		patch \| blob \| history
terraform-ci-infra/1n_nmd/terraform.tfstate.backup		patch \| blob \| history