FIX: JenkinsJob alert condition

author pmikus <pmikus@cisco.com>

Fri, 5 Mar 2021 11:36:29 +0000 (11:36 +0000)

committer Peter Mikus <pmikus@cisco.com>

Fri, 5 Mar 2021 11:51:51 +0000 (11:51 +0000)
author pmikus <pmikus@cisco.com>
Fri, 5 Mar 2021 11:36:29 +0000 (11:36 +0000)
committer Peter Mikus <pmikus@cisco.com>
Fri, 5 Mar 2021 11:51:51 +0000 (11:51 +0000)
diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl

index d851628..235a042 100644 (file)
--- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
+++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -191,7 +191,7 @@ groups:
  - name: "Jenkins Job Health Exporter"
    rules:
    - alert: JenkinsJobHealthExporterFailures
-    expr: jenkins_job_failure{id=~".*"} >= 10
+    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
      for: 0m
      labels:
        severity: critical
@@ -199,7 +199,7 @@ groups:
        summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
        description: "Job: {{ $labels.id }}"
    - alert: JenkinsJobHealthExporterUnstable
-    expr: jenkins_job_unstable{id=~".*"} >= 10
+    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
      for: 0m
      labels:
        severity: warning
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate

index 92757f0..bfce8cf 100644 (file)
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate
@@ -1,7 +1,7 @@
  {
    "version": 4,
    "terraform_version": "0.14.7",
-  "serial": 1138,
+  "serial": 1142,
    "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
    "outputs": {},
    "resources": [
@@ -17,7 +17,7 @@
            "attributes": {
              "filename": null,
              "id": "4a624043a1633ecf6736293b32ca9e3621fb6583d4b1eb1ff6d9ae1f21ce6b19",
-            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: ''\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: ''\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
              "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${slack_default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: ${slack_jenkins_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_jenkins_receiver}'\n\n  - match_re:\n      service: .*\n    receiver: ${slack_default_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n    channel: '#${slack_jenkins_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: '${slack_default_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n    channel: '#${slack_default_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "1000",
@@ -65,7 +65,7 @@
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-alertmanager",
-            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: ''\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: ''\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/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'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
              "modify_index": "7267712",
              "name": "prod-alertmanager",
@@ -263,14 +263,17 @@
        "module": "module.minio",
        "mode": "managed",
        "type": "nomad_job",
-      "name": "nomad_job_mc",
+      "name": "nomad_job_minio",
        "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
        "instances": [
          {
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "499b34c4-7f90-5d06-5ea5-e0826824a4f7"
+              "71a7d57c-77f2-d9d6-aa4d-94e232480be1",
+              "1f3ca262-a00f-af34-0187-3b9b77f08b3a",
+              "d0634db3-1ee5-ac6f-3111-39ba302b409c",
+              "a3a2e0cd-617c-dfb1-054f-f69ca27ff6c6"
              ],
              "datacenters": [
                "yul1"
@@ -280,71 +283,10 @@
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
-            "id": "prod-mc",
-            "jobspec": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
-            "json": null,
-            "modify_index": "7454315",
-            "name": "prod-mc",
-            "namespace": "default",
-            "policy_override": null,
-            "purge_on_destroy": null,
-            "region": "global",
-            "task_groups": [
-              {
-                "count": 1,
-                "meta": {},
-                "name": "prod-group1-mc",
-                "task": [
-                  {
-                    "driver": "docker",
-                    "meta": {},
-                    "name": "prod-task1-create-buckets",
-                    "volume_mounts": null
-                  }
-                ],
-                "volumes": null
-              }
-            ],
-            "type": "batch"
-          },
-          "sensitive_attributes": [],
-          "private": "bnVsbA==",
-          "dependencies": [
-            "module.minio.data.template_file.nomad_job_mc",
-            "module.minio.data.template_file.nomad_job_minio",
-            "module.minio.nomad_job.nomad_job_minio"
-          ]
-        }
-      ]
-    },
-    {
-      "module": "module.minio",
-      "mode": "managed",
-      "type": "nomad_job",
-      "name": "nomad_job_minio",
-      "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
-      "instances": [
-        {
-          "schema_version": 0,
-          "attributes": {
-            "allocation_ids": [
-              "609f2c74-a79c-5801-bab1-5b050dfdfdcf",
-              "8ce07e52-32ad-445c-42a1-64b884308f75",
-              "98a5b038-805b-59a7-95b2-91daac526e8a",
-              "e55ab00b-7b21-94b2-9a7a-e25623e3da89"
-            ],
-            "datacenters": [
-              "yul1"
-            ],
-            "deployment_id": "55b089ea-cf27-3a16-26c6-072ec4bb3167",
-            "deployment_status": "successful",
-            "deregister_on_destroy": true,
-            "deregister_on_id_change": true,
-            "detach": false,
              "id": "prod-minio",
              "jobspec": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count       = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type      = \"host\"\n      read_only = false\n      source    = \"prod-volume-data1-1\"\n    }\n    \n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n        dns_servers      = [ \"${attr.unique.network.ip-address}\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service.html\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network.html\n        #\n        cpu      = 40000\n        memory   = 40000\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
              "json": null,
-            "modify_index": "5933603",
+            "modify_index": "7479267",
              "name": "prod-minio",
              "namespace": "default",
              "policy_override": null,
@@ -479,9 +421,9 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "f4fe3d282a60afb0ef6713540a32d855d65713707e68cf8636a1afd38521b604",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "eddfe06cf2af83302365d353cc365dab387ccb1696b8a9be02dcab381ef527df",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "2000",
                "data_dir": "/data/",
@@ -514,23 +456,27 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "f33c33fb-e8e7-ef4d-9e9c-0709a9f39194",
-              "659ce43a-f845-ae65-83ac-09fa2266ba93",
-              "d194c106-2627-f45d-6e85-7bdd8b7b74e3",
-              "488b2fc8-0ad4-9e23-c606-401d138930df"
+              "7f07affc-3df6-e8df-e757-e6bc36126d69",
+              "1531f6df-3d54-332e-49f0-a6fe02f3d39d",
+              "d15a55f2-4299-06c2-f42f-ed3f22e6c643",
+              "91f6ce4c-d197-75ec-2242-bb21e045e111",
+              "145bd905-e279-f3ff-e65b-da46f55331f5",
+              "4a781bdb-21e7-3a59-880e-372f6d209274",
+              "066c548e-e96c-fcb6-4a35-9b3d7336693d",
+              "ad5e1036-25ba-038f-63e6-166962b7c1bb"
              ],
              "datacenters": [
                "yul1"
              ],
-            "deployment_id": "0a36a612-38de-5e1c-448e-72e29a2fb5f3",
+            "deployment_id": "f87810ec-32a6-04a3-d507-e9d2abe2e94d",
              "deployment_status": "successful",
              "deregister_on_destroy": true,
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7254539",
+            "modify_index": "7502384",
              "name": "prod-prometheus",
              "namespace": "default",
              "policy_override": null,
@@ -616,8 +562,6 @@
              "allocation_ids": [
                "1ab59a51-2bbe-98f5-2c97-26b962dac41c",
                "54db9b31-670f-d1d5-a6be-a578e0487371",
-              "871d4cab-976e-b972-88fd-37082262cc14",
-              "90277018-b8d3-6b12-d8af-066f5faccd93",
                "2db5470a-e47c-5f3f-2086-fe658d92a9f1",
                "76174535-f2da-288f-ea14-8af495e631cc"
              ],
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup

index 4476f2b..92757f0 100644 (file)
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
@@ -1,7 +1,7 @@
  {
    "version": 4,
    "terraform_version": "0.14.7",
-  "serial": 1134,
+  "serial": 1138,
    "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
    "outputs": {},
    "resources": [
@@ -270,7 +270,7 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
-              "fb50ac37-6494-1c7c-3bee-48a64bf05dd4"
+              "499b34c4-7f90-5d06-5ea5-e0826824a4f7"
              ],
              "datacenters": [
                "yul1"
@@ -283,7 +283,7 @@
              "id": "prod-mc",
              "jobspec": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
              "json": null,
-            "modify_index": "7454200",
+            "modify_index": "7454315",
              "name": "prod-mc",
              "namespace": "default",
              "policy_override": null,
@@ -586,15 +586,15 @@
            "schema_version": 0,
            "attributes": {
              "filename": null,
-            "id": "e5fede9b392dff183ecf017d29714d6cfa7528236064fb3aa42b24d26f4f87fe",
-            "rendered": "job \"prod-device-csit-shim\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"system\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-csit-shim-amd\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csit\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-amd\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"fdiotools/csit_shim-ubuntu2004:2021_02_26_081228-x86_64\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-csit-shim-arm\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csitarm\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-arm\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "95bbc9810be63e5d4e9fb668030eacc88f86d0d00b92cc2d08aee2dc1669fa0b",
+            "rendered": "job \"prod-device-csit-shim\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"system\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-csit-shim-amd\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csit\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-amd\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-csit-shim-arm\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csitarm\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-arm\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n}",
              "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"system\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-csit-shim-amd\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = ${group_count}\n\n    constraint {\n      attribute      = \"$${node.class}\"\n      value          = \"csit\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-amd\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"${image_x86_64}\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = ${cpu}\n        memory       = ${mem}\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-csit-shim-arm\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = ${group_count}\n\n    constraint {\n      attribute      = \"$${node.class}\"\n      value          = \"csitarm\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-arm\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"${image_aarch64}\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = ${cpu}\n        memory       = ${mem}\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n}",
              "vars": {
                "cpu": "1500",
                "datacenters": "yul1",
                "group_count": "1",
                "image_aarch64": "csit_shim-ubuntu1804:local",
-              "image_x86_64": "fdiotools/csit_shim-ubuntu2004:2021_02_26_081228-x86_64",
+              "image_x86_64": "csit_shim-ubuntu1804:local",
                "job_name": "prod-device-csit-shim",
                "mem": "4096"
              }
@@ -614,10 +614,10 @@
            "schema_version": 0,
            "attributes": {
              "allocation_ids": [
+              "1ab59a51-2bbe-98f5-2c97-26b962dac41c",
+              "54db9b31-670f-d1d5-a6be-a578e0487371",
                "871d4cab-976e-b972-88fd-37082262cc14",
                "90277018-b8d3-6b12-d8af-066f5faccd93",
-              "019b5b46-41aa-eeca-ae6f-111fe1f0d5e8",
-              "29a15532-775c-68ba-299f-f801e4c304df",
                "2db5470a-e47c-5f3f-2086-fe658d92a9f1",
                "76174535-f2da-288f-ea14-8af495e631cc"
              ],
@@ -630,9 +630,9 @@
              "deregister_on_id_change": true,
              "detach": false,
              "id": "prod-device-csit-shim",
-            "jobspec": "job \"prod-device-csit-shim\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"system\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-csit-shim-amd\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csit\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-amd\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"fdiotools/csit_shim-ubuntu2004:2021_02_26_081228-x86_64\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-csit-shim-arm\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csitarm\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-arm\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-device-csit-shim\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"system\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-csit-shim-amd\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csit\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-amd\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n\n  group \"prod-group1-csit-shim-arm\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count            = 1\n\n    constraint {\n      attribute      = \"${node.class}\"\n      value          = \"csitarm\"\n    }\n\n    restart {\n      interval       = \"1m\"\n      attempts       = 3\n      delay          = \"15s\"\n      mode           = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-csit-shim-arm\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver         = \"docker\"\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image        = \"csit_shim-ubuntu1804:local\"\n        network_mode = \"host\"\n        pid_mode     = \"host\"\n        volumes      = [\n          \"/var/run/docker.sock:/var/run/docker.sock\"\n        ]\n        privileged   = true\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources.html\n      #\n      resources {\n        cpu          = 1500\n        memory       = 4096\n        network {\n          port \"ssh\" {\n              static = 6022\n          }\n          port \"ssh2\" {\n              static = 6023\n          }\n        }\n      }\n    }\n  }\n}",
              "json": null,
-            "modify_index": "7454188",
+            "modify_index": "7454305",
              "name": "prod-device-csit-shim",
              "namespace": "default",
              "policy_override": null,
author	pmikus <pmikus@cisco.com>
	Fri, 5 Mar 2021 11:36:29 +0000 (11:36 +0000)
committer	Peter Mikus <pmikus@cisco.com>
	Fri, 5 Mar 2021 11:51:51 +0000 (11:51 +0000)
terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl		patch \| blob \| history
terraform-ci-infra/1n_nmd/terraform.tfstate		patch \| blob \| history
terraform-ci-infra/1n_nmd/terraform.tfstate.backup		patch \| blob \| history