Infra: Add reschedule policy
[csit.git] / terraform-ci-infra / 1n_nmd / prometheus / conf / nomad / prometheus.hcl
index 2d74662..adc3031 100644 (file)
@@ -71,6 +71,17 @@ job "${job_name}" {
 %{ endif }
   }
 
+  # The reschedule stanza specifies the group's rescheduling strategy. If
+  # specified at the job level, the configuration will apply to all groups
+  # within the job. If the reschedule stanza is present on both the job and
+  # the group, they are merged, with the group stanza taking precedence over
+  # the job stanza.
+  reschedule {
+    delay             = "30s"
+    delay_function    = "constant"
+    unlimited         = true
+  }
+
   # The "group" stanza defines a series of tasks that should be co-located on
   # the same Nomad client. Any task within a group will be placed on the same
   # client.
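
The reschedule stanza added above is declared at the job level, so it applies to every group in the job. As the comment notes, a group-level stanza would take precedence over it; a minimal sketch of such an override, where the group name and all values are illustrative and not part of this change:

    group "prometheus" {
      # Hypothetical group-level override: bounded rescheduling with
      # exponential backoff instead of the job-level unlimited policy.
      reschedule {
        attempts       = 5
        interval       = "1h"
        delay          = "30s"
        delay_function = "exponential"
        max_delay      = "10m"
        unlimited      = false
      }
    }
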
@@ -86,6 +97,18 @@ job "${job_name}" {
     # to 1.
     count               = ${group_count}
 
+    # The restart stanza configures a task's behavior on task failure. Restarts
+    # happen on the client that is running the task.
+    #
+    # https://www.nomadproject.io/docs/job-specification/restart
+    #
+    restart {
+      interval  = "30m"
+      attempts  = 40
+      delay     = "15s"
+      mode      = "delay"
+    }
+
     # The volume stanza allows the group to specify that it requires a given
     # volume from the cluster.
     #
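
With mode = "delay", the client keeps restarting the task locally: once the 40 attempts within the 30-minute interval are exhausted, it waits out the interval and starts retrying again. If the intent were instead to hand a persistently failing allocation back to the scheduler so the job-level reschedule policy can move it, mode = "fail" would be used; a sketch for comparison, with the same illustrative values:

    restart {
      interval = "30m"
      attempts = 40
      delay    = "15s"
      # "fail" marks the allocation as failed once the attempts are
      # exhausted, letting the job-level reschedule stanza place it on
      # another client; "delay" (used above) keeps retrying on the same
      # client.
      mode     = "fail"
    }
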
@@ -188,6 +211,24 @@ job "${job_name}" {
         data            = <<EOH
 ---
 groups:
+- name: "Jenkins Job Health Exporter"
+  rules:
+  - alert: JenkinsJobHealthExporterFailures
+    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+  - alert: JenkinsJobHealthExporterUnstable
+    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
 - name: "Consul"
   rules:
   - alert: ConsulServiceHealthcheckFailed
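
The two Jenkins alerts compare the failure and unstable counters against the success counter for the same job id. Rules of this shape can be checked offline with promtool's rule unit tests; a minimal sketch, assuming the rendered rules are saved as alerts.yml and using a made-up job id:

    # jenkins_alerts_test.yml (hypothetical file name), run with:
    #   promtool test rules jenkins_alerts_test.yml
    rule_files:
      - alerts.yml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # "csit_trending_daily" is an illustrative job id, not a real target.
          - series: 'jenkins_job_failure{id="csit_trending_daily"}'
            values: '4 4 4'
          - series: 'jenkins_job_success{id="csit_trending_daily"}'
            values: '1 1 1'
        alert_rule_test:
          - eval_time: 1m
            alertname: JenkinsJobHealthExporterFailures
            exp_alerts:
              - exp_labels:
                  severity: critical
                  id: csit_trending_daily
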
@@ -225,13 +266,13 @@ groups:
       summary: "Prometheus target missing (instance {{ $labels.instance }})."
       description: "A Prometheus target has disappeared. An exporter might be crashed."
   - alert: HostHighCpuLoad
-    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
     for: 0m
     labels:
       severity: warning
     annotations:
       summary: "Host high CPU load (instance {{ $labels.instance }})."
-      description: "CPU load is > 80%."
+      description: "CPU load is > 95%."
   - alert: HostOutOfMemory
     expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
     for: 2m
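
The HostHighCpuLoad expression averages the per-CPU idle rate over the window, scales it to a percentage, and subtracts it from 100 to get busy time: an instance idling only 3% of the time yields 100 - 3 = 97, which under the new rule must exceed 95 (rather than 80, over a 5-minute rather than a 2-minute window) before the warning fires, so short bursts no longer trigger it.
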
@@ -443,16 +484,31 @@ scrape_configs:
 
   - job_name: 'Consul Cluster'
     static_configs:
-      - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]
+      - targets: [ '10.30.51.28:8500' ]
+      - targets: [ '10.30.51.29:8500' ]
+      - targets: [ '10.30.51.30:8500' ]
+      - targets: [ '10.30.51.32:8500' ]
+      - targets: [ '10.30.51.33:8500' ]
+      - targets: [ '10.30.51.34:8500' ]
+      - targets: [ '10.30.51.35:8500' ]
+      - targets: [ '10.30.51.39:8500' ]
+      - targets: [ '10.30.51.40:8500' ]
+      - targets: [ '10.30.51.50:8500' ]
+      - targets: [ '10.30.51.51:8500' ]
+      - targets: [ '10.30.51.65:8500' ]
+      - targets: [ '10.30.51.66:8500' ]
+      - targets: [ '10.30.51.67:8500' ]
+      - targets: [ '10.30.51.68:8500' ]
+      - targets: [ '10.30.51.70:8500' ]
+      - targets: [ '10.30.51.71:8500' ]
+      - targets: [ '10.32.8.14:8500' ]
+      - targets: [ '10.32.8.15:8500' ]
+      - targets: [ '10.32.8.16:8500' ]
+      - targets: [ '10.32.8.17:8500' ]
     metrics_path: /v1/agent/metrics
     params:
       format: [ 'prometheus' ]
 
-  - job_name: 'Alertmanager'
-    consul_sd_configs:
-    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
-      services: [ 'alertmanager' ]
-
   - job_name: 'Blackbox Exporter (icmp)'
     static_configs:
       - targets: [ 'gerrit.fd.io' ]
@@ -485,20 +541,77 @@ scrape_configs:
     metrics_path: /probe
 
   - job_name: 'cAdvisor Exporter'
+    static_configs:
+      - targets: [ '10.30.51.28:8080' ]
+      - targets: [ '10.30.51.29:8080' ]
+      - targets: [ '10.30.51.30:8080' ]
+      #- targets: [ '10.30.51.32:8080' ]
+      - targets: [ '10.30.51.33:8080' ]
+      - targets: [ '10.30.51.34:8080' ]
+      - targets: [ '10.30.51.35:8080' ]
+      - targets: [ '10.30.51.39:8080' ]
+      - targets: [ '10.30.51.40:8080' ]
+      - targets: [ '10.30.51.50:8080' ]
+      - targets: [ '10.30.51.51:8080' ]
+      - targets: [ '10.30.51.65:8080' ]
+      - targets: [ '10.30.51.66:8080' ]
+      - targets: [ '10.30.51.67:8080' ]
+      - targets: [ '10.30.51.68:8080' ]
+      - targets: [ '10.30.51.70:8080' ]
+      - targets: [ '10.30.51.71:8080' ]
+      - targets: [ '10.32.8.14:8080' ]
+      - targets: [ '10.32.8.15:8080' ]
+      - targets: [ '10.32.8.16:8080' ]
+      - targets: [ '10.32.8.17:8080' ]
+
+  - job_name: 'Jenkins Job Health Exporter'
+    static_configs:
+      - targets: [ '10.30.51.32:9186' ]
+    metric_relabel_configs:
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        action: replace
+        replacement: '$1'
+        target_label: id
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        replacement: 'jenkins_job_$2'
+        target_label: __name__
+
+  - job_name: 'Node Exporter'
+    static_configs:
+      - targets: [ '10.30.51.28:9100' ]
+      - targets: [ '10.30.51.29:9100' ]
+      - targets: [ '10.30.51.30:9100' ]
+      - targets: [ '10.30.51.32:9100' ]
+      - targets: [ '10.30.51.33:9100' ]
+      - targets: [ '10.30.51.34:9100' ]
+      - targets: [ '10.30.51.35:9100' ]
+      - targets: [ '10.30.51.39:9100' ]
+      - targets: [ '10.30.51.40:9100' ]
+      - targets: [ '10.30.51.50:9100' ]
+      - targets: [ '10.30.51.51:9100' ]
+      - targets: [ '10.30.51.65:9100' ]
+      - targets: [ '10.30.51.66:9100' ]
+      - targets: [ '10.30.51.67:9100' ]
+      - targets: [ '10.30.51.68:9100' ]
+      - targets: [ '10.30.51.70:9100' ]
+      - targets: [ '10.30.51.71:9100' ]
+      - targets: [ '10.32.8.14:9100' ]
+      - targets: [ '10.32.8.15:9100' ]
+      - targets: [ '10.32.8.16:9100' ]
+      - targets: [ '10.32.8.17:9100' ]
+
+  - job_name: 'Alertmanager'
     consul_sd_configs:
     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
-      services: [ 'cadvisorexporter' ]
+      services: [ 'alertmanager' ]
 
   - job_name: 'Grafana'
     consul_sd_configs:
     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
       services: [ 'grafana' ]
 
-  - job_name: 'Node Exporter'
-    consul_sd_configs:
-    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
-      services: [ 'nodeexporter' ]
-
   - job_name: 'Prometheus'
     consul_sd_configs:
     - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
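
The metric_relabel_configs added for the 'Jenkins Job Health Exporter' job above rewrite the exporter's per-job metric names into a common metric family, moving the job name into an id label, which is what the Jenkins alert expressions earlier in this file match on. An illustrative transformation, using a made-up raw metric name:

    # Hypothetical raw series scraped from the exporter:
    #   csit_trending_daily_failure 4
    #
    # Rule 1: id       <- "$1"              => id="csit_trending_daily"
    # Rule 2: __name__ <- "jenkins_job_$2"  => jenkins_job_failure
    #
    # Series as stored by Prometheus:
    #   jenkins_job_failure{id="csit_trending_daily"} 4
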