X-Git-Url: https://gerrit.fd.io/r/gitweb?p=csit.git;a=blobdiff_plain;f=terraform-ci-infra%2F1n_nmd%2Fprometheus%2Fconf%2Fnomad%2Fprometheus.hcl;h=adc30318c4819c236b6f629b5b3667dd42ef28b4;hp=d851628fcdc31663bb866db3e59070fe62981cec;hb=9481aad815189d6251d36c11e3f901f9179dab40;hpb=0017c9d8372ef306ac73aae22bb0d17631c944d2 diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl index d851628fcd..adc30318c4 100644 --- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl +++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl @@ -71,6 +71,17 @@ job "${job_name}" { %{ endif } } + # The reschedule stanza specifies the group's rescheduling strategy. If + # specified at the job level, the configuration will apply to all groups + # within the job. If the reschedule stanza is present on both the job and the + # group, they are merged with the group stanza taking the highest precedence + # and then the job. + reschedule { + delay = "30s" + delay_function = "constant" + unlimited = true + } + # The "group" stanza defines a series of tasks that should be co-located on # the same Nomad client. Any task within a group will be placed on the same # client. @@ -86,6 +97,18 @@ job "${job_name}" { # to 1. count = ${group_count} + # The restart stanza configures a task's behavior on task failure. Restarts + # happen on the client that is running the task. + # + # https://www.nomadproject.io/docs/job-specification/restart + # + restart { + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" + } + # The volume stanza allows the group to specify that it requires a given # volume from the cluster. 
# @@ -191,7 +214,7 @@ groups: - name: "Jenkins Job Health Exporter" rules: - alert: JenkinsJobHealthExporterFailures - expr: jenkins_job_failure{id=~".*"} >= 10 + expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"} for: 0m labels: severity: critical @@ -199,7 +222,7 @@ groups: summary: "Jenkins Job Health detected high failure rate on jenkins jobs." description: "Job: {{ $labels.id }}" - alert: JenkinsJobHealthExporterUnstable - expr: jenkins_job_unstable{id=~".*"} >= 10 + expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"} for: 0m labels: severity: warning