2 # The "region" parameter specifies the region in which to execute the job.
3 # If omitted, this inherits the default region name of "global".
6 # The "datacenters" parameter specifies the list of datacenters which should
7 # be considered when placing this task. This must be provided.
8 datacenters = "${datacenters}"
10 # The "type" parameter controls the type of job, which impacts the scheduler's
11 # decision on placement. This configuration is optional and defaults to
12 # "service". For a full list of job types and their differences, please see
13 # the online documentation.
15 # https://www.nomadproject.io/docs/jobspec/schedulers
20 # The "max_parallel" parameter specifies the maximum number of updates to
21 # perform in parallel. For example, a value of 1 updates a single task at a time.
23 max_parallel = ${max_parallel}
25 health_check = "checks"
27 # The "min_healthy_time" parameter specifies the minimum time the allocation
28 # must be in the healthy state before it is marked as healthy and unblocks
29 # further allocations from being updated.
30 min_healthy_time = "10s"
32 # The "healthy_deadline" parameter specifies the deadline in which the
33 # allocation must be marked as healthy after which the allocation is
34 # automatically transitioned to unhealthy. Transitioning to unhealthy will
35 # fail the deployment and potentially roll back the job if "auto_revert" is set to true.
37 healthy_deadline = "3m"
39 # The "progress_deadline" parameter specifies the deadline in which an
40 # allocation must be marked as healthy. The deadline begins when the first
41 # allocation for the deployment is created and is reset whenever an allocation
42 # as part of the deployment transitions to a healthy state. If no allocation
43 # transitions to the healthy state before the progress deadline, the
44 # deployment is marked as failed.
45 progress_deadline = "10m"
48 # The "canary" parameter specifies that changes to the job that would result
49 # in destructive updates should create the specified number of canaries
50 # without stopping any previous allocations. Once the operator determines the
51 # canaries are healthy, they can be promoted which unblocks a rolling update
52 # of the remaining allocations at a rate of "max_parallel".
54 # Further, setting "canary" equal to the count of the task group allows
55 # blue/green deployments. When the job is updated, a full set of the new
56 # version is deployed and upon promotion the old version is stopped.
59 # Specifies if the job should auto-promote to the canary version when all
60 # canaries become healthy during a deployment. Defaults to false which means
61 # canaries must be manually promoted with the "nomad deployment promote" command.
63 auto_promote = ${auto_promote}
65 # The "auto_revert" parameter specifies if the job should auto-revert to the
66 # last stable job on deployment failure. A job is marked as stable if all the
67 # allocations as part of its deployment were marked healthy.
68 auto_revert = ${auto_revert}
72 # The "group" stanza defines a series of tasks that should be co-located on
73 # the same Nomad client. Any task within a group will be placed on the same client.
76 # https://www.nomadproject.io/docs/job-specification/group
78 group "${job_name}-group-1" {
79 # The "count" parameter specifies the number of instances of this task group
80 # that should be running. This value must be non-negative and defaults to 1.
82 count = ${group_count}
84 # The volume stanza allows the group to specify that it requires a given
85 # volume from the cluster. The key of the stanza is the name of the volume
86 # as it will be exposed to task configuration.
88 # https://www.nomadproject.io/docs/job-specification/volume
89 %{ if use_host_volume }
90 volume "${job_name}-volume-1" {
93 source = "${volume_source}"
97 # The restart stanza configures a task's behavior on task failure. Restarts
98 # happen on the client that is running the task.
100 # https://www.nomadproject.io/docs/job-specification/restart
109 # The constraint allows restricting the set of eligible nodes. Constraints
110 # may filter on attributes or client metadata.
112 # https://www.nomadproject.io/docs/job-specification/constraint
115 attribute = "$${attr.cpu.arch}"
121 attribute = "$${node.class}"
125 # The network stanza specifies the networking requirements for the task
126 # group, including the network mode and port allocations. When scheduling
127 # jobs in Nomad they are provisioned across your fleet of machines along
128 # with other jobs and services. Because you don't know in advance what host
129 # your job will be provisioned on, Nomad will provide your tasks with
130 # network configuration when they start up.
132 # https://www.nomadproject.io/docs/job-specification/network
135 port "${service_name}" {
141 # The "task" stanza creates an individual unit of work, such as a Docker
142 # container, web application, or batch processing.
144 # https://www.nomadproject.io/docs/job-specification/task
146 task "${job_name}-task-1" {
147 # The "driver" parameter specifies the task driver that should be used to run the task.
151 %{ if use_host_volume }
153 volume = "${job_name}-volume-1"
154 destination = "${volume_destination}"
159 %{ if use_vault_provider }
161 policies = "${vault_kv_policy_name}"
165 # The "config" stanza specifies the driver configuration, which is passed
166 # directly to the driver to start the task. The details of configurations
167 # are specific to each driver, so please see specific driver
168 # documentation for more information.
170 command = "local/prometheus-${version}.linux-amd64/prometheus"
172 "--config.file=secrets/prometheus.yml",
173 "--storage.tsdb.path=${volume_destination}prometheus/",
174 "--storage.tsdb.retention.time=7d"
178 # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
179 # such as a file, tarball, or binary. Nomad downloads artifacts using the
180 # popular go-getter library, which permits downloading artifacts from a
181 # variety of locations using a URL as the input source.
183 # https://www.nomadproject.io/docs/job-specification/artifact
189 # The "template" stanza instructs Nomad to manage a template, such as
190 # a configuration file or script. This template can optionally pull data
191 # from Consul or Vault to populate runtime configuration data.
193 # https://www.nomadproject.io/docs/job-specification/template
197 change_signal = "SIGINT"
198 destination = "secrets/alerts.yml"
199 left_delimiter = "{{{"
200 right_delimiter = "}}}"
204 - name: "Jenkins Job Health Exporter"
206 - alert: JenkinsJobHealthExporterFailures
207 expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
212 summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
213 description: "Job: {{ $labels.id }}"
214 - alert: JenkinsJobHealthExporterUnstable
215 expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
220 summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
221 description: "Job: {{ $labels.id }}"
224 - alert: ConsulServiceHealthcheckFailed
225 expr: consul_catalog_service_node_healthy == 0
230 summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
231 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
232 - alert: ConsulMissingMasterNode
233 expr: consul_raft_peers < 3
238 summary: "Consul missing master node (instance {{ $labels.instance }})."
239 description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
240 - alert: ConsulAgentUnhealthy
241 expr: consul_health_node_status{status="critical"} == 1
246 summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
247 description: "A Consul agent is down."
256 summary: "Prometheus target missing (instance {{ $labels.instance }})."
257 description: "A Prometheus target has disappeared. An exporter might be crashed."
258 - alert: HostOutOfMemory
259 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
264 summary: "Host out of memory (instance {{ $labels.instance }})."
265 description: "Node memory is filling up (< 10% left)."
266 - alert: HostOomKillDetected
267 expr: increase(node_vmstat_oom_kill[1m]) > 0
272 summary: "Host OOM kill detected (instance {{ $labels.instance }})."
273 description: "OOM kill detected."
274 - alert: HostMemoryUnderMemoryPressure
275 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
280 summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
281 description: "The node is under heavy memory pressure. High rate of major page faults."
282 - alert: HostOutOfDiskSpace
283 expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
288 summary: "Host out of disk space (instance {{ $labels.instance }})."
289 description: "Disk is almost full (< 10% left)."
290 - alert: HostRaidDiskFailure
291 expr: node_md_disks{state="failed"} > 0
296 summary: "Host RAID disk failure (instance {{ $labels.instance }})."
297 description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
298 - alert: HostConntrackLimit
299 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
304 summary: "Host conntrack limit (instance {{ $labels.instance }})."
305 description: "The number of conntrack is approching limit."
306 - alert: HostNetworkInterfaceSaturated
307 expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
312 summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
313 description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
314 - alert: HostSystemdServiceCrashed
315 expr: node_systemd_unit_state{state="failed"} == 1
320 summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
321 description: "SystemD service crashed."
322 - alert: HostEdacCorrectableErrorsDetected
323 expr: increase(node_edac_correctable_errors_total[1m]) > 0
328 summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
329 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
330 - alert: HostEdacUncorrectableErrorsDetected
331 expr: node_edac_uncorrectable_errors_total > 0
336 summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
337 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
340 - alert: MinioDiskOffline
341 expr: minio_offline_disks > 0
346 summary: "Minio disk offline (instance {{ $labels.instance }})"
347 description: "Minio disk is offline."
348 - alert: MinioStorageSpaceExhausted
349 expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
354 summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
355 description: "Minio storage space is low (< 10 GB)."
358 - alert: PrometheusConfigurationReloadFailure
359 expr: prometheus_config_last_reload_successful != 1
364 summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
365 description: "Prometheus configuration reload error."
366 - alert: PrometheusTooManyRestarts
367 expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
372 summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
373 description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
374 - alert: PrometheusAlertmanagerConfigurationReloadFailure
375 expr: alertmanager_config_last_reload_successful != 1
380 summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
381 description: "AlertManager configuration reload error."
382 - alert: PrometheusRuleEvaluationFailures
383 expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
388 summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
389 description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
390 - alert: PrometheusTargetScrapingSlow
391 expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
396 summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
397 description: "Prometheus is scraping exporters slowly."
398 - alert: PrometheusTsdbCompactionsFailed
399 expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
404 summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
405 description: "Prometheus encountered {{ $value }} TSDB compactions failures."
406 - alert: PrometheusTsdbHeadTruncationsFailed
407 expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
412 summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
413 description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
414 - alert: PrometheusTsdbWalCorruptions
415 expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
420 summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
421 description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
422 - alert: PrometheusTsdbWalTruncationsFailed
423 expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
428 summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
429 description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
435 change_signal = "SIGINT"
436 destination = "secrets/prometheus.yml"
442 evaluation_interval: 5s
447 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
448 services: [ 'alertmanager' ]
455 - job_name: 'Nomad Cluster'
457 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
458 services: [ 'nomad-client', 'nomad' ]
460 - source_labels: [__meta_consul_tags]
461 regex: '(.*)http(.*)'
463 metrics_path: /v1/metrics
465 format: [ 'prometheus' ]
467 - job_name: 'Consul Cluster'
469 - targets: [ '10.30.51.16:8500' ]
470 - targets: [ '10.30.51.17:8500' ]
471 - targets: [ '10.30.51.18:8500' ]
472 - targets: [ '10.30.51.19:8500' ]
473 - targets: [ '10.30.51.20:8500' ]
474 - targets: [ '10.30.51.21:8500' ]
475 - targets: [ '10.30.51.22:8500' ]
476 - targets: [ '10.30.51.23:8500' ]
477 - targets: [ '10.30.51.24:8500' ]
478 - targets: [ '10.30.51.25:8500' ]
479 - targets: [ '10.30.51.26:8500' ]
480 - targets: [ '10.30.51.50:8500' ]
481 - targets: [ '10.30.51.51:8500' ]
482 - targets: [ '10.30.51.70:8500' ]
483 - targets: [ '10.30.51.71:8500' ]
484 - targets: [ '10.30.51.91:8500' ]
485 - targets: [ '10.30.51.92:8500' ]
486 metrics_path: /v1/agent/metrics
488 format: [ 'prometheus' ]
490 - job_name: 'Jenkins Job Health Exporter'
492 - targets: [ '10.30.51.22:9186' ]
493 metric_relabel_configs:
494 - source_labels: [ __name__ ]
495 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
499 - source_labels: [ __name__ ]
500 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
501 replacement: 'jenkins_job_$2'
502 target_label: __name__
504 - job_name: 'Node Exporter'
506 - targets: [ '10.30.51.16:9100' ]
507 - targets: [ '10.30.51.17:9100' ]
508 - targets: [ '10.30.51.18:9100' ]
509 - targets: [ '10.30.51.19:9100' ]
510 - targets: [ '10.30.51.20:9100' ]
511 - targets: [ '10.30.51.21:9100' ]
512 - targets: [ '10.30.51.22:9100' ]
513 - targets: [ '10.30.51.23:9100' ]
514 - targets: [ '10.30.51.24:9100' ]
515 - targets: [ '10.30.51.25:9100' ]
516 - targets: [ '10.30.51.26:9100' ]
517 - targets: [ '10.30.51.50:9100' ]
518 - targets: [ '10.30.51.51:9100' ]
519 - targets: [ '10.30.51.70:9100' ]
520 - targets: [ '10.30.51.71:9100' ]
521 - targets: [ '10.30.51.91:9100' ]
522 - targets: [ '10.30.51.92:9100' ]
524 - job_name: 'Alertmanager'
526 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
527 services: [ 'alertmanager' ]
529 - job_name: 'Grafana'
531 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
532 services: [ 'grafana' ]
534 - job_name: 'Prometheus'
536 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
537 services: [ 'prometheus' ]
540 bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
542 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
543 services: [ 'storage' ]
544 metrics_path: /minio/prometheus/metrics
548 # The service stanza instructs Nomad to register a service with Consul.
550 # https://www.nomadproject.io/docs/job-specification/service
553 name = "${service_name}"
554 port = "${service_name}"
555 tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
557 name = "Prometheus Check Live"
565 # The "resources" stanza describes the requirements a task needs to
566 # execute. Resource requirements include memory, network, cpu, and more.
567 # This ensures the task will execute on a machine that contains enough resource capacity.
570 # https://www.nomadproject.io/docs/job-specification/resources