# The "region" parameter specifies the region in which to execute the job.
# If omitted, this inherits the default region name of "global".

# The "datacenters" parameter specifies the list of datacenters which should
# be considered when placing this task. This must be provided.
datacenters = "${datacenters}"
# The "type" parameter controls the type of job, which impacts the scheduler's
# decision on placement. This configuration is optional and defaults to
# "service". For a full list of job types and their differences, please see
# the online documentation.
#
# For more information, please see the online documentation at:
#
#   https://www.nomadproject.io/docs/jobspec/schedulers
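# (Illustrative only, not taken from this excerpt: a long-lived monitoring
# service such as this one would typically be declared as a service job.)
# type = "service"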
# The "max_parallel" parameter specifies the maximum number of updates to
# perform in parallel. In this case, this specifies to update a single task
# group at a time.
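# (Illustrative sketch matching the single-task-group rollout described above;
# the value is an assumption, not taken from this template.)
# max_parallel = 1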
health_check = "checks"
# The "min_healthy_time" parameter specifies the minimum time the allocation
# must be in the healthy state before it is marked as healthy and unblocks
# further allocations from being updated.
min_healthy_time = "10s"
# The "healthy_deadline" parameter specifies the deadline by which the
# allocation must be marked as healthy, after which the allocation is
# automatically transitioned to unhealthy. Transitioning to unhealthy will
# fail the deployment and potentially roll back the job if "auto_revert" is
# set to true.
healthy_deadline = "3m"
# The "progress_deadline" parameter specifies the deadline by which an
# allocation must be marked as healthy. The deadline begins when the first
# allocation for the deployment is created and is reset whenever an allocation
# that is part of the deployment transitions to a healthy state. If no
# allocation transitions to the healthy state before the progress deadline,
# the deployment is marked as failed.
progress_deadline = "10m"
# The "canary" parameter specifies that changes to the job that would result
# in destructive updates should create the specified number of canaries
# without stopping any previous allocations. Once the operator determines the
# canaries are healthy, they can be promoted which unblocks a rolling update
# of the remaining allocations at a rate of "max_parallel".
#
# Further, setting "canary" equal to the count of the task group allows
# blue/green deployments. When the job is updated, a full set of the new
# version is deployed and upon promotion the old version is stopped.
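# (Illustrative only; the actual value is not shown in this excerpt. A single
# canary per task group is a common starting point.)
# canary = 1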
# Specifies if the job should auto-promote to the canary version when all
# canaries become healthy during a deployment. Defaults to false which means
# canaries must be manually promoted with the "nomad deployment promote"
# command.
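# (Illustrative only; this template's actual setting is not shown here.)
# auto_promote = true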
# The "auto_revert" parameter specifies if the job should auto-revert to the
# last stable job on deployment failure. A job is marked as stable if all the
# allocations as part of its deployment were marked healthy.
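# (Illustrative only; this template's actual setting is not shown here.)
# auto_revert = true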
# The reschedule stanza specifies the group's rescheduling strategy. If
# specified at the job level, the configuration will apply to all groups
# within the job. If the reschedule stanza is present on both the job and the
# group, they are merged with the group stanza taking the highest precedence
# and then the job.
delay_function = "constant"
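# (Illustrative sketch of the remaining reschedule parameters; the values are
# assumptions, not taken from this excerpt.)
# delay     = "30s"
# unlimited = true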
# The "group" stanza defines a series of tasks that should be co-located on
# the same Nomad client. Any task within a group will be placed on the same
# client.
#
# For more information and examples on the "group" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/group
group "prod-group1-${service_name}" {
# The "count" parameter specifies the number of the task groups that should
# be running under this group. This value must be non-negative and defaults
# to 1.
count = ${group_count}
# The restart stanza configures a task's behavior on task failure. Restarts
# happen on the client that is running the task.
#
#   https://www.nomadproject.io/docs/job-specification/restart
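# (Illustrative sketch of a restart policy; the values below are assumptions,
# not the ones used by this template.)
# restart {
#   interval = "30m"
#   attempts = 2
#   delay    = "15s"
#   mode     = "fail"
# }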
# The volume stanza allows the group to specify that it requires a given
# volume from the cluster.
#
# For more information and examples on the "volume" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/volume
%{ if use_host_volume }
volume "prod-volume1-${service_name}" {
source = "${host_volume}"
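# (Illustrative only; the "type" and "read_only" settings of this volume are
# not shown in this excerpt.)
# type      = "host"
# read_only = false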
# The constraint allows restricting the set of eligible nodes. Constraints
# may filter on attributes or client metadata.
#
# For more information and examples on the "constraint" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/constraint
attribute = "$${attr.cpu.arch}"

attribute = "$${node.class}"
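# (Illustrative sketch of a complete constraint block built around the
# attributes above; the operator and value are assumptions, not taken from
# this template.)
# constraint {
#   attribute = "$${node.class}"
#   operator  = "=="
#   value     = "monitoring"
# }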
# The "task" stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
#
# For more information and examples on the "task" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/task
task "prod-task1-${service_name}" {
# The "driver" parameter specifies the task driver that should be used to
# run the task.
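# (Assumption for illustration only: a statically linked Prometheus binary
# fetched via "artifact" is typically run with the exec driver; the actual
# value is not shown in this excerpt.)
# driver = "exec"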
%{ if use_host_volume }
volume = "prod-volume1-${service_name}"
destination = "${data_dir}"

%{ if use_vault_provider }
policies = "${vault_kv_policy_name}"
# The "config" stanza specifies the driver configuration, which is passed
# directly to the driver to start the task. The details of configurations
# are specific to each driver, so please see specific driver
# documentation for more information.
command = "local/prometheus-${version}.linux-amd64/prometheus"
181 "--config.file=secrets/prometheus.yml",
182 "--storage.tsdb.path=${data_dir}prometheus/",
183 "--storage.tsdb.retention.time=7d"
# The artifact stanza instructs Nomad to fetch and unpack a remote resource,
# such as a file, tarball, or binary. Nomad downloads artifacts using the
# popular go-getter library, which permits downloading artifacts from a
# variety of locations using a URL as the input source.
#
# For more information and examples on the "artifact" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/artifact
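# (Illustrative sketch only; the source URL below is a placeholder and not
# the one rendered by this template.)
# artifact {
#   source = "https://example.org/prometheus-${version}.linux-amd64.tar.gz"
# }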
# The "template" stanza instructs Nomad to manage a template, such as
# a configuration file or script. This template can optionally pull data
# from Consul or Vault to populate runtime configuration data.
#
# For more information and examples on the "template" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/template
change_signal   = "SIGINT"
destination     = "secrets/alerts.yml"
left_delimiter  = "{{{"
right_delimiter = "}}}"
- name: "Jenkins Job Health Exporter"
- alert: JenkinsJobHealthExporterFailures
expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: JenkinsJobHealthExporterUnstable
expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: ConsulServiceHealthcheckFailed
expr: consul_catalog_service_node_healthy == 0
summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
- alert: ConsulMissingMasterNode
expr: consul_raft_peers < 3
summary: "Consul missing master node (instance {{ $labels.instance }})."
description: "The number of Consul raft peers should be 3 in order to preserve quorum."
- alert: ConsulAgentUnhealthy
expr: consul_health_node_status{status="critical"} == 1
summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
description: "A Consul agent is down."
summary: "Prometheus target missing (instance {{ $labels.instance }})."
description: "A Prometheus target has disappeared. An exporter might have crashed."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
summary: "Host out of memory (instance {{ $labels.instance }})."
description: "Node memory is filling up (< 10% left)."
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
summary: "Host OOM kill detected (instance {{ $labels.instance }})."
description: "OOM kill detected."
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
description: "The node is under heavy memory pressure. High rate of major page faults."
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
summary: "Host out of disk space (instance {{ $labels.instance }})."
description: "Disk is almost full (< 10% left)."
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
summary: "Host RAID disk failure (instance {{ $labels.instance }})."
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
summary: "Host conntrack limit (instance {{ $labels.instance }})."
description: "The number of conntrack entries is approaching the limit."
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
description: "SystemD service crashed."
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- alert: MinioDiskOffline
expr: minio_offline_disks > 0
summary: "Minio disk offline (instance {{ $labels.instance }})."
description: "Minio disk is offline."
- alert: MinioStorageSpaceExhausted
expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
description: "Minio storage space is low (< 10 GB)."
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
description: "Prometheus configuration reload error."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
description: "AlertManager configuration reload error."
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
description: "Prometheus is scraping exporters slowly."
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB compaction failures."
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
change_signal = "SIGINT"
destination   = "secrets/prometheus.yml"

evaluation_interval: 5s
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Nomad Cluster'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'nomad-client', 'nomad' ]
- source_labels: [__meta_consul_tags]
regex: '(.*)http(.*)'
metrics_path: /v1/metrics
format: [ 'prometheus' ]
- job_name: 'Consul Cluster'
- targets: [ '10.30.51.22:8500' ]
- targets: [ '10.30.51.24:8500' ]
- targets: [ '10.30.51.25:8500' ]
- targets: [ '10.30.51.26:8500' ]
- targets: [ '10.30.51.28:8500' ]
- targets: [ '10.30.51.29:8500' ]
- targets: [ '10.30.51.30:8500' ]
- targets: [ '10.30.51.39:8500' ]
- targets: [ '10.30.51.40:8500' ]
- targets: [ '10.30.51.50:8500' ]
- targets: [ '10.30.51.51:8500' ]
- targets: [ '10.30.51.65:8500' ]
- targets: [ '10.30.51.66:8500' ]
- targets: [ '10.30.51.67:8500' ]
- targets: [ '10.30.51.68:8500' ]
- targets: [ '10.30.51.70:8500' ]
- targets: [ '10.30.51.71:8500' ]
- targets: [ '10.32.8.14:8500' ]
- targets: [ '10.32.8.15:8500' ]
- targets: [ '10.32.8.16:8500' ]
- targets: [ '10.32.8.17:8500' ]
metrics_path: /v1/agent/metrics
format: [ 'prometheus' ]
- job_name: 'Blackbox Exporter (icmp)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
- targets: [ '10.32.8.17' ]
module: [ 'icmp_v4' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Blackbox Exporter (http)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
module: [ 'http_2xx' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Jenkins Job Health Exporter'
- targets: [ '10.30.51.22:9186' ]
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
replacement: 'jenkins_job_$2'
target_label: __name__
- job_name: 'Node Exporter'
- targets: [ '10.30.51.22:9100' ]
- targets: [ '10.30.51.24:9100' ]
- targets: [ '10.30.51.25:9100' ]
- targets: [ '10.30.51.26:9100' ]
- targets: [ '10.30.51.28:9100' ]
- targets: [ '10.30.51.29:9100' ]
- targets: [ '10.30.51.30:9100' ]
- targets: [ '10.30.51.39:9100' ]
- targets: [ '10.30.51.40:9100' ]
- targets: [ '10.30.51.50:9100' ]
- targets: [ '10.30.51.51:9100' ]
- targets: [ '10.30.51.65:9100' ]
- targets: [ '10.30.51.66:9100' ]
- targets: [ '10.30.51.67:9100' ]
- targets: [ '10.30.51.68:9100' ]
- targets: [ '10.30.51.70:9100' ]
- targets: [ '10.30.51.71:9100' ]
- targets: [ '10.32.8.14:9100' ]
- targets: [ '10.32.8.15:9100' ]
- targets: [ '10.32.8.16:9100' ]
- targets: [ '10.32.8.17:9100' ]
- job_name: 'Alertmanager'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Grafana'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'grafana' ]
- job_name: 'Prometheus'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'prometheus' ]
bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'storage' ]
metrics_path: /minio/prometheus/metrics
# The service stanza instructs Nomad to register a service with Consul.
#
# For more information and examples on the "service" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/service
name = "${service_name}"
port = "${service_name}"
tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]

name = "Prometheus Check Live"
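# (Illustrative sketch of the rest of the health check; the type, path, and
# timing values are assumptions, not taken from this excerpt.)
# type     = "http"
# path     = "/-/healthy"
# interval = "10s"
# timeout  = "2s"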
# The "resources" stanza describes the requirements a task needs to
# execute. Resource requirements include memory, network, cpu, and more.
# This ensures the task will execute on a machine that contains enough
# resource capacity.
#
# For more information and examples on the "resources" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/resources
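# (Illustrative only; the cpu and memory figures below are assumptions, not
# the values rendered by this template.)
# resources {
#   cpu    = 2000
#   memory = 8192
# }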
# The network stanza specifies the networking requirements for the task
# group, including the network mode and port allocations. When scheduling
# jobs in Nomad they are provisioned across your fleet of machines along
# with other jobs and services. Because you don't know in advance what host
# your job will be provisioned on, Nomad will provide your tasks with
# network configuration when they start up.
#
# For more information and examples on the "network" stanza, please see
# the online documentation at:
#
#   https://www.nomadproject.io/docs/job-specification/network

port "${service_name}" {
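# (Illustrative only; the static port value is an assumption, 9090 being the
# conventional Prometheus port, and is not taken from this excerpt.)
# static = 9090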