2 # The "region" parameter specifies the region in which to execute the job.
3 # If omitted, this inherits the default region name of "global".
6 # The "datacenters" parameter specifies the list of datacenters which should
7 # be considered when placing this task. This must be provided.
8 datacenters = "${datacenters}"
10 # The "type" parameter controls the type of job, which impacts the scheduler's
11 # decision on placement. This configuration is optional and defaults to
12 # "service". For a full list of job types and their differences, please see
13 # the online documentation.
15 # For more information, please see the online documentation at:
17 # https://www.nomadproject.io/docs/jobspec/schedulers
22 # The "max_parallel" parameter specifies the maximum number of updates to
23 # perform in parallel. In this case, this specifies to update a single task
27 health_check = "checks"
29 # The "min_healthy_time" parameter specifies the minimum time the allocation
30 # must be in the healthy state before it is marked as healthy and unblocks
31 # further allocations from being updated.
32 min_healthy_time = "10s"
34 # The "healthy_deadline" parameter specifies the deadline in which the
35 # allocation must be marked as healthy after which the allocation is
36 # automatically transitioned to unhealthy. Transitioning to unhealthy will
37 # fail the deployment and potentially roll back the job if "auto_revert" is
39 healthy_deadline = "3m"
41 # The "progress_deadline" parameter specifies the deadline in which an
42 # allocation must be marked as healthy. The deadline begins when the first
43 # allocation for the deployment is created and is reset whenever an allocation
44 # as part of the deployment transitions to a healthy state. If no allocation
45 # transitions to the healthy state before the progress deadline, the
46 # deployment is marked as failed.
47 progress_deadline = "10m"
50 # The "canary" parameter specifies that changes to the job that would result
51 # in destructive updates should create the specified number of canaries
52 # without stopping any previous allocations. Once the operator determines the
53 # canaries are healthy, they can be promoted which unblocks a rolling update
54 # of the remaining allocations at a rate of "max_parallel".
56 # Further, setting "canary" equal to the count of the task group allows
57 # blue/green deployments. When the job is updated, a full set of the new
58 # version is deployed and upon promotion the old version is stopped.
61 # Specifies if the job should auto-promote to the canary version when all
62 # canaries become healthy during a deployment. Defaults to false which means
63 # canaries must be manually updated with the nomad deployment promote
67 # The "auto_revert" parameter specifies if the job should auto-revert to the
68 # last stable job on deployment failure. A job is marked as stable if all the
69 # allocations as part of its deployment were marked healthy.
74 # The reschedule stanza specifies the group's rescheduling strategy. If
75 # specified at the job level, the configuration will apply to all groups
76 # within the job. If the reschedule stanza is present on both the job and the
77 # group, they are merged with the group stanza taking the highest precedence
81 delay_function = "constant"
85 # The "group" stanza defines a series of tasks that should be co-located on
86 # the same Nomad client. Any task within a group will be placed on the same
89 # For more information and examples on the "group" stanza, please see
90 # the online documentation at:
92 # https://www.nomadproject.io/docs/job-specification/group
94 group "prod-group1-${service_name}" {
95 # The "count" parameter specifies the number of the task groups that should
96 # be running under this group. This value must be non-negative and defaults
98 count = ${group_count}
100 # The restart stanza configures a task's behavior on task failure. Restarts
101 # happen on the client that is running the task.
103 # https://www.nomadproject.io/docs/job-specification/restart
112 # The volume stanza allows the group to specify that it requires a given
113 # volume from the cluster.
115 # For more information and examples on the "volume" stanza, please see
116 # the online documentation at:
118 # https://www.nomadproject.io/docs/job-specification/volume
120 %{ if use_host_volume }
121 volume "prod-volume1-${service_name}" {
124 source = "${host_volume}"
128 # The constraint allows restricting the set of eligible nodes. Constraints
129 # may filter on attributes or client metadata.
131 # For more information and examples on the "constraint" stanza, please see
132 # the online documentation at:
134 # https://www.nomadproject.io/docs/job-specification/constraint
137 attribute = "$${attr.cpu.arch}"
142 # The "task" stanza creates an individual unit of work, such as a Docker
143 # container, web application, or batch processing.
145 # For more information and examples on the "task" stanza, please see
146 # the online documentation at:
148 # https://www.nomadproject.io/docs/job-specification/task
150 task "prod-task1-${service_name}" {
151 # The "driver" parameter specifies the task driver that should be used to
155 %{ if use_host_volume }
157 volume = "prod-volume1-${service_name}"
158 destination = "${data_dir}"
163 %{ if use_vault_provider }
165 policies = "${vault_kv_policy_name}"
169 # The "config" stanza specifies the driver configuration, which is passed
170 # directly to the driver to start the task. The details of configurations
171 # are specific to each driver, so please see specific driver
172 # documentation for more information.
174 command = "local/prometheus-${version}.linux-amd64/prometheus"
176 "--config.file=secrets/prometheus.yml",
177 "--storage.tsdb.path=${data_dir}prometheus/",
178 "--storage.tsdb.retention.time=7d"
182 # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
183 # such as a file, tarball, or binary. Nomad downloads artifacts using the
184 # popular go-getter library, which permits downloading artifacts from a
185 # variety of locations using a URL as the input source.
187 # For more information and examples on the "artifact" stanza, please see
188 # the online documentation at:
190 # https://www.nomadproject.io/docs/job-specification/artifact
196 # The "template" stanza instructs Nomad to manage a template, such as
197 # a configuration file or script. This template can optionally pull data
198 # from Consul or Vault to populate runtime configuration data.
200 # For more information and examples on the "template" stanza, please see
201 # the online documentation at:
203 # https://www.nomadproject.io/docs/job-specification/template
207 change_signal = "SIGINT"
208 destination = "secrets/alerts.yml"
209 left_delimiter = "{{{"
210 right_delimiter = "}}}"
214 - name: "Jenkins Job Health Exporter"
216 - alert: JenkinsJobHealthExporterFailures
217 expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
222 summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
223 description: "Job: {{ $labels.id }}"
224 - alert: JenkinsJobHealthExporterUnstable
225 expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
230 summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
231 description: "Job: {{ $labels.id }}"
234 - alert: ConsulServiceHealthcheckFailed
235 expr: consul_catalog_service_node_healthy == 0
240 summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
241 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
242 - alert: ConsulMissingMasterNode
243 expr: consul_raft_peers < 3
248 summary: "Consul missing master node (instance {{ $labels.instance }})."
249 description: "The number of Consul raft peers should be 3, in order to preserve quorum."
250 - alert: ConsulAgentUnhealthy
251 expr: consul_health_node_status{status="critical"} == 1
256 summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
257 description: "A Consul agent is down."
266 summary: "Prometheus target missing (instance {{ $labels.instance }})."
267 description: "A Prometheus target has disappeared. An exporter might have crashed."
268 - alert: HostOutOfMemory
269 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
274 summary: "Host out of memory (instance {{ $labels.instance }})."
275 description: "Node memory is filling up (< 10% left)."
276 - alert: HostOomKillDetected
277 expr: increase(node_vmstat_oom_kill[1m]) > 0
282 summary: "Host OOM kill detected (instance {{ $labels.instance }})."
283 description: "OOM kill detected."
284 - alert: HostMemoryUnderMemoryPressure
285 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
290 summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
291 description: "The node is under heavy memory pressure. High rate of major page faults."
292 - alert: HostOutOfDiskSpace
293 expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
298 summary: "Host out of disk space (instance {{ $labels.instance }})."
299 description: "Disk is almost full (< 10% left)."
300 - alert: HostRaidDiskFailure
301 expr: node_md_disks{state="failed"} > 0
306 summary: "Host RAID disk failure (instance {{ $labels.instance }})."
307 description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
308 - alert: HostConntrackLimit
309 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
314 summary: "Host conntrack limit (instance {{ $labels.instance }})."
315 description: "The number of conntrack entries is approaching the limit."
316 - alert: HostNetworkInterfaceSaturated
317 expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
322 summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
323 description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
324 - alert: HostSystemdServiceCrashed
325 expr: node_systemd_unit_state{state="failed"} == 1
330 summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
331 description: "SystemD service crashed."
332 - alert: HostEdacCorrectableErrorsDetected
333 expr: increase(node_edac_correctable_errors_total[1m]) > 0
338 summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
339 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 1 minute.'
340 - alert: HostEdacUncorrectableErrorsDetected
341 expr: node_edac_uncorrectable_errors_total > 0
346 summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
347 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
350 - alert: MinioDiskOffline
351 expr: minio_offline_disks > 0
356 summary: "Minio disk offline (instance {{ $labels.instance }})"
357 description: "Minio disk is offline."
358 - alert: MinioStorageSpaceExhausted
359 expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
364 summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
365 description: "Minio storage space is low (< 10 GB)."
368 - alert: PrometheusConfigurationReloadFailure
369 expr: prometheus_config_last_reload_successful != 1
374 summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
375 description: "Prometheus configuration reload error."
376 - alert: PrometheusTooManyRestarts
377 expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
382 summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
383 description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
384 - alert: PrometheusAlertmanagerConfigurationReloadFailure
385 expr: alertmanager_config_last_reload_successful != 1
390 summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
391 description: "AlertManager configuration reload error."
392 - alert: PrometheusRuleEvaluationFailures
393 expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
398 summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
399 description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
400 - alert: PrometheusTargetScrapingSlow
401 expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
406 summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
407 description: "Prometheus is scraping exporters slowly."
408 - alert: PrometheusTsdbCompactionsFailed
409 expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
414 summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
415 description: "Prometheus encountered {{ $value }} TSDB compactions failures."
416 - alert: PrometheusTsdbHeadTruncationsFailed
417 expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
422 summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
423 description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
424 - alert: PrometheusTsdbWalCorruptions
425 expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
430 summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
431 description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
432 - alert: PrometheusTsdbWalTruncationsFailed
433 expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
438 summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
439 description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
445 change_signal = "SIGINT"
446 destination = "secrets/prometheus.yml"
452 evaluation_interval: 5s
457 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
458 services: [ 'alertmanager' ]
465 - job_name: 'Nomad Cluster'
467 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
468 services: [ 'nomad-client', 'nomad' ]
470 - source_labels: [__meta_consul_tags]
471 regex: '(.*)http(.*)'
473 metrics_path: /v1/metrics
475 format: [ 'prometheus' ]
477 - job_name: 'Consul Cluster'
479 - targets: [ '10.30.51.28:8500' ]
480 - targets: [ '10.30.51.29:8500' ]
481 - targets: [ '10.30.51.30:8500' ]
482 - targets: [ '10.30.51.32:8500' ]
483 - targets: [ '10.30.51.33:8500' ]
484 - targets: [ '10.30.51.34:8500' ]
485 - targets: [ '10.30.51.35:8500' ]
486 - targets: [ '10.30.51.39:8500' ]
487 - targets: [ '10.30.51.40:8500' ]
488 - targets: [ '10.30.51.50:8500' ]
489 - targets: [ '10.30.51.51:8500' ]
490 - targets: [ '10.30.51.65:8500' ]
491 - targets: [ '10.30.51.66:8500' ]
492 - targets: [ '10.30.51.67:8500' ]
493 - targets: [ '10.30.51.68:8500' ]
494 - targets: [ '10.30.51.70:8500' ]
495 - targets: [ '10.30.51.71:8500' ]
496 - targets: [ '10.32.8.14:8500' ]
497 - targets: [ '10.32.8.15:8500' ]
498 - targets: [ '10.32.8.16:8500' ]
499 - targets: [ '10.32.8.17:8500' ]
500 metrics_path: /v1/agent/metrics
502 format: [ 'prometheus' ]
504 - job_name: 'Blackbox Exporter (icmp)'
506 - targets: [ 'gerrit.fd.io' ]
507 - targets: [ 'jenkins.fd.io' ]
508 - targets: [ '10.30.51.32' ]
510 module: [ 'icmp_v4' ]
512 - source_labels: [__address__]
513 target_label: __param_target
514 - source_labels: [__param_target]
515 target_label: instance
516 - target_label: __address__
517 replacement: localhost:9115
520 - job_name: 'Blackbox Exporter (http)'
522 - targets: [ 'gerrit.fd.io' ]
523 - targets: [ 'jenkins.fd.io' ]
525 module: [ 'http_2xx' ]
527 - source_labels: [__address__]
528 target_label: __param_target
529 - source_labels: [__param_target]
530 target_label: instance
531 - target_label: __address__
532 replacement: localhost:9115
535 - job_name: 'Jenkins Job Health Exporter'
537 - targets: [ '10.30.51.32:9186' ]
538 metric_relabel_configs:
539 - source_labels: [ __name__ ]
540 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
544 - source_labels: [ __name__ ]
545 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
546 replacement: 'jenkins_job_$2'
547 target_label: __name__
549 - job_name: 'Node Exporter'
551 - targets: [ '10.30.51.28:9100' ]
552 - targets: [ '10.30.51.29:9100' ]
553 - targets: [ '10.30.51.30:9100' ]
554 - targets: [ '10.30.51.32:9100' ]
555 - targets: [ '10.30.51.33:9100' ]
556 - targets: [ '10.30.51.34:9100' ]
557 - targets: [ '10.30.51.35:9100' ]
558 - targets: [ '10.30.51.39:9100' ]
559 - targets: [ '10.30.51.40:9100' ]
560 - targets: [ '10.30.51.50:9100' ]
561 - targets: [ '10.30.51.51:9100' ]
562 - targets: [ '10.30.51.65:9100' ]
563 - targets: [ '10.30.51.66:9100' ]
564 - targets: [ '10.30.51.67:9100' ]
565 - targets: [ '10.30.51.68:9100' ]
566 - targets: [ '10.30.51.70:9100' ]
567 - targets: [ '10.30.51.71:9100' ]
568 - targets: [ '10.32.8.14:9100' ]
569 - targets: [ '10.32.8.15:9100' ]
570 - targets: [ '10.32.8.16:9100' ]
571 - targets: [ '10.32.8.17:9100' ]
573 - job_name: 'Alertmanager'
575 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
576 services: [ 'alertmanager' ]
578 - job_name: 'Grafana'
580 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
581 services: [ 'grafana' ]
583 - job_name: 'Prometheus'
585 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
586 services: [ 'prometheus' ]
589 bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
591 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
592 services: [ 'storage' ]
593 metrics_path: /minio/prometheus/metrics
597 # The service stanza instructs Nomad to register a service with Consul.
599 # For more information and examples on the "service" stanza, please see
600 # the online documentation at:
602 # https://www.nomadproject.io/docs/job-specification/service
605 name = "${service_name}"
606 port = "${service_name}"
607 tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
609 name = "Prometheus Check Live"
617 # The "resources" stanza describes the requirements a task needs to
618 # execute. Resource requirements include memory, network, cpu, and more.
619 # This ensures the task will execute on a machine that contains enough
622 # For more information and examples on the "resources" stanza, please see
623 # the online documentation at:
625 # https://www.nomadproject.io/docs/job-specification/resources
630 # The network stanza specifies the networking requirements for the task
631 # group, including the network mode and port allocations. When scheduling
632 # jobs in Nomad they are provisioned across your fleet of machines along
633 # with other jobs and services. Because you don't know in advance what host
634 # your job will be provisioned on, Nomad will provide your tasks with
635 # network configuration when they start up.
637 # For more information and examples on the "network" stanza, please see
638 # the online documentation at:
640 # https://www.nomadproject.io/docs/job-specification/network
643 port "${service_name}" {