# The "region" parameter specifies the region in which to execute the job.
# If omitted, this inherits the default region name of "global".
# The "datacenters" parameter specifies the list of datacenters which should
# be considered when placing this task. This must be provided.
datacenters = "${datacenters}"
# The "type" parameter controls the type of job, which impacts the scheduler's
# decision on placement. This configuration is optional and defaults to
# "service". For a full list of job types and their differences, please see
# the online documentation.
# For more information, please see the online documentation at:
# https://www.nomadproject.io/docs/jobspec/schedulers
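# As an illustrative sketch only (not a value set by this job), a one-shot
# workload would override the default like this:
#
# type = "batch"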
# The "max_parallel" parameter specifies the maximum number of updates to
# perform in parallel. In this case, a single task group is updated at a time.
health_check = "checks"
# The "min_healthy_time" parameter specifies the minimum time the allocation
# must be in the healthy state before it is marked as healthy and unblocks
# further allocations from being updated.
min_healthy_time = "10s"
# The "healthy_deadline" parameter specifies the deadline by which the
# allocation must be marked as healthy, after which the allocation is
# automatically transitioned to unhealthy. Transitioning to unhealthy will
# fail the deployment and potentially roll back the job if "auto_revert" is
# set to true.
healthy_deadline = "3m"
# The "progress_deadline" parameter specifies the deadline in which an
# allocation must be marked as healthy. The deadline begins when the first
# allocation for the deployment is created and is reset whenever an allocation
# as part of the deployment transitions to a healthy state. If no allocation
# transitions to the healthy state before the progress deadline, the
# deployment is marked as failed.
progress_deadline = "10m"
# The "canary" parameter specifies that changes to the job that would result
# in destructive updates should create the specified number of canaries
# without stopping any previous allocations. Once the operator determines the
# canaries are healthy, they can be promoted which unblocks a rolling update
# of the remaining allocations at a rate of "max_parallel".
# Further, setting "canary" equal to the count of the task group allows
# blue/green deployments. When the job is updated, a full set of the new
# version is deployed and upon promotion the old version is stopped.
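# As an illustrative sketch (the numbers below are assumptions, not this
# job's settings), a task group with count = 2 would get a blue/green
# rollout by also setting:
#
# canary = 2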
# Specifies if the job should auto-promote to the canary version when all
# canaries become healthy during a deployment. Defaults to false which means
# canaries must be manually promoted with the "nomad deployment promote"
# command.
# The "auto_revert" parameter specifies if the job should auto-revert to the
# last stable job on deployment failure. A job is marked as stable if all the
# allocations as part of its deployment were marked healthy.
# The reschedule stanza specifies the group's rescheduling strategy. If
# specified at the job level, the configuration will apply to all groups
# within the job. If the reschedule stanza is present on both the job and the
# group, they are merged with the group stanza taking the highest precedence
# and then the job.
delay_function = "constant"
# The "group" stanza defines a series of tasks that should be co-located on
# the same Nomad client. Any task within a group will be placed on the same
# client.
# For more information and examples on the "group" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/group
group "prod-group1-${service_name}" {
# The "count" parameter specifies the number of task groups that should
# be running under this group. This value must be non-negative and defaults
# to 1.
count = ${group_count}
# The restart stanza configures a task's behavior on task failure. Restarts
# happen on the client that is running the task.
# https://www.nomadproject.io/docs/job-specification/restart
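# A minimal sketch of a restart policy; the values are illustrative
# assumptions, not the ones configured for this job:
#
# restart {
#   interval = "30m"
#   attempts = 2
#   delay    = "15s"
#   mode     = "fail"
# }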
# The volume stanza allows the group to specify that it requires a given
# volume from the cluster.
# For more information and examples on the "volume" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/volume
%{ if use_host_volume }
volume "prod-volume1-${service_name}" {
source = "${host_volume}"
# The constraint allows restricting the set of eligible nodes. Constraints
# may filter on attributes or client metadata.
# For more information and examples on the "constraint" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/constraint
attribute = "$${attr.cpu.arch}"
# The "task" stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
# For more information and examples on the "task" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/task
task "prod-task1-${service_name}" {
# The "driver" parameter specifies the task driver that should be used to
# run the task.
%{ if use_host_volume }
volume = "prod-volume1-${service_name}"
destination = "${data_dir}"
%{ if use_vault_provider }
policies = "${vault_kv_policy_name}"
# The "config" stanza specifies the driver configuration, which is passed
# directly to the driver to start the task. The details of configurations
# are specific to each driver, so please see specific driver
# documentation for more information.
command = "local/prometheus-${version}.linux-amd64/prometheus"
"--config.file=secrets/prometheus.yml",
"--storage.tsdb.path=${data_dir}prometheus/",
"--storage.tsdb.retention.time=15d"
# The artifact stanza instructs Nomad to fetch and unpack a remote resource,
# such as a file, tarball, or binary. Nomad downloads artifacts using the
# popular go-getter library, which permits downloading artifacts from a
# variety of locations using a URL as the input source.
# For more information and examples on the "artifact" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/artifact
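# A sketch of the artifact syntax; the go-getter source below is an assumed
# example of a Prometheus release tarball, not necessarily the URL this job
# fetches:
#
# artifact {
#   source = "https://github.com/prometheus/prometheus/releases/download/v${version}/prometheus-${version}.linux-amd64.tar.gz"
# }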
# The "template" stanza instructs Nomad to manage a template, such as
# a configuration file or script. This template can optionally pull data
# from Consul or Vault to populate runtime configuration data.
# For more information and examples on the "template" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/template
change_signal = "SIGINT"
destination = "secrets/alerts.yml"
left_delimiter = "{{{"
right_delimiter = "}}}"
- name: "Jenkins Job Health Exporter"
- alert: JenkinsJobHealthExporterFailures
expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: JenkinsJobHealthExporterUnstable
expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
description: "Job: {{ $labels.id }}"
- alert: ConsulServiceHealthcheckFailed
expr: consul_catalog_service_node_healthy == 0
summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
- alert: ConsulMissingMasterNode
expr: consul_raft_peers < 3
summary: "Consul missing master node (instance {{ $labels.instance }})."
description: "The number of Consul raft peers should be 3 in order to preserve quorum."
- alert: ConsulAgentUnhealthy
expr: consul_health_node_status{status="critical"} == 1
summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
description: "A Consul agent is down."
summary: "Prometheus target missing (instance {{ $labels.instance }})."
description: "A Prometheus target has disappeared. An exporter might have crashed."
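# The rule below derives CPU busy percentage: 100 minus the per-instance
# average rate of the "idle" CPU mode, scaled to percent.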
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
summary: "Host high CPU load (instance {{ $labels.instance }})."
description: "CPU load is > 95%."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
summary: "Host out of memory (instance {{ $labels.instance }})."
description: "Node memory is filling up (< 10% left)."
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
summary: "Host OOM kill detected (instance {{ $labels.instance }})."
description: "OOM kill detected."
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
description: "The node is under heavy memory pressure. High rate of major page faults."
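# The rule below computes remaining disk space as a percentage; the
# "and ON (instance, device, mountpoint)" join excludes read-only filesystems.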
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
summary: "Host out of disk space (instance {{ $labels.instance }})."
description: "Disk is almost full (< 10% left)."
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
summary: "Host RAID disk failure (instance {{ $labels.instance }})."
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
summary: "Host conntrack limit (instance {{ $labels.instance }})."
description: "The number of conntrack entries is approaching the limit."
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
description: "SystemD service crashed."
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
description: '{{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- alert: MinioDiskOffline
expr: minio_offline_disks > 0
summary: "Minio disk offline (instance {{ $labels.instance }})"
description: "Minio disk is offline."
- alert: MinioStorageSpaceExhausted
expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
description: "Minio storage space is low (< 10 GB)."
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
description: "Prometheus configuration reload error."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
description: "AlertManager configuration reload error."
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
description: "Prometheus is scraping exporters slowly."
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB compaction failures."
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
change_signal = "SIGINT"
destination = "secrets/prometheus.yml"
evaluation_interval: 5s
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Nomad Cluster'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'nomad-client', 'nomad' ]
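# The relabeling below (presumably paired with a "keep" action) limits
# scraping to Consul-registered services tagged "http" and pulls Nomad
# telemetry from /v1/metrics in Prometheus format.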
- source_labels: [__meta_consul_tags]
regex: '(.*)http(.*)'
metrics_path: /v1/metrics
format: [ 'prometheus' ]
- job_name: 'Consul Cluster'
- targets: [ '10.30.51.28:8500' ]
- targets: [ '10.30.51.29:8500' ]
- targets: [ '10.30.51.30:8500' ]
- targets: [ '10.30.51.32:8500' ]
- targets: [ '10.30.51.33:8500' ]
- targets: [ '10.30.51.34:8500' ]
- targets: [ '10.30.51.35:8500' ]
- targets: [ '10.30.51.39:8500' ]
- targets: [ '10.30.51.40:8500' ]
- targets: [ '10.30.51.50:8500' ]
- targets: [ '10.30.51.51:8500' ]
- targets: [ '10.30.51.65:8500' ]
- targets: [ '10.30.51.66:8500' ]
- targets: [ '10.30.51.67:8500' ]
- targets: [ '10.30.51.68:8500' ]
- targets: [ '10.30.51.70:8500' ]
- targets: [ '10.30.51.71:8500' ]
- targets: [ '10.32.8.14:8500' ]
- targets: [ '10.32.8.15:8500' ]
- targets: [ '10.32.8.16:8500' ]
- targets: [ '10.32.8.17:8500' ]
metrics_path: /v1/agent/metrics
format: [ 'prometheus' ]
- job_name: 'Blackbox Exporter (icmp)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
- targets: [ '10.30.51.32' ]
module: [ 'icmp_v4' ]
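# Standard blackbox-exporter relabeling: each target becomes the probe's
# "target" parameter and its "instance" label, while the scrape address is
# rewritten to the local blackbox exporter on port 9115.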
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'Blackbox Exporter (http)'
- targets: [ 'gerrit.fd.io' ]
- targets: [ 'jenkins.fd.io' ]
module: [ 'http_2xx' ]
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9115
- job_name: 'cAdvisor Exporter'
- targets: [ '10.30.51.28:8080' ]
- targets: [ '10.30.51.29:8080' ]
- targets: [ '10.30.51.30:8080' ]
#- targets: [ '10.30.51.32:8080' ]
- targets: [ '10.30.51.33:8080' ]
- targets: [ '10.30.51.34:8080' ]
- targets: [ '10.30.51.35:8080' ]
- targets: [ '10.30.51.39:8080' ]
- targets: [ '10.30.51.40:8080' ]
- targets: [ '10.30.51.50:8080' ]
- targets: [ '10.30.51.51:8080' ]
- targets: [ '10.30.51.65:8080' ]
- targets: [ '10.30.51.66:8080' ]
- targets: [ '10.30.51.67:8080' ]
- targets: [ '10.30.51.68:8080' ]
- targets: [ '10.30.51.70:8080' ]
- targets: [ '10.30.51.71:8080' ]
- targets: [ '10.32.8.14:8080' ]
- targets: [ '10.32.8.15:8080' ]
- targets: [ '10.32.8.16:8080' ]
- targets: [ '10.32.8.17:8080' ]
- job_name: 'Jenkins Job Health Exporter'
- targets: [ '10.30.51.32:9186' ]
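# The metric relabeling below filters for the vpp_*/csit_* job metrics and
# renames them to jenkins_job_<suffix> (e.g. jenkins_job_failure), the form
# queried by the alert rules above.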
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
- source_labels: [ __name__ ]
regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
replacement: 'jenkins_job_$2'
target_label: __name__
- job_name: 'Node Exporter'
- targets: [ '10.30.51.28:9100' ]
- targets: [ '10.30.51.29:9100' ]
- targets: [ '10.30.51.30:9100' ]
- targets: [ '10.30.51.32:9100' ]
- targets: [ '10.30.51.33:9100' ]
- targets: [ '10.30.51.34:9100' ]
- targets: [ '10.30.51.35:9100' ]
- targets: [ '10.30.51.39:9100' ]
- targets: [ '10.30.51.40:9100' ]
- targets: [ '10.30.51.50:9100' ]
- targets: [ '10.30.51.51:9100' ]
- targets: [ '10.30.51.65:9100' ]
- targets: [ '10.30.51.66:9100' ]
- targets: [ '10.30.51.67:9100' ]
- targets: [ '10.30.51.68:9100' ]
- targets: [ '10.30.51.70:9100' ]
- targets: [ '10.30.51.71:9100' ]
- targets: [ '10.32.8.14:9100' ]
- targets: [ '10.32.8.15:9100' ]
- targets: [ '10.32.8.16:9100' ]
- targets: [ '10.32.8.17:9100' ]
- job_name: 'Alertmanager'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'alertmanager' ]
- job_name: 'Grafana'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'grafana' ]
- job_name: 'Prometheus'
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'prometheus' ]
bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
- server: '{{ env "NOMAD_IP_prometheus" }}:8500'
services: [ 'storage' ]
metrics_path: /minio/prometheus/metrics
# The service stanza instructs Nomad to register a service with Consul.
# For more information and examples on the "service" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/service
name = "${service_name}"
port = "${service_name}"
tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
name = "Prometheus Check Live"
# The "resources" stanza describes the requirements a task needs to
# execute. Resource requirements include memory, network, cpu, and more.
# This ensures the task will execute on a machine that contains enough
# resource capacity.
# For more information and examples on the "resources" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/resources
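# A minimal sketch of a resources block; the figures are illustrative
# assumptions, not the values reserved by this job:
#
# resources {
#   cpu    = 2000
#   memory = 8192
# }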
# The network stanza specifies the networking requirements for the task
# group, including the network mode and port allocations. When scheduling
# jobs in Nomad they are provisioned across your fleet of machines along
# with other jobs and services. Because you don't know in advance what host
# your job will be provisioned on, Nomad will provide your tasks with
# network configuration when they start up.
# For more information and examples on the "network" stanza, please see
# the online documentation at:
# https://www.nomadproject.io/docs/job-specification/network
port "${service_name}" {