2 # The "region" parameter specifies the region in which to execute the job.
3 # If omitted, this inherits the default region name of "global".
6 # The "datacenters" parameter specifies the list of datacenters which should
7 # be considered when placing this task. This must be provided.
8 datacenters = "${datacenters}"
10 # The "type" parameter controls the type of job, which impacts the scheduler's
11 # decision on placement. This configuration is optional and defaults to
12 # "service". For a full list of job types and their differences, please see
13 # the online documentation.
15 # For more information, please see the online documentation at:
17 # https://www.nomadproject.io/docs/jobspec/schedulers
22 # The "max_parallel" parameter specifies the maximum number of updates to
23 # perform in parallel. In this case, this specifies to update a single task
27 health_check = "checks"
29 # The "min_healthy_time" parameter specifies the minimum time the allocation
30 # must be in the healthy state before it is marked as healthy and unblocks
31 # further allocations from being updated.
32 min_healthy_time = "10s"
34 # The "healthy_deadline" parameter specifies the deadline in which the
35 # allocation must be marked as healthy after which the allocation is
36 # automatically transitioned to unhealthy. Transitioning to unhealthy will
37 # fail the deployment and potentially roll back the job if "auto_revert" is
39 healthy_deadline = "3m"
41 # The "progress_deadline" parameter specifies the deadline in which an
42 # allocation must be marked as healthy. The deadline begins when the first
43 # allocation for the deployment is created and is reset whenever an allocation
44 # as part of the deployment transitions to a healthy state. If no allocation
45 # transitions to the healthy state before the progress deadline, the
46 # deployment is marked as failed.
47 progress_deadline = "10m"
50 # The "canary" parameter specifies that changes to the job that would result
51 # in destructive updates should create the specified number of canaries
52 # without stopping any previous allocations. Once the operator determines the
53 # canaries are healthy, they can be promoted which unblocks a rolling update
54 # of the remaining allocations at a rate of "max_parallel".
56 # Further, setting "canary" equal to the count of the task group allows
57 # blue/green deployments. When the job is updated, a full set of the new
58 # version is deployed and upon promotion the old version is stopped.
61 # Specifies if the job should auto-promote to the canary version when all
62 # canaries become healthy during a deployment. Defaults to false which means
63 # canaries must be manually updated with the nomad deployment promote
67 # The "auto_revert" parameter specifies if the job should auto-revert to the
68 # last stable job on deployment failure. A job is marked as stable if all the
69 # allocations as part of its deployment were marked healthy.
74 # The "group" stanza defines a series of tasks that should be co-located on
75 # the same Nomad client. Any task within a group will be placed on the same
78 # For more information and examples on the "group" stanza, please see
79 # the online documentation at:
81 # https://www.nomadproject.io/docs/job-specification/group
83 group "prod-group1-${service_name}" {
84 # The "count" parameter specifies the number of instances of this task
85 # group that should be running. This value must be non-negative and defaults
87 count = ${group_count}
89 # The volume stanza allows the group to specify that it requires a given
90 # volume from the cluster.
92 # For more information and examples on the "volume" stanza, please see
93 # the online documentation at:
95 # https://www.nomadproject.io/docs/job-specification/volume
97 %{ if use_host_volume }
98 volume "prod-volume1-${service_name}" {
101 source = "${host_volume}"
105 # The constraint allows restricting the set of eligible nodes. Constraints
106 # may filter on attributes or client metadata.
108 # For more information and examples on the "constraint" stanza, please see
109 # the online documentation at:
111 # https://www.nomadproject.io/docs/job-specification/constraint
114 attribute = "$${attr.cpu.arch}"
119 # The "task" stanza creates an individual unit of work, such as a Docker
120 # container, web application, or batch processing.
122 # For more information and examples on the "task" stanza, please see
123 # the online documentation at:
125 # https://www.nomadproject.io/docs/job-specification/task
127 task "prod-task1-${service_name}" {
128 # The "driver" parameter specifies the task driver that should be used to
132 %{ if use_host_volume }
134 volume = "prod-volume1-${service_name}"
135 destination = "${data_dir}"
140 %{ if use_vault_provider }
142 policies = "${vault_kv_policy_name}"
146 # The "config" stanza specifies the driver configuration, which is passed
147 # directly to the driver to start the task. The details of configurations
148 # are specific to each driver, so please see specific driver
149 # documentation for more information.
151 command = "local/prometheus-${version}.linux-amd64/prometheus"
153 "--config.file=secrets/prometheus.yml",
154 "--storage.tsdb.path=${data_dir}prometheus/",
155 "--storage.tsdb.retention.time=15d"
159 # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
160 # such as a file, tarball, or binary. Nomad downloads artifacts using the
161 # popular go-getter library, which permits downloading artifacts from a
162 # variety of locations using a URL as the input source.
164 # For more information and examples on the "artifact" stanza, please see
165 # the online documentation at:
167 # https://www.nomadproject.io/docs/job-specification/artifact
173 # The "template" stanza instructs Nomad to manage a template, such as
174 # a configuration file or script. This template can optionally pull data
175 # from Consul or Vault to populate runtime configuration data.
177 # For more information and examples on the "template" stanza, please see
178 # the online documentation at:
180 # https://www.nomadproject.io/docs/job-specification/template
184 change_signal = "SIGINT"
185 destination = "secrets/alerts.yml"
186 left_delimiter = "{{{"
187 right_delimiter = "}}}"
191 - name: "Jenkins Job Health Exporter"
193 - alert: JenkinsJobHealthExporterFailures
194 expr: jenkins_job_failure{id=~".*"} >= 10
199 summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
200 description: "Job: {{ $labels.id }}"
201 - alert: JenkinsJobHealthExporterUnstable
202 expr: jenkins_job_unstable{id=~".*"} >= 10
207 summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
208 description: "Job: {{ $labels.id }}"
211 - alert: ConsulServiceHealthcheckFailed
212 expr: consul_catalog_service_node_healthy == 0
217 summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
218 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
219 - alert: ConsulMissingMasterNode
220 expr: consul_raft_peers < 3
225 summary: "Consul missing master node (instance {{ $labels.instance }})."
226 description: "The number of Consul raft peers should be 3 in order to preserve quorum."
227 - alert: ConsulAgentUnhealthy
228 expr: consul_health_node_status{status="critical"} == 1
233 summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
234 description: "A Consul agent is down."
243 summary: "Prometheus target missing (instance {{ $labels.instance }})."
244 description: "A Prometheus target has disappeared. An exporter might be crashed."
245 - alert: HostHighCpuLoad
246 expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
251 summary: "Host high CPU load (instance {{ $labels.instance }})."
252 description: "CPU load is > 95%."
253 - alert: HostOutOfMemory
254 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
259 summary: "Host out of memory (instance {{ $labels.instance }})."
260 description: "Node memory is filling up (< 10% left)."
261 - alert: HostOomKillDetected
262 expr: increase(node_vmstat_oom_kill[1m]) > 0
267 summary: "Host OOM kill detected (instance {{ $labels.instance }})."
268 description: "OOM kill detected."
269 - alert: HostMemoryUnderMemoryPressure
270 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
275 summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
276 description: "The node is under heavy memory pressure. High rate of major page faults."
277 - alert: HostOutOfDiskSpace
278 expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
283 summary: "Host out of disk space (instance {{ $labels.instance }})."
284 description: "Disk is almost full (< 10% left)."
285 - alert: HostRaidDiskFailure
286 expr: node_md_disks{state="failed"} > 0
291 summary: "Host RAID disk failure (instance {{ $labels.instance }})."
292 description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
293 - alert: HostConntrackLimit
294 expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
299 summary: "Host conntrack limit (instance {{ $labels.instance }})."
300 description: "The number of conntrack entries is approaching the limit."
301 - alert: HostNetworkInterfaceSaturated
302 expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
307 summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
308 description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
309 - alert: HostSystemdServiceCrashed
310 expr: node_systemd_unit_state{state="failed"} == 1
315 summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
316 description: "SystemD service crashed."
317 - alert: HostEdacCorrectableErrorsDetected
318 expr: increase(node_edac_correctable_errors_total[1m]) > 0
323 summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
324 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
325 - alert: HostEdacUncorrectableErrorsDetected
326 expr: node_edac_uncorrectable_errors_total > 0
331 summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
332 description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
335 - alert: MinioDiskOffline
336 expr: minio_offline_disks > 0
341 summary: "Minio disk offline (instance {{ $labels.instance }})"
342 description: "Minio disk is offline."
343 - alert: MinioStorageSpaceExhausted
344 expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
349 summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
350 description: "Minio storage space is low (< 10 GB)."
353 - alert: PrometheusConfigurationReloadFailure
354 expr: prometheus_config_last_reload_successful != 1
359 summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
360 description: "Prometheus configuration reload error."
361 - alert: PrometheusTooManyRestarts
362 expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
367 summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
368 description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
369 - alert: PrometheusAlertmanagerConfigurationReloadFailure
370 expr: alertmanager_config_last_reload_successful != 1
375 summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
376 description: "AlertManager configuration reload error."
377 - alert: PrometheusRuleEvaluationFailures
378 expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
383 summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
384 description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
385 - alert: PrometheusTargetScrapingSlow
386 expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
391 summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
392 description: "Prometheus is scraping exporters slowly."
393 - alert: PrometheusTsdbCompactionsFailed
394 expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
399 summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
400 description: "Prometheus encountered {{ $value }} TSDB compactions failures."
401 - alert: PrometheusTsdbHeadTruncationsFailed
402 expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
407 summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
408 description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
409 - alert: PrometheusTsdbWalCorruptions
410 expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
415 summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
416 description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
417 - alert: PrometheusTsdbWalTruncationsFailed
418 expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
423 summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
424 description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
430 change_signal = "SIGINT"
431 destination = "secrets/prometheus.yml"
437 evaluation_interval: 5s
442 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
443 services: [ 'alertmanager' ]
450 - job_name: 'Nomad Cluster'
452 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
453 services: [ 'nomad-client', 'nomad' ]
455 - source_labels: [__meta_consul_tags]
456 regex: '(.*)http(.*)'
458 metrics_path: /v1/metrics
460 format: [ 'prometheus' ]
462 - job_name: 'Consul Cluster'
464 - targets: [ '10.30.51.28:8500' ]
465 - targets: [ '10.30.51.29:8500' ]
466 - targets: [ '10.30.51.30:8500' ]
467 - targets: [ '10.30.51.32:8500' ]
468 - targets: [ '10.30.51.33:8500' ]
469 - targets: [ '10.30.51.34:8500' ]
470 - targets: [ '10.30.51.35:8500' ]
471 - targets: [ '10.30.51.39:8500' ]
472 - targets: [ '10.30.51.40:8500' ]
473 - targets: [ '10.30.51.50:8500' ]
474 - targets: [ '10.30.51.51:8500' ]
475 - targets: [ '10.30.51.65:8500' ]
476 - targets: [ '10.30.51.66:8500' ]
477 - targets: [ '10.30.51.67:8500' ]
478 - targets: [ '10.30.51.68:8500' ]
479 - targets: [ '10.30.51.70:8500' ]
480 - targets: [ '10.30.51.71:8500' ]
481 - targets: [ '10.32.8.14:8500' ]
482 - targets: [ '10.32.8.15:8500' ]
483 - targets: [ '10.32.8.16:8500' ]
484 - targets: [ '10.32.8.17:8500' ]
485 metrics_path: /v1/agent/metrics
487 format: [ 'prometheus' ]
489 - job_name: 'Blackbox Exporter (icmp)'
491 - targets: [ 'gerrit.fd.io' ]
492 - targets: [ 'jenkins.fd.io' ]
493 - targets: [ '10.30.51.32' ]
495 module: [ 'icmp_v4' ]
497 - source_labels: [__address__]
498 target_label: __param_target
499 - source_labels: [__param_target]
500 target_label: instance
501 - target_label: __address__
502 replacement: localhost:9115
505 - job_name: 'Blackbox Exporter (http)'
507 - targets: [ 'gerrit.fd.io' ]
508 - targets: [ 'jenkins.fd.io' ]
510 module: [ 'http_2xx' ]
512 - source_labels: [__address__]
513 target_label: __param_target
514 - source_labels: [__param_target]
515 target_label: instance
516 - target_label: __address__
517 replacement: localhost:9115
520 - job_name: 'cAdvisor Exporter'
522 - targets: [ '10.30.51.28:8080' ]
523 - targets: [ '10.30.51.29:8080' ]
524 - targets: [ '10.30.51.30:8080' ]
525 #- targets: [ '10.30.51.32:8080' ]
526 - targets: [ '10.30.51.33:8080' ]
527 - targets: [ '10.30.51.34:8080' ]
528 - targets: [ '10.30.51.35:8080' ]
529 - targets: [ '10.30.51.39:8080' ]
530 - targets: [ '10.30.51.40:8080' ]
531 - targets: [ '10.30.51.50:8080' ]
532 - targets: [ '10.30.51.51:8080' ]
533 - targets: [ '10.30.51.65:8080' ]
534 - targets: [ '10.30.51.66:8080' ]
535 - targets: [ '10.30.51.67:8080' ]
536 - targets: [ '10.30.51.68:8080' ]
537 - targets: [ '10.30.51.70:8080' ]
538 - targets: [ '10.30.51.71:8080' ]
539 - targets: [ '10.32.8.14:8080' ]
540 - targets: [ '10.32.8.15:8080' ]
541 - targets: [ '10.32.8.16:8080' ]
542 - targets: [ '10.32.8.17:8080' ]
544 - job_name: 'Jenkins Job Health Exporter'
546 - targets: [ '10.30.51.32:9186' ]
547 metric_relabel_configs:
548 - source_labels: [ __name__ ]
549 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
553 - source_labels: [ __name__ ]
554 regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
555 replacement: 'jenkins_job_$2'
556 target_label: __name__
558 - job_name: 'Node Exporter'
560 - targets: [ '10.30.51.28:9100' ]
561 - targets: [ '10.30.51.29:9100' ]
562 - targets: [ '10.30.51.30:9100' ]
563 - targets: [ '10.30.51.32:9100' ]
564 - targets: [ '10.30.51.33:9100' ]
565 - targets: [ '10.30.51.34:9100' ]
566 - targets: [ '10.30.51.35:9100' ]
567 - targets: [ '10.30.51.39:9100' ]
568 - targets: [ '10.30.51.40:9100' ]
569 - targets: [ '10.30.51.50:9100' ]
570 - targets: [ '10.30.51.51:9100' ]
571 - targets: [ '10.30.51.65:9100' ]
572 - targets: [ '10.30.51.66:9100' ]
573 - targets: [ '10.30.51.67:9100' ]
574 - targets: [ '10.30.51.68:9100' ]
575 - targets: [ '10.30.51.70:9100' ]
576 - targets: [ '10.30.51.71:9100' ]
577 - targets: [ '10.32.8.14:9100' ]
578 - targets: [ '10.32.8.15:9100' ]
579 - targets: [ '10.32.8.16:9100' ]
580 - targets: [ '10.32.8.17:9100' ]
582 - job_name: 'Alertmanager'
584 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
585 services: [ 'alertmanager' ]
587 - job_name: 'Grafana'
589 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
590 services: [ 'grafana' ]
592 - job_name: 'Prometheus'
594 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
595 services: [ 'prometheus' ]
598 bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
600 - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
601 services: [ 'storage' ]
602 metrics_path: /minio/prometheus/metrics
606 # The service stanza instructs Nomad to register a service with Consul.
608 # For more information and examples on the "service" stanza, please see
609 # the online documentation at:
611 # https://www.nomadproject.io/docs/job-specification/service
614 name = "${service_name}"
615 port = "${service_name}"
616 tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
618 name = "Prometheus Check Live"
626 # The "resources" stanza describes the requirements a task needs to
627 # execute. Resource requirements include memory, network, cpu, and more.
628 # This ensures the task will execute on a machine that contains enough
631 # For more information and examples on the "resources" stanza, please see
632 # the online documentation at:
634 # https://www.nomadproject.io/docs/job-specification/resources
639 # The network stanza specifies the networking requirements for the task
640 # group, including the network mode and port allocations. When scheduling
641 # jobs in Nomad they are provisioned across your fleet of machines along
642 # with other jobs and services. Because you don't know in advance what host
643 # your job will be provisioned on, Nomad will provide your tasks with
644 # network configuration when they start up.
646 # For more information and examples on the "network" stanza, please see
647 # the online documentation at:
649 # https://www.nomadproject.io/docs/job-specification/network
652 port "${service_name}" {