fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region    = "${region}"
   5
   6   # The "datacenters" parameter specifies the list of datacenters which should
   7   # be considered when placing this task. This must be provided.
   8   datacenters = "${datacenters}"
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   #     https://www.nomadproject.io/docs/jobspec/schedulers
  16   #
  17   type        = "service"
  18
  19   update {
  20     # The "max_parallel" parameter specifies the maximum number of updates to
  21     # perform in parallel. In this case, this specifies to update a single task
  22     # at a time.
  23     max_parallel      = ${max_parallel}
  24
  25     health_check      = "checks"
  26
  27     # The "min_healthy_time" parameter specifies the minimum time the allocation
  28     # must be in the healthy state before it is marked as healthy and unblocks
  29     # further allocations from being updated.
  30     min_healthy_time  = "10s"
  31
  32     # The "healthy_deadline" parameter specifies the deadline in which the
  33     # allocation must be marked as healthy after which the allocation is
  34     # automatically transitioned to unhealthy. Transitioning to unhealthy will
  35     # fail the deployment and potentially roll back the job if "auto_revert" is
  36     # set to true.
  37     healthy_deadline  = "3m"
  38
  39     # The "progress_deadline" parameter specifies the deadline in which an
  40     # allocation must be marked as healthy. The deadline begins when the first
  41     # allocation for the deployment is created and is reset whenever an allocation
  42     # as part of the deployment transitions to a healthy state. If no allocation
  43     # transitions to the healthy state before the progress deadline, the
  44     # deployment is marked as failed.
  45     progress_deadline = "10m"
  46
  47 %{ if use_canary }
  48     # The "canary" parameter specifies that changes to the job that would result
  49     # in destructive updates should create the specified number of canaries
  50     # without stopping any previous allocations. Once the operator determines the
  51     # canaries are healthy, they can be promoted which unblocks a rolling update
  52     # of the remaining allocations at a rate of "max_parallel".
  53     #
  54     # Further, setting "canary" equal to the count of the task group allows
  55     # blue/green deployments. When the job is updated, a full set of the new
  56     # version is deployed and upon promotion the old version is stopped.
  57     canary            = ${canary}
  58
  59     # Specifies if the job should auto-promote to the canary version when all
  60     # canaries become healthy during a deployment. Defaults to false which means
  61     # canaries must be manually updated with the nomad deployment promote
  62     # command.
  63     auto_promote      = ${auto_promote}
  64
  65     # The "auto_revert" parameter specifies if the job should auto-revert to the
  66     # last stable job on deployment failure. A job is marked as stable if all the
  67     # allocations as part of its deployment were marked healthy.
  68     auto_revert       = ${auto_revert}
  69 %{ endif }
  70   }
  71
  72   # All groups in this job should be scheduled on different hosts.
  73   constraint {
  74     operator = "distinct_hosts"
  75     value    = "true"
  76   }
  77
  78   # The "group" stanza defines a series of tasks that should be co-located on
  79   # the same Nomad client. Any task within a group will be placed on the same
  80   # client.
  81   #
  82   #     https://www.nomadproject.io/docs/job-specification/group
  83   #
  84   group "${job_name}-group-1" {
  85     # The "count" parameter specifies the number of the task groups that should
  86     # be running under this group. This value must be non-negative and defaults
  87     # to 1.
  88     count = ${group_count}
  89
  90     # The volume stanza allows the group to specify that it requires a given
  91     # volume from the cluster. The key of the stanza is the name of the volume
  92     # as it will be exposed to task configuration.
  93     #
  94     # https://www.nomadproject.io/docs/job-specification/volume
  95     %{ if use_host_volume }
  96     volume "${job_name}-volume-1" {
  97       type      = "host"
  98       read_only = false
  99       source    = "${volume_source}"
 100     }
 101     %{ endif }
 102
  103     # The restart stanza configures a task's behavior on task failure. Restarts
 104     # happen on the client that is running the task.
 105     #
 106     # https://www.nomadproject.io/docs/job-specification/restart
 107     #
 108     restart {
 109       interval = "30m"
 110       attempts = 40
 111       delay    = "15s"
 112       mode     = "delay"
 113     }
 114
 115     # The constraint allows restricting the set of eligible nodes. Constraints
 116     # may filter on attributes or client metadata.
 117     #
 118     #     https://www.nomadproject.io/docs/job-specification/constraint
 119     #
 120     constraint {
 121       attribute = "$${attr.cpu.arch}"
 122       operator  = "!="
 123       value     = "arm64"
 124     }
 125
 126     constraint {
 127       attribute = "$${node.class}"
 128       value     = "builder"
 129     }
 130
 131     # The network stanza specifies the networking requirements for the task
 132     # group, including the network mode and port allocations. When scheduling
 133     # jobs in Nomad they are provisioned across your fleet of machines along
 134     # with other jobs and services. Because you don't know in advance what host
 135     # your job will be provisioned on, Nomad will provide your tasks with
 136     # network configuration when they start up.
 137     #
 138     #     https://www.nomadproject.io/docs/job-specification/network
 139     #
 140     network {
 141       port "${service_name}" {
 142         static = ${port}
 143         to     = ${port}
 144       }
 145     }
 146
 147     # The "task" stanza creates an individual unit of work, such as a Docker
 148     # container, web application, or batch processing.
 149     #
 150     #     https://www.nomadproject.io/docs/job-specification/task
 151     #
 152     task "${job_name}-task-1" {
 153       # The "driver" parameter specifies the task driver that should be used to
 154       # run the task.
 155       driver = "exec"
 156
 157     %{ if use_host_volume }
 158       volume_mount {
 159         volume      = "${job_name}-volume-1"
 160         destination = "${volume_destination}"
 161         read_only   = false
 162       }
 163     %{ endif }
 164
 165     %{ if use_vault_provider }
 166       vault {
 167         policies = "${vault_kv_policy_name}"
 168       }
 169     %{ endif }
 170
 171       # The "config" stanza specifies the driver configuration, which is passed
 172       # directly to the driver to start the task. The details of configurations
 173       # are specific to each driver, so please see specific driver
 174       # documentation for more information.
 175       config {
 176         command = "local/alertmanager-${version}.linux-amd64/alertmanager"
 177         args    = [
 178           "--config.file=secrets/alertmanager.yml"
 179         ]
 180       }
 181
 182       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 183       # such as a file, tarball, or binary. Nomad downloads artifacts using the
 184       # popular go-getter library, which permits downloading artifacts from a
 185       # variety of locations using a URL as the input source.
 186       #
 187       #     https://www.nomadproject.io/docs/job-specification/artifact
 188       #
 189       artifact {
 190         source = "${url}"
 191       }
 192
 193       # The "template" stanza instructs Nomad to manage a template, such as
 194       # a configuration file or script. This template can optionally pull data
 195       # from Consul or Vault to populate runtime configuration data.
 196       #
 197       #     https://www.nomadproject.io/docs/job-specification/template
 198       #
 199       template {
 200         change_mode     = "noop"
 201         change_signal   = "SIGINT"
 202         destination     = "secrets/alertmanager.yml"
 203         left_delimiter  = "{{{"
 204         right_delimiter = "}}}"
 205         data            = <<EOH
 206 # The directory from which notification templates are read.
 207 templates:
 208 - '/etc/alertmanager/template/*.tmpl'
 209
 210 #tls_config:
 211 #  # CA certificate to validate the server certificate with.
  212 #  ca_file: <filepath>
 213 #
 214 #  # Certificate and key files for client cert authentication to the server.
 215 #  cert_file: <filepath>
 216 #  key_file: <filepath>
 217 #
 218 #  # ServerName extension to indicate the name of the server.
 219 #  # http://tools.ietf.org/html/rfc4366#section-3.1
 220 #  server_name: <string>
 221 #
 222 #  # Disable validation of the server certificate.
 223 #  insecure_skip_verify: true
 224
 225 # The root route on which each incoming alert enters.
 226 route:
 227   receiver: '${slack_default_receiver}'
 228
 229   # The labels by which incoming alerts are grouped together. For example,
 230   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 231   # be batched into a single group.
 232   #
 233   # To aggregate by all possible labels use '...' as the sole label name.
 234   # This effectively disables aggregation entirely, passing through all
 235   # alerts as-is. This is unlikely to be what you want, unless you have
 236   # a very low alert volume or your upstream notification system performs
 237   # its own grouping. Example: group_by: [...]
 238   group_by: ['alertname']
 239
 240   # When a new group of alerts is created by an incoming alert, wait at
 241   # least 'group_wait' to send the initial notification.
  242   # This ensures that multiple alerts for the same group that start firing
  243   # shortly after one another are batched together in the first
  244   # notification.
 245   group_wait: 30s
 246
 247   # When the first notification was sent, wait 'group_interval' to send a batch
 248   # of new alerts that started firing for that group.
 249   group_interval: 5m
 250
 251   # If an alert has successfully been sent, wait 'repeat_interval' to
 252   # resend them.
 253   repeat_interval: 3h
 254
 255   # All the above attributes are inherited by all child routes and can
 256   # overwritten on each.
 257   # The child route trees.
 258   routes:
 259   - match_re:
 260       alertname: JenkinsJob.*
 261     receiver: ${slack_jenkins_receiver}
 262     routes:
 263     - match:
 264         severity: critical
 265       receiver: '${slack_jenkins_receiver}'
 266
 267   - match_re:
 268       service: .*
 269     receiver: ${slack_default_receiver}
 270     routes:
 271     - match:
 272         severity: critical
 273       receiver: '${slack_default_receiver}'
 274
 275 # Inhibition rules allow to mute a set of alerts given that another alert is
 276 # firing.
 277 # We use this to mute any warning-level notifications if the same alert is
 278 # already critical.
 279 inhibit_rules:
 280 - source_match:
 281     severity: 'critical'
 282   target_match:
 283     severity: 'warning'
 284   equal: ['alertname', 'instance']
 285
 286 receivers:
 287 - name: '${slack_jenkins_receiver}'
 288   slack_configs:
 289   - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
 290     channel: '#${slack_jenkins_channel}'
 291     send_resolved: true
 292     icon_url: https://avatars3.githubusercontent.com/u/3380462
 293     title: |-
 294      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 295      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 296        {{" "}}(
 297        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 298          {{- range $index, $label := .SortedPairs -}}
 299            {{ if $index }}, {{ end }}
 300            {{- $label.Name }}="{{ $label.Value -}}"
 301          {{- end }}
 302        {{- end -}}
 303        )
 304      {{- end }}
 305     text: >-
 306      {{ range .Alerts -}}
 307      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 308
 309      *Description:* {{ .Annotations.description }}
 310
 311      *Details:*
 312        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 313        {{ end }}
 314      {{ end }}
 315
 316 - name: '${slack_default_receiver}'
 317   slack_configs:
 318   - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
 319     channel: '#${slack_default_channel}'
 320     send_resolved: true
 321     icon_url: https://avatars3.githubusercontent.com/u/3380462
 322     title: |-
 323      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 324      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 325        {{" "}}(
 326        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 327          {{- range $index, $label := .SortedPairs -}}
 328            {{ if $index }}, {{ end }}
 329            {{- $label.Name }}="{{ $label.Value -}}"
 330          {{- end }}
 331        {{- end -}}
 332        )
 333      {{- end }}
 334     text: >-
 335      {{ range .Alerts -}}
 336      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 337
 338      *Description:* {{ .Annotations.description }}
 339
 340      *Details:*
 341        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 342        {{ end }}
 343      {{ end }}
 344 EOH
 345       }
 346
 347       # The service stanza instructs Nomad to register a service with Consul.
 348       #
 349       #     https://www.nomadproject.io/docs/job-specification/service
 350       #
 351       service {
 352         name       = "${service_name}"
 353         port       = "${service_name}"
 354         tags       = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
 355         check {
 356           name     = "Alertmanager Check Live"
 357           type     = "http"
 358           path     = "/-/healthy"
 359           interval = "10s"
 360           timeout  = "2s"
 361         }
 362       }
 363
 364       # The "resources" stanza describes the requirements a task needs to
 365       # execute. Resource requirements include memory, network, cpu, and more.
 366       # This ensures the task will execute on a machine that contains enough
 367       # resource capacity.
 368       #
 369       #     https://www.nomadproject.io/docs/job-specification/resources
 370       #
 371       resources {
 372         cpu    = ${cpu}
 373         memory = ${memory}
 374       }
 375     }
 376   }
 377 }