terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region = "global"
   5   #
   6   # The "datacenters" parameter is the list (array of strings) of datacenters
   7   # eligible for placing this job. Required; the rendered name is wrapped in [].
   8   datacenters         = ["${datacenters}"]
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   # For more information, please see the online documentation at:
  16   #
  17   #     https://www.nomadproject.io/docs/jobspec/schedulers
  18   #
  19   type                = "service"
  20
  21   update {
  22     # "max_parallel" caps how many allocations are updated concurrently.
  23     # A value of 1 means allocations are replaced one at a time (rolling
  24     # update).
  25     max_parallel = 1
  26
  27     health_check = "checks"
  28
  29     # "min_healthy_time" is how long an allocation has to stay healthy before
  30     # it is counted as healthy and the next allocations are allowed to
  31     # update.
  32     min_healthy_time = "10s"
  33
  34     # "healthy_deadline" is the time limit for an allocation to be marked
  35     # healthy. When it expires the allocation is automatically transitioned
  36     # to unhealthy. That transition fails the deployment and, if
  37     # "auto_revert" is set to true, can roll the job back to the last
  38     # stable version.
  39     healthy_deadline = "3m"
  40
  41     # "progress_deadline" bounds how long the deployment may go without an
  42     # allocation becoming healthy. The clock starts when the first
  43     # allocation of the deployment is created and restarts every time an
  44     # allocation of the deployment turns healthy. If the deadline passes
  45     # with no allocation turning healthy, the deployment is marked as
  46     # failed.
  47     progress_deadline = "10m"
  48
  49 %{ if use_canary }
  50     # Everything up to the matching endif is rendered only when the
  51     # "use_canary" template variable is true.
  52     #
  53     # "canary" makes destructive job changes start this many canary
  54     # allocations while the previous allocations keep running. After the
  55     # canaries are healthy they can be promoted, which releases the rolling
  56     # update of the remaining allocations at the "max_parallel" rate.
  57     # Setting "canary" equal to the group count gives a blue/green rollout:
  58     # the full new set is deployed and the old one stops on promotion.
  59     canary = 1
  60
  61     # "auto_promote" promotes the canaries automatically once all of them are
  62     # healthy during a deployment. The default is false, in which case the
  63     # promotion has to be done by hand with the "nomad deployment promote"
  64     # command.
  65     auto_promote = true
  66
  67     # "auto_revert" rolls the job back to the last stable version when a
  68     # deployment fails; a job is stable once every allocation of its
  69     # deployment was marked healthy.
  70     auto_revert = true
  71 %{ endif }
  72   }
  73
  74   # The "group" stanza defines a series of tasks that should be co-located on
  75   # the same Nomad client. Any task within a group will be placed on the same
  76   # client.
  77   #
  78   # For more information and examples on the "group" stanza, please see
  79   # the online documentation at:
  80   #
  81   #     https://www.nomadproject.io/docs/job-specification/group
  82   #
  83   group "prod-group1-${service_name}" {
  84     # The "count" parameter specifies the number of the task groups that should
  85     # be running under this group. This value must be non-negative and defaults
  86     # to 1.
  87     count             = ${group_count}
  88
  89     # The constraint allows restricting the set of eligible nodes. Constraints
  90     # may filter on attributes or client metadata.
  91     #
  92     # For more information and examples on the "constraint" stanza, please see
  93     # the online documentation at:
  94     #
  95     #     https://www.nomadproject.io/docs/job-specification/constraint
  96     #
  97     constraint {
  98       attribute = "$${attr.cpu.arch}" # doubled "$" keeps this interpolation for Nomad, not the template renderer
  99       operator  = "!="
 100       value     = "arm64"
 101     }
 102
 103     # The "task" stanza creates an individual unit of work, such as a Docker
 104     # container, web application, or batch processing.
 105     #
 106     # For more information and examples on the "task" stanza, please see
 107     # the online documentation at:
 108     #
 109     #     https://www.nomadproject.io/docs/job-specification/task
 110     #
 111     task "prod-task1-${service_name}" {
 112       # The "driver" parameter specifies the task driver that should be used to
 113       # run the task.
 114       driver          = "exec"
 115
 116       %{ if use_vault_provider }
 117       vault {
 118         policies        = ["${vault_kv_policy_name}"] # Nomad expects a list of policy names
 119       }
 120       %{ endif }
 121
 122       # The "config" stanza specifies the driver configuration, which is passed
 123       # directly to the driver to start the task. The details of configurations
 124       # are specific to each driver, so please see specific driver
 125       # documentation for more information.
 126       config {
 127         # Start the binary from the versioned release directory (presumably the
 128         # unpacked artifact) with the config file written by the template stanza.
 129         command = "local/alertmanager-${version}.linux-amd64/alertmanager"
 130         args    = ["--config.file=secrets/alertmanager.yml"]
 131       }
 132
 133       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 134       # such as a file, tarball, or binary. Nomad downloads artifacts using the
 135       # popular go-getter library, which permits downloading artifacts from a
 136       # variety of locations using a URL as the input source.
 137       #
 138       # For more information and examples on the "artifact" stanza, please see
 139       # the online documentation at:
 140       #
 141       #     https://www.nomadproject.io/docs/job-specification/artifact
 142       #
 143       artifact {
 144         source = "${url}" # download location comes from the "url" template variable
 145       }
 146
 147       # The "template" stanza instructs Nomad to manage a template, such as
 148       # a configuration file or script. This template can optionally pull data
 149       # from Consul or Vault to populate runtime configuration data.
 150       #
 151       # For more information and examples on the "template" stanza, please see
 152       # the online documentation at:
 153       #
 154       #     https://www.nomadproject.io/docs/job-specification/template
 155       #
 156       template {
 157         change_mode     = "noop"
 158         change_signal   = "SIGINT"
 159         destination     = "secrets/alertmanager.yml"
 160         left_delimiter  = "{{{"
 161         right_delimiter = "}}}"
 162         data            = <<EOH
 163 # The directory from which notification templates are read.
 164 templates:
 165 - '/etc/alertmanager/template/*.tmpl'
 166
 167 #tls_config:
 168 #  # CA certificate to validate the server certificate with.
 169 #  ca_file: <filepath>
 170 #
 171 #  # Certificate and key files for client cert authentication to the server.
 172 #  cert_file: <filepath>
 173 #  key_file: <filepath>
 174 #
 175 #  # ServerName extension to indicate the name of the server.
 176 #  # http://tools.ietf.org/html/rfc4366#section-3.1
 177 #  server_name: <string>
 178 #
 179 #  # Disable validation of the server certificate.
 180 #  insecure_skip_verify: true
 181
 182 # The root route on which each incoming alert enters.
 183 route:
 184   receiver: '${slack_default_receiver}'
 185
 186   # The labels by which incoming alerts are grouped together. For example,
 187   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 188   # be batched into a single group.
 189   #
 190   # To aggregate by all possible labels use '...' as the sole label name.
 191   # This effectively disables aggregation entirely, passing through all
 192   # alerts as-is. This is unlikely to be what you want, unless you have
 193   # a very low alert volume or your upstream notification system performs
 194   # its own grouping. Example: group_by: [...]
 195   group_by: ['alertname']
 196
 197   # When a new group of alerts is created by an incoming alert, wait at
 198   # least 'group_wait' to send the initial notification.
 199   # This way ensures that you get multiple alerts for the same group that start
 200   # firing shortly after another are batched together on the first
 201   # notification.
 202   group_wait: 30s
 203
 204   # When the first notification was sent, wait 'group_interval' to send a batch
 205   # of new alerts that started firing for that group.
 206   group_interval: 5m
 207
 208   # If an alert has successfully been sent, wait 'repeat_interval' to
 209   # resend them.
 210   repeat_interval: 3h
 211
 212   # All the above attributes are inherited by all child routes and can
 213   # be overwritten on each.
 214   # The child route trees.
 215   routes:
 216   - match_re:
 217       alertname: JenkinsJob.*
 218     receiver: '${slack_jenkins_receiver}'  # quoted, like every other receiver reference
 219     routes:
 220     - match:
 221         severity: critical
 222       receiver: '${slack_jenkins_receiver}'
 223
 224   - match_re:
 225       service: .*
 226     receiver: '${slack_default_receiver}'  # quoted, like every other receiver reference
 227     routes:
 228     - match:
 229         severity: critical
 230       receiver: '${slack_default_receiver}'
 231
 232 # Inhibition rules allow to mute a set of alerts given that another alert is
 233 # firing.
 234 # We use this to mute any warning-level notifications if the same alert is
 235 # already critical.
 236 inhibit_rules:
 237 - source_match:
 238     severity: 'critical'
 239   target_match:
 240     severity: 'warning'
 241   equal: ['alertname', 'instance']
 242
 243 receivers:
 244 - name: '${slack_jenkins_receiver}'
 245   slack_configs:
 246   - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
 247     channel: '#${slack_jenkins_channel}'
 248     send_resolved: true
 249     icon_url: https://avatars3.githubusercontent.com/u/3380462
 250     title: |-
 251      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 252      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 253        {{" "}}(
 254        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 255          {{- range $index, $label := .SortedPairs -}}
 256            {{ if $index }}, {{ end }}
 257            {{- $label.Name }}="{{ $label.Value -}}"
 258          {{- end }}
 259        {{- end -}}
 260        )
 261      {{- end }}
 262     text: >-
 263      {{ range .Alerts -}}
 264      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 265
 266      *Description:* {{ .Annotations.description }}
 267
 268      *Details:*
 269        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 270        {{ end }}
 271      {{ end }}
 272
 273 - name: '${slack_default_receiver}'
 274   slack_configs:
 275   - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
 276     channel: '#${slack_default_channel}'
 277     send_resolved: true
 278     icon_url: https://avatars3.githubusercontent.com/u/3380462
 279     title: |-
 280      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 281      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 282        {{" "}}(
 283        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 284          {{- range $index, $label := .SortedPairs -}}
 285            {{ if $index }}, {{ end }}
 286            {{- $label.Name }}="{{ $label.Value -}}"
 287          {{- end }}
 288        {{- end -}}
 289        )
 290      {{- end }}
 291     text: >-
 292      {{ range .Alerts -}}
 293      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 294
 295      *Description:* {{ .Annotations.description }}
 296
 297      *Details:*
 298        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 299        {{ end }}
 300      {{ end }}
 301 EOH
 302       }
 303
 304       # The service stanza instructs Nomad to register a service with Consul.
 305       #
 306       # For more information and examples on the "service" stanza, please see
 307       # the online documentation at:
 308       #
 309       #     https://www.nomadproject.io/docs/job-specification/service
 310       #
 311       service {
 312         name = "${service_name}"
 313         port = "${service_name}" # port label; same label as the port declared under resources/network
 314         tags = ["${service_name}$${NOMAD_ALLOC_INDEX}"]
 315         check {
 316           name     = "Alertmanager Check Live"
 317           type     = "http"
 318           path     = "/-/healthy"
 319           interval = "10s"
 320           timeout  = "2s"
 321         }
 322       }
 323
 324       # The "resources" stanza describes the requirements a task needs to
 325       # execute. Resource requirements include memory, network, cpu, and more.
 326       # This ensures the task will execute on a machine that contains enough
 327       # resource capacity.
 328       #
 329       # For more information and examples on the "resources" stanza, please see
 330       # the online documentation at:
 331       #
 332       #     https://www.nomadproject.io/docs/job-specification/resources
 333       #
 334       resources {
 335         cpu    = ${cpu}
 336         memory = ${mem}
 337         # The "network" stanza declares the networking requirements, i.e. the
 338         # network mode and the port allocations. Jobs are scheduled across the
 339         # fleet and the target host is not known in advance, so Nomad hands the
 340         # network configuration to the tasks when they start.
 341         #
 342         # NOTE(review): the stanza is nested in the task "resources" here; current
 343         # Nomad documentation describes "network" at the group level and treats
 344         # the task-level form as deprecated -- confirm against the Nomad version
 345         # in use before moving it.
 346         #
 347         # Documentation: https://www.nomadproject.io/docs/job-specification/network
 348         #
 349         network {
 350           port "${service_name}" {
 351             static = ${port}
 352           }
 353         }
 354       }
 355     }
 356   }
 357 }