terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region = "global"
   5   #
   6   # The "datacenters" parameter is the list of datacenters eligible for placing
   7   # this job; it is required. The rendered template value becomes its one element.
   8   datacenters         = ["${datacenters}"]
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   # For more information, please see the online documentation at:
  16   #
  17   #     https://www.nomadproject.io/docs/jobspec/schedulers
  18   #
  19   type                = "service"
  20
  21   update {
  22     # The "max_parallel" parameter specifies the maximum number of updates to
  23     # perform in parallel. In this case, this specifies to update a single task
  24     # at a time.
  25     max_parallel      = 1
  26     # Per Nomad docs, "checks" judges health by running tasks plus passing checks.
  27     health_check      = "checks"
  28
  29     # The "min_healthy_time" parameter specifies the minimum time the allocation
  30     # must be in the healthy state before it is marked as healthy and unblocks
  31     # further allocations from being updated.
  32     min_healthy_time  = "10s"
  33
  34     # The "healthy_deadline" parameter specifies the deadline in which the
  35     # allocation must be marked as healthy after which the allocation is
  36     # automatically transitioned to unhealthy. Transitioning to unhealthy will
  37     # fail the deployment and potentially roll back the job if "auto_revert" is
  38     # set to true.
  39     healthy_deadline  = "3m"
  40
  41     # The "progress_deadline" parameter specifies the deadline in which an
  42     # allocation must be marked as healthy. The deadline begins when the first
  43     # allocation for the deployment is created and is reset whenever an allocation
  44     # as part of the deployment transitions to a healthy state. If no allocation
  45     # transitions to the healthy state before the progress deadline, the
  46     # deployment is marked as failed.
  47     progress_deadline = "10m"
  48
  49 %{ if use_canary }
  50     # The "canary" parameter specifies that changes to the job that would result
  51     # in destructive updates should create the specified number of canaries
  52     # without stopping any previous allocations. Once the operator determines the
  53     # canaries are healthy, they can be promoted which unblocks a rolling update
  54     # of the remaining allocations at a rate of "max_parallel".
  55     #
  56     # Further, setting "canary" equal to the count of the task group allows
  57     # blue/green deployments. When the job is updated, a full set of the new
  58     # version is deployed and upon promotion the old version is stopped.
  59     canary            = 1
  60
  61     # Specifies if the job should auto-promote to the canary version when all
  62     # canaries become healthy during a deployment. Defaults to false, which means
  63     # canaries must be promoted manually with the "nomad deployment promote"
  64     # command.
  65     auto_promote      = true
  66
  67     # The "auto_revert" parameter specifies if the job should auto-revert to the
  68     # last stable job on deployment failure. NOTE(review): rendered only when
  69     # use_canary is set, since it sits inside that conditional - confirm intent.
  70     auto_revert       = true
  71 %{ endif }
  72   }
  73
  74   # The reschedule stanza specifies the group's rescheduling strategy. If
  75   # specified at the job level, the configuration will apply to all groups
  76   # within the job. If the reschedule stanza is present on both the job and the
  77   # group, they are merged with the group stanza taking the highest precedence
  78   # and then the job.
  79   reschedule {
  80     unlimited         = true
  81     delay_function    = "constant"
  82     delay             = "30s"
  83   }
  84
  85   # The "group" stanza defines a series of tasks that should be co-located on
  86   # the same Nomad client. Any task within a group will be placed on the same
  87   # client.
  88   #
  89   # For more information and examples on the "group" stanza, please see
  90   # the online documentation at:
  91   #
  92   #     https://www.nomadproject.io/docs/job-specification/group
  93   #
  94   group "prod-group1-${service_name}" {
  95     # The "count" parameter specifies the number of the task groups that should
  96     # be running under this group. This value must be non-negative and defaults
  97     # to 1.
  98     count             = ${group_count}
  99
 100     # The restart stanza configures a task's behavior on task failure. Restarts
 101     # happen on the client that is running the task.
 102     #
 103     # https://www.nomadproject.io/docs/job-specification/restart
 104     #
 105     restart {
 106       mode      = "delay"
 107       attempts  = 40
 108       interval  = "30m"
 109       delay     = "15s"
 110     }
 111
 112     # The constraint allows restricting the set of eligible nodes. Constraints
 113     # may filter on attributes or client metadata.
 114     #
 115     # For more information and examples on the "constraint" stanza, please see
 116     # the online documentation at:
 117     #
 118     #     https://www.nomadproject.io/docs/job-specification/constraint
 119     #
 120     constraint {
 121       attribute       = "$${attr.cpu.arch}"
 122       operator        = "="      # the task runs the linux-amd64 build, so
 123       value           = "amd64"  # require amd64 rather than only excluding arm64
 124     }
 125
 126     # The "task" stanza creates an individual unit of work, such as a Docker
 127     # container, web application, or batch processing.
 128     #
 129     # For more information and examples on the "task" stanza, please see
 130     # the online documentation at:
 131     #
 132     #     https://www.nomadproject.io/docs/job-specification/task
 133     #
 134     task "prod-task1-${service_name}" {
 135       # The "driver" parameter specifies the task driver that should be used to
 136       # run the task.
 137       driver          = "exec"
 138
 139       %{ if use_vault_provider }
 140       vault {
 141         policies        = ["${vault_kv_policy_name}"]  # Nomad documents this as a list
 142       }
 143       %{ endif }
 144
 145       # The "config" stanza specifies the driver configuration, which is passed
 146       # directly to the driver to start the task. The details of configurations
 147       # are specific to each driver, so please see specific driver
 148       # documentation for more information.
 149       config {
 150         command       = "local/alertmanager-${version}.linux-amd64/alertmanager"
 151         args          = [  # listen on the same port that Nomad reserves and health-checks
 152           "--config.file=secrets/alertmanager.yml", "--web.listen-address=:${port}"
 153         ]
 154       }
 155
 156       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 157       # such as a file, tarball, or binary. Nomad downloads artifacts using the
 158       # popular go-getter library, which permits downloading artifacts from a
 159       # variety of locations using a URL as the input source.
 160       #
 161       # For more information and examples on the "artifact" stanza, please see
 162       # the online documentation at:
 163       #
 164       #     https://www.nomadproject.io/docs/job-specification/artifact
 165       #
 166       artifact {
 167         source          = "${url}"  # NOTE(review): presumably unpacks to the local/alertmanager-<version>.linux-amd64/ path the command uses - confirm
 168       }
 169
 170       # The "template" stanza instructs Nomad to manage a template, such as
 171       # a configuration file or script. This template can optionally pull data
 172       # from Consul or Vault to populate runtime configuration data.
 173       #
 174       # For more information and examples on the "template" stanza, please see
 175       # the online documentation at:
 176       #
 177       #     https://www.nomadproject.io/docs/job-specification/template
 178       #
 179       template {
 180         change_mode     = "noop"    # a re-render neither restarts nor signals the task
 181         change_signal   = "SIGINT"  # per Nomad docs only used when change_mode is "signal"
 182         destination     = "secrets/alertmanager.yml"
 183         left_delimiter  = "{{{"     # triple braces keep Nomad's renderer away from the
 184         right_delimiter = "}}}"     # double-brace Alertmanager templates in the body
 185         data            = <<EOH
 186 # The directory from which notification templates are read.
 187 templates:
 188 - '/etc/alertmanager/template/*.tmpl'
 189
 190 #tls_config:
 191 #  # CA certificate to validate the server certificate with.
 192 #  ca_file: <filepath>
 193 #
 194 #  # Certificate and key files for client cert authentication to the server.
 195 #  cert_file: <filepath>
 196 #  key_file: <filepath>
 197 #
 198 #  # ServerName extension to indicate the name of the server.
 199 #  # http://tools.ietf.org/html/rfc4366#section-3.1
 200 #  server_name: <string>
 201 #
 202 #  # Disable validation of the server certificate.
 203 #  insecure_skip_verify: true
 204
 205 # The root route on which each incoming alert enters.
 206 route:
 207   receiver: '${slack_default_receiver}'
 208
 209   # The labels by which incoming alerts are grouped together. For example,
 210   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 211   # be batched into a single group.
 212   #
 213   # To aggregate by all possible labels use '...' as the sole label name.
 214   # This effectively disables aggregation entirely, passing through all
 215   # alerts as-is. This is unlikely to be what you want, unless you have
 216   # a very low alert volume or your upstream notification system performs
 217   # its own grouping. Example: group_by: [...]
 218   group_by: ['alertname']
 219
 220   # When a new group of alerts is created by an incoming alert, wait at
 221   # least 'group_wait' to send the initial notification.
 222   # This ensures that multiple alerts for the same group that start
 223   # firing shortly after one another are batched together on the first
 224   # notification.
 225   group_wait: 30s
 226
 227   # When the first notification was sent, wait 'group_interval' to send a batch
 228   # of new alerts that started firing for that group.
 229   group_interval: 5m
 230
 231   # If an alert has successfully been sent, wait 'repeat_interval' to
 232   # resend them.
 233   repeat_interval: 3h
 234
 235   # All the above attributes are inherited by all child routes and can
 236   # be overwritten on each.
 237   # The child route trees.
 238   routes:
 239   - match_re:
 240       alertname: JenkinsJob.*
 241     receiver: '${slack_jenkins_receiver}'
 242     routes:
 243     - match:
 244         severity: critical
 245       receiver: '${slack_jenkins_receiver}'
 246
 247   - match_re:
 248       service: .*
 249     receiver: '${slack_default_receiver}'
 250     routes:
 251     - match:
 252         severity: critical
 253       receiver: '${slack_default_receiver}'
 254
 255 # Inhibition rules allow to mute a set of alerts given that another alert is
 256 # firing.
 257 # We use this to mute any warning-level notifications if the same alert is
 258 # already critical.
 259 inhibit_rules:
 260 - source_match:
 261     severity: 'critical'
 262   target_match:
 263     severity: 'warning'
 264   equal: ['alertname', 'instance']
 265
 266 receivers:
 267 - name: '${slack_jenkins_receiver}'
 268   slack_configs:
 269   - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
 270     channel: '#${slack_jenkins_channel}'
 271     send_resolved: true
 272     icon_url: https://avatars3.githubusercontent.com/u/3380462
 273     title: |-
 274      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 275      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 276        {{" "}}(
 277        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 278          {{- range $index, $label := .SortedPairs -}}
 279            {{ if $index }}, {{ end }}
 280            {{- $label.Name }}="{{ $label.Value -}}"
 281          {{- end }}
 282        {{- end -}}
 283        )
 284      {{- end }}
 285     text: >-
 286      {{ range .Alerts -}}
 287      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 288
 289      *Description:* {{ .Annotations.description }}
 290
 291      *Details:*
 292        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 293        {{ end }}
 294      {{ end }}
 295
 296 - name: '${slack_default_receiver}'
 297   slack_configs:
 298   - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
 299     channel: '#${slack_default_channel}'
 300     send_resolved: true
 301     icon_url: https://avatars3.githubusercontent.com/u/3380462
 302     title: |-
 303      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 304      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 305        {{" "}}(
 306        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 307          {{- range $index, $label := .SortedPairs -}}
 308            {{ if $index }}, {{ end }}
 309            {{- $label.Name }}="{{ $label.Value -}}"
 310          {{- end }}
 311        {{- end -}}
 312        )
 313      {{- end }}
 314     text: >-
 315      {{ range .Alerts -}}
 316      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 317
 318      *Description:* {{ .Annotations.description }}
 319
 320      *Details:*
 321        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 322        {{ end }}
 323      {{ end }}
 324 EOH
 325       }
 326
 327       # The service stanza instructs Nomad to register a service with Consul.
 328       #
 329       # For more information and examples on the "service" stanza, please see
 330       # the online documentation at:
 331       #
 332       #     https://www.nomadproject.io/docs/job-specification/service
 333       #
 334       service {
 335         name            = "${service_name}"
 336         tags            = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
 337         port            = "${service_name}"
 338         check {
 339           name          = "Alertmanager Check Live"
 340           path          = "/-/healthy"
 341           type          = "http"
 342           timeout       = "2s"
 343           interval      = "10s"
 344         }
 345       }
 346
 347       # The "resources" stanza describes the requirements a task needs to
 348       # execute. Resource requirements include memory, network, cpu, and more.
 349       # This ensures the task will execute on a machine that contains enough
 350       # resource capacity.
 351       #
 352       # For more information and examples on the "resources" stanza, please see
 353       # the online documentation at:
 354       #
 355       #     https://www.nomadproject.io/docs/job-specification/resources
 356       #
 357       resources {
 358         cpu             = ${cpu}
 359         memory          = ${mem}
 360         # The network stanza declares the port requirements of this task. Here
 361         # it reserves one static port, labelled with the service name, so that
 362         # the "service" stanza can refer to the port by that same label.
 363         #
 364         # NOTE(review): newer Nomad releases document "network" at the group
 365         # level and deprecate it under "resources" - confirm before upgrading.
 366         #
 367         # For more information and examples on the "network" stanza, please see
 368         # the online documentation at:
 369         #
 370         #     https://www.nomadproject.io/docs/job-specification/network
 371         #
 372         network {
 373           port "${service_name}" {
 374             static      = ${port}
 375           }
 376         }
 377       }
 378     }
 379   }
 380 }