fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl

   1 job "${job_name}" {
   2   # The "region" parameter specifies the region in which to execute the job.
   3   # If omitted, this inherits the default region name of "global".
   4   # region = "global"
   5   #
   6   # "datacenters" lists the datacenters eligible for placement and must be
   7   # provided. Nomad expects a list of strings, hence the surrounding [ ].
   8   datacenters         = ["${datacenters}"]
   9
  10   # The "type" parameter controls the type of job, which impacts the scheduler's
  11   # decision on placement. This configuration is optional and defaults to
  12   # "service". For a full list of job types and their differences, please see
  13   # the online documentation.
  14   #
  15   # For more information, please see the online documentation at:
  16   #
  17   #     https://www.nomadproject.io/docs/jobspec/schedulers
  18   #
  19   type                = "service"
  20
  21   update {
  22     # "max_parallel" caps how many allocations are updated at the same time.
  23     # With a value of 1 the rolling update replaces one allocation after the
  24     # other.
  25     max_parallel = 1
  26     # "checks": health is judged from task state and the registered checks.
  27     health_check = "checks"
  28
  29     # "min_healthy_time" is how long an allocation has to stay healthy before
  30     # it is marked as healthy, which in turn unblocks the update of further
  31     # allocations.
  32     min_healthy_time = "10s"
  33
  34     # "healthy_deadline" is the time limit for an allocation to be marked as
  35     # healthy. Once it has passed, the allocation is automatically
  36     # transitioned to unhealthy. That transition fails the deployment and, if
  37     # "auto_revert" is set to true, can potentially roll the job back to the
  38     # last stable version.
  39     healthy_deadline = "3m"
  40
  41     # "progress_deadline" is the time limit within which an allocation must be
  42     # marked as healthy. The clock starts when the first allocation of the
  43     # deployment is created and is reset every time an allocation belonging to
  44     # the deployment becomes healthy. If no allocation turns healthy before the
  45     # deadline is reached, the whole deployment is marked as
  46     # failed.
  47     progress_deadline = "10m"
  48
  49 %{ if use_canary }
  50     # "canary" makes job changes that would cause destructive updates create
  51     # the given number of canary allocations first, without stopping any of
  52     # the previous allocations. After the canaries are judged healthy they can
  53     # be promoted, which unblocks a rolling update of the remaining
  54     # allocations at a rate of "max_parallel".
  55     #
  56     # Setting "canary" equal to the task group count gives blue/green
  57     # deployments: a full set of the new version is deployed on update and the
  58     # old version is stopped upon promotion.
  59     canary = 1
  60
  61     # "auto_promote" decides whether the job is promoted to the canary version
  62     # automatically once all canaries are healthy during a deployment. The
  63     # default is false, in which case canaries have to be promoted manually
  64     # with the "nomad deployment promote" command.
  65     auto_promote = true
  66
  67     # "auto_revert" makes the job go back to the last stable job when a
  68     # deployment fails. A job counts as stable if all allocations of its
  69     # deployment were marked healthy.
  70     auto_revert = true
  71 %{ endif }
  72   }
  73
  74   # The reschedule stanza sets the rescheduling strategy. Declared at the job
  75   # level, as it is here, the configuration applies to all groups within the
  76   # job. When both the job and a group carry a reschedule stanza, the two
  77   # are merged and the group stanza takes the highest precedence, followed
  78   # by the job.
  79   reschedule {
  80     unlimited = true
  81     delay_function = "constant"
  82     delay = "30s"
  83   }
  84
  85   # The "group" stanza defines a series of tasks that should be co-located on
  86   # the same Nomad client. Any task within a group will be placed on the same
  87   # client.
  88   #
  89   # For more information and examples on the "group" stanza, please see
  90   # the online documentation at:
  91   #
  92   #     https://www.nomadproject.io/docs/job-specification/group
  93   #
  94   group "prod-group1-${service_name}" {
  95     # "count" sets how many instances of this task group should be running.
  96     # The value must be non-negative; when the parameter is omitted it
  97     # defaults to 1.
  98     count = ${group_count}
  99
 100     # The restart stanza configures a tasks's behavior on task failure. Restarts
 101     # happen on the client that is running the task.
 102     #
 103     # https://www.nomadproject.io/docs/job-specification/restart
 104     #
 105     restart {
 106       interval  = "30m"
 107       attempts  = 40
 108       delay     = "15s"
 109       mode      = "delay"
 110     }
 111
 112     # The constraint allows restricting the set of eligible nodes. Constraints
 113     # may filter on attributes or client metadata.
 114     #
 115     # For more information and examples on the "volume" stanza, please see
 116     # the online documentation at:
 117     #
 118     #     https://www.nomadproject.io/docs/job-specification/constraint
 119     #
 120     constraint {
 121       attribute       = "$${attr.cpu.arch}"
 122       operator        = "!="
 123       value           = "arm64"
 124     }
 125
 126     constraint {
 127       attribute      = "$${node.class}"
 128       value          = "builder"
 129     }
 130
 131     # The "task" stanza creates an individual unit of work, such as a Docker
 132     # container, web application, or batch processing.
 133     #
 134     # For more information and examples on the "task" stanza, please see
 135     # the online documentation at:
 136     #
 137     #     https://www.nomadproject.io/docs/job-specification/task
 138     #
 139     task "prod-task1-${service_name}" {
 140       # The "driver" parameter specifies the task driver that should be used to
 141       # run the task.
 142       driver          = "exec"
 143
 144       %{ if use_vault_provider }
 145       vault {
 146         policies        = "${vault_kv_policy_name}"
 147       }
 148       %{ endif }
 149
 150       # The "config" stanza specifies the driver configuration, which is passed
 151       # directly to the driver to start the task. The details of configurations
 152       # are specific to each driver, so please see specific driver
 153       # documentation for more information.
 154       config {
 155         command       = "local/alertmanager-${version}.linux-amd64/alertmanager"
 156         args          = [
 157           "--config.file=secrets/alertmanager.yml"
 158         ]
 159       }
 160
 161       # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
 162       # such as a file, tarball, or binary. Nomad downloads artifacts using the
 163       # popular go-getter library, which permits downloading artifacts from a
 164       # variety of locations using a URL as the input source.
 165       #
 166       # For more information and examples on the "artifact" stanza, please see
 167       # the online documentation at:
 168       #
 169       #     https://www.nomadproject.io/docs/job-specification/artifact
 170       #
 171       artifact {
 172         source          = "${url}"
 173       }
 174
 175       # The "template" stanza instructs Nomad to manage a template, such as
 176       # a configuration file or script. This template can optionally pull data
 177       # from Consul or Vault to populate runtime configuration data.
 178       #
 179       # For more information and examples on the "template" stanza, please see
 180       # the online documentation at:
 181       #
 182       #     https://www.nomadproject.io/docs/job-specification/template
 183       #
 184       template {
 185         change_mode     = "noop"
 186         change_signal   = "SIGINT"
 187         destination     = "secrets/alertmanager.yml"
 188         left_delimiter  = "{{{"
 189         right_delimiter = "}}}"
 190         data            = <<EOH
 191 # The directory from which notification templates are read.
 192 templates:
 193 - '/etc/alertmanager/template/*.tmpl'
 194
 195 #tls_config:
 196 #  # CA certificate to validate the server certificate with.
 197 #  ca_file: <filepath> ]
 198 #
 199 #  # Certificate and key files for client cert authentication to the server.
 200 #  cert_file: <filepath>
 201 #  key_file: <filepath>
 202 #
 203 #  # ServerName extension to indicate the name of the server.
 204 #  # http://tools.ietf.org/html/rfc4366#section-3.1
 205 #  server_name: <string>
 206 #
 207 #  # Disable validation of the server certificate.
 208 #  insecure_skip_verify: true
 209
 210 # The root route on which each incoming alert enters.
 211 route:
 212   receiver: '${slack_default_receiver}'
 213
 214   # The labels by which incoming alerts are grouped together. For example,
 215   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
 216   # be batched into a single group.
 217   #
 218   # To aggregate by all possible labels use '...' as the sole label name.
 219   # This effectively disables aggregation entirely, passing through all
 220   # alerts as-is. This is unlikely to be what you want, unless you have
 221   # a very low alert volume or your upstream notification system performs
 222   # its own grouping. Example: group_by: [...]
 223   group_by: ['alertname']
 224
 225   # When a new group of alerts is created by an incoming alert, wait at
 226   # least 'group_wait' to send the initial notification.
 227   # This way ensures that you get multiple alerts for the same group that start
 228   # firing shortly after another are batched together on the first
 229   # notification.
 230   group_wait: 30s
 231
 232   # When the first notification was sent, wait 'group_interval' to send a batch
 233   # of new alerts that started firing for that group.
 234   group_interval: 5m
 235
 236   # If an alert has successfully been sent, wait 'repeat_interval' to
 237   # resend them.
 238   repeat_interval: 3h
 239
 240   # All the above attributes are inherited by all child routes and can
 241   # overwritten on each.
 242   # The child route trees.
 243   routes:
 244   - match_re:
 245       alertname: JenkinsJob.*
 246     receiver: ${slack_jenkins_receiver}
 247     routes:
 248     - match:
 249         severity: critical
 250       receiver: '${slack_jenkins_receiver}'
 251
 252   - match_re:
 253       service: .*
 254     receiver: ${slack_default_receiver}
 255     routes:
 256     - match:
 257         severity: critical
 258       receiver: '${slack_default_receiver}'
 259
 260 # Inhibition rules allow to mute a set of alerts given that another alert is
 261 # firing.
 262 # We use this to mute any warning-level notifications if the same alert is
 263 # already critical.
 264 inhibit_rules:
 265 - source_match:
 266     severity: 'critical'
 267   target_match:
 268     severity: 'warning'
 269   equal: ['alertname', 'instance']
 270
 271 receivers:
 272 - name: '${slack_jenkins_receiver}'
 273   slack_configs:
 274   - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
 275     channel: '#${slack_jenkins_channel}'
 276     send_resolved: true
 277     icon_url: https://avatars3.githubusercontent.com/u/3380462
 278     title: |-
 279      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 280      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 281        {{" "}}(
 282        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 283          {{- range $index, $label := .SortedPairs -}}
 284            {{ if $index }}, {{ end }}
 285            {{- $label.Name }}="{{ $label.Value -}}"
 286          {{- end }}
 287        {{- end -}}
 288        )
 289      {{- end }}
 290     text: >-
 291      {{ range .Alerts -}}
 292      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 293
 294      *Description:* {{ .Annotations.description }}
 295
 296      *Details:*
 297        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 298        {{ end }}
 299      {{ end }}
 300
 301 - name: '${slack_default_receiver}'
 302   slack_configs:
 303   - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
 304     channel: '#${slack_default_channel}'
 305     send_resolved: true
 306     icon_url: https://avatars3.githubusercontent.com/u/3380462
 307     title: |-
 308      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
 309      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
 310        {{" "}}(
 311        {{- with .CommonLabels.Remove .GroupLabels.Names }}
 312          {{- range $index, $label := .SortedPairs -}}
 313            {{ if $index }}, {{ end }}
 314            {{- $label.Name }}="{{ $label.Value -}}"
 315          {{- end }}
 316        {{- end -}}
 317        )
 318      {{- end }}
 319     text: >-
 320      {{ range .Alerts -}}
 321      *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
 322
 323      *Description:* {{ .Annotations.description }}
 324
 325      *Details:*
 326        {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
 327        {{ end }}
 328      {{ end }}
 329 EOH
 330       }
 331
 332       # The service stanza instructs Nomad to register a service with Consul.
 333       #
 334       # For more information and examples on the "task" stanza, please see
 335       # the online documentation at:
 336       #
 337       #     https://www.nomadproject.io/docs/job-specification/service
 338       #
 339       service {
 340         name            = "${service_name}"
 341         port            = "${service_name}"
 342         tags            = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
 343         check {
 344           name          = "Alertmanager Check Live"
 345           type          = "http"
 346           path          = "/-/healthy"
 347           interval      = "10s"
 348           timeout       = "2s"
 349         }
 350       }
 351
 352       # The "resources" stanza describes the requirements a task needs to
 353       # execute. Resource requirements include memory, network, cpu, and more.
 354       # This ensures the task will execute on a machine that contains enough
 355       # resource capacity.
 356       #
 357       # For more information and examples on the "resources" stanza, please see
 358       # the online documentation at:
 359       #
 360       #     https://www.nomadproject.io/docs/job-specification/resources
 361       #
 362       resources {
 363         cpu             = ${cpu}
 364         memory          = ${mem}
 365         # The network stanza specifies the networking requirements for the task
 366         # group, including the network mode and port allocations. When scheduling
 367         # jobs in Nomad they are provisioned across your fleet of machines along
 368         # with other jobs and services. Because you don't know in advance what host
 369         # your job will be provisioned on, Nomad will provide your tasks with
 370         # network configuration when they start up.
 371         #
 372         # For more information and examples on the "template" stanza, please see
 373         # the online documentation at:
 374         #
 375         #     https://www.nomadproject.io/docs/job-specification/network
 376         #
 377         network {
 378           port "${service_name}" {
 379             static      = ${port}
 380           }
 381         }
 382       }
 383     }
 384   }
 385 }