X-Git-Url: https://gerrit.fd.io/r/gitweb?p=csit.git;a=blobdiff_plain;f=fdio.infra.terraform%2F1n_nmd%2Falertmanager%2Fconf%2Fnomad%2Falertmanager.hcl;fp=fdio.infra.terraform%2F1n_nmd%2Falertmanager%2Fconf%2Fnomad%2Falertmanager.hcl;h=0000000000000000000000000000000000000000;hp=ab92761ac2612c29e922f4574d24ded6fd363cb9;hb=0bbb81c4fd1afdee6eb23ba4d49171d8dced6b19;hpb=0576c4293e05b1eded486b48a217495451a8b685 diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl deleted file mode 100644 index ab92761ac2..0000000000 --- a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl +++ /dev/null @@ -1,385 +0,0 @@ -job "${job_name}" { - # The "region" parameter specifies the region in which to execute the job. - # If omitted, this inherits the default region name of "global". - # region = "global" - # - # The "datacenters" parameter specifies the list of datacenters which should - # be considered when placing this task. This must be provided. - datacenters = "${datacenters}" - - # The "type" parameter controls the type of job, which impacts the scheduler's - # decision on placement. This configuration is optional and defaults to - # "service". For a full list of job types and their differences, please see - # the online documentation. - # - # For more information, please see the online documentation at: - # - # https://www.nomadproject.io/docs/jobspec/schedulers - # - type = "service" - - update { - # The "max_parallel" parameter specifies the maximum number of updates to - # perform in parallel. In this case, this specifies to update a single task - # at a time. - max_parallel = 1 - - health_check = "checks" - - # The "min_healthy_time" parameter specifies the minimum time the allocation - # must be in the healthy state before it is marked as healthy and unblocks - # further allocations from being updated. - min_healthy_time = "10s" - - # The "healthy_deadline" parameter specifies the deadline in which the - # allocation must be marked as healthy after which the allocation is - # automatically transitioned to unhealthy. Transitioning to unhealthy will - # fail the deployment and potentially roll back the job if "auto_revert" is - # set to true. - healthy_deadline = "3m" - - # The "progress_deadline" parameter specifies the deadline in which an - # allocation must be marked as healthy. The deadline begins when the first - # allocation for the deployment is created and is reset whenever an allocation - # as part of the deployment transitions to a healthy state. If no allocation - # transitions to the healthy state before the progress deadline, the - # deployment is marked as failed. - progress_deadline = "10m" - -%{ if use_canary } - # The "canary" parameter specifies that changes to the job that would result - # in destructive updates should create the specified number of canaries - # without stopping any previous allocations. Once the operator determines the - # canaries are healthy, they can be promoted which unblocks a rolling update - # of the remaining allocations at a rate of "max_parallel". - # - # Further, setting "canary" equal to the count of the task group allows - # blue/green deployments. When the job is updated, a full set of the new - # version is deployed and upon promotion the old version is stopped. - canary = 1 - - # Specifies if the job should auto-promote to the canary version when all - # canaries become healthy during a deployment. Defaults to false which means - # canaries must be manually updated with the nomad deployment promote - # command. - auto_promote = true - - # The "auto_revert" parameter specifies if the job should auto-revert to the - # last stable job on deployment failure. A job is marked as stable if all the - # allocations as part of its deployment were marked healthy. - auto_revert = true -%{ endif } - } - - # The reschedule stanza specifies the group's rescheduling strategy. If - # specified at the job level, the configuration will apply to all groups - # within the job. If the reschedule stanza is present on both the job and the - # group, they are merged with the group stanza taking the highest precedence - # and then the job. - reschedule { - delay = "30s" - delay_function = "constant" - unlimited = true - } - - # The "group" stanza defines a series of tasks that should be co-located on - # the same Nomad client. Any task within a group will be placed on the same - # client. - # - # For more information and examples on the "group" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/group - # - group "prod-group1-${service_name}" { - # The "count" parameter specifies the number of the task groups that should - # be running under this group. This value must be non-negative and defaults - # to 1. - count = ${group_count} - - # The restart stanza configures a tasks's behavior on task failure. Restarts - # happen on the client that is running the task. - # - # https://www.nomadproject.io/docs/job-specification/restart - # - restart { - interval = "30m" - attempts = 40 - delay = "15s" - mode = "delay" - } - - # The constraint allows restricting the set of eligible nodes. Constraints - # may filter on attributes or client metadata. - # - # For more information and examples on the "volume" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/constraint - # - constraint { - attribute = "$${attr.cpu.arch}" - operator = "!=" - value = "arm64" - } - - constraint { - attribute = "$${node.class}" - value = "builder" - } - - # The "task" stanza creates an individual unit of work, such as a Docker - # container, web application, or batch processing. - # - # For more information and examples on the "task" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/task - # - task "prod-task1-${service_name}" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "exec" - - %{ if use_vault_provider } - vault { - policies = "${vault_kv_policy_name}" - } - %{ endif } - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/alertmanager-${version}.linux-amd64/alertmanager" - args = [ - "--config.file=secrets/alertmanager.yml" - ] - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # For more information and examples on the "artifact" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${url}" - } - - # The "template" stanza instructs Nomad to manage a template, such as - # a configuration file or script. This template can optionally pull data - # from Consul or Vault to populate runtime configuration data. - # - # For more information and examples on the "template" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/template - # - template { - change_mode = "noop" - change_signal = "SIGINT" - destination = "secrets/alertmanager.yml" - left_delimiter = "{{{" - right_delimiter = "}}}" - data = < ] -# -# # Certificate and key files for client cert authentication to the server. -# cert_file: -# key_file: -# -# # ServerName extension to indicate the name of the server. -# # http://tools.ietf.org/html/rfc4366#section-3.1 -# server_name: -# -# # Disable validation of the server certificate. -# insecure_skip_verify: true - -# The root route on which each incoming alert enters. -route: - receiver: '${slack_default_receiver}' - - # The labels by which incoming alerts are grouped together. For example, - # multiple alerts coming in for cluster=A and alertname=LatencyHigh would - # be batched into a single group. - # - # To aggregate by all possible labels use '...' as the sole label name. - # This effectively disables aggregation entirely, passing through all - # alerts as-is. This is unlikely to be what you want, unless you have - # a very low alert volume or your upstream notification system performs - # its own grouping. Example: group_by: [...] - group_by: ['alertname'] - - # When a new group of alerts is created by an incoming alert, wait at - # least 'group_wait' to send the initial notification. - # This way ensures that you get multiple alerts for the same group that start - # firing shortly after another are batched together on the first - # notification. - group_wait: 30s - - # When the first notification was sent, wait 'group_interval' to send a batch - # of new alerts that started firing for that group. - group_interval: 5m - - # If an alert has successfully been sent, wait 'repeat_interval' to - # resend them. - repeat_interval: 3h - - # All the above attributes are inherited by all child routes and can - # overwritten on each. - # The child route trees. - routes: - - match_re: - alertname: JenkinsJob.* - receiver: ${slack_jenkins_receiver} - routes: - - match: - severity: critical - receiver: '${slack_jenkins_receiver}' - - - match_re: - service: .* - receiver: ${slack_default_receiver} - routes: - - match: - severity: critical - receiver: '${slack_default_receiver}' - -# Inhibition rules allow to mute a set of alerts given that another alert is -# firing. -# We use this to mute any warning-level notifications if the same alert is -# already critical. -inhibit_rules: -- source_match: - severity: 'critical' - target_match: - severity: 'warning' - equal: ['alertname', 'instance'] - -receivers: -- name: '${slack_jenkins_receiver}' - slack_configs: - - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}' - channel: '#${slack_jenkins_channel}' - send_resolved: true - icon_url: https://avatars3.githubusercontent.com/u/3380462 - title: |- - [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} - {{- if gt (len .CommonLabels) (len .GroupLabels) -}} - {{" "}}( - {{- with .CommonLabels.Remove .GroupLabels.Names }} - {{- range $index, $label := .SortedPairs -}} - {{ if $index }}, {{ end }} - {{- $label.Name }}="{{ $label.Value -}}" - {{- end }} - {{- end -}} - ) - {{- end }} - text: >- - {{ range .Alerts -}} - *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} - - *Description:* {{ .Annotations.description }} - - *Details:* - {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - {{ end }} - {{ end }} - -- name: '${slack_default_receiver}' - slack_configs: - - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}' - channel: '#${slack_default_channel}' - send_resolved: true - icon_url: https://avatars3.githubusercontent.com/u/3380462 - title: |- - [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} - {{- if gt (len .CommonLabels) (len .GroupLabels) -}} - {{" "}}( - {{- with .CommonLabels.Remove .GroupLabels.Names }} - {{- range $index, $label := .SortedPairs -}} - {{ if $index }}, {{ end }} - {{- $label.Name }}="{{ $label.Value -}}" - {{- end }} - {{- end -}} - ) - {{- end }} - text: >- - {{ range .Alerts -}} - *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} - - *Description:* {{ .Annotations.description }} - - *Details:* - {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - {{ end }} - {{ end }} -EOH - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # For more information and examples on the "task" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${service_name}" - port = "${service_name}" - tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] - check { - name = "Alertmanager Check Live" - type = "http" - path = "/-/healthy" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # For more information and examples on the "resources" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = ${cpu} - memory = ${mem} - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # For more information and examples on the "template" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${service_name}" { - static = ${port} - } - } - } - } - } -} \ No newline at end of file