From 0bbb81c4fd1afdee6eb23ba4d49171d8dced6b19 Mon Sep 17 00:00:00 2001 From: Peter Mikus Date: Wed, 9 Feb 2022 09:58:09 +0100 Subject: [PATCH 1/1] feat(terraform): Refactor Alertmanager - prepare for ETL Signed-off-by: Peter Mikus Change-Id: I8931f76f78b5acee39716398b92e4b107d399773 --- .../{alertmanager.hcl => alertmanager.hcl.tftpl} | 163 ++++++++++----------- .../1n_nmd/alertmanager/fdio/main.tf | 14 ++ .../1n_nmd/alertmanager/fdio/providers.tf | 13 ++ .../1n_nmd/alertmanager/fdio/variables.tf | 47 ++++++ .../1n_nmd/alertmanager/fdio/versions.tf | 17 +++ fdio.infra.terraform/1n_nmd/alertmanager/main.tf | 70 +++++---- .../1n_nmd/alertmanager/variables.tf | 131 ++++++++++++----- .../1n_nmd/alertmanager/versions.tf | 10 +- fdio.infra.terraform/1n_nmd/main.tf | 30 ---- 9 files changed, 305 insertions(+), 190 deletions(-) rename fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/{alertmanager.hcl => alertmanager.hcl.tftpl} (77%) create mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf create mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf create mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf create mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl similarity index 77% rename from fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl rename to fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl index ab92761ac2..d1bb8e85cd 100644 --- a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl +++ b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl @@ -1,28 +1,26 @@ job "${job_name}" { # The "region" parameter specifies the region in which to execute the job. # If omitted, this inherits the default region name of "global". 
- # region = "global" - # + # region = "${region}" + # The "datacenters" parameter specifies the list of datacenters which should # be considered when placing this task. This must be provided. - datacenters = "${datacenters}" + datacenters = "${datacenters}" # The "type" parameter controls the type of job, which impacts the scheduler's # decision on placement. This configuration is optional and defaults to # "service". For a full list of job types and their differences, please see # the online documentation. # - # For more information, please see the online documentation at: - # # https://www.nomadproject.io/docs/jobspec/schedulers # - type = "service" + type = "service" update { # The "max_parallel" parameter specifies the maximum number of updates to # perform in parallel. In this case, this specifies to update a single task # at a time. - max_parallel = 1 + max_parallel = ${max_parallel} health_check = "checks" @@ -56,46 +54,51 @@ job "${job_name}" { # Further, setting "canary" equal to the count of the task group allows # blue/green deployments. When the job is updated, a full set of the new # version is deployed and upon promotion the old version is stopped. - canary = 1 + canary = ${canary} # Specifies if the job should auto-promote to the canary version when all # canaries become healthy during a deployment. Defaults to false which means # canaries must be manually updated with the nomad deployment promote # command. - auto_promote = true + auto_promote = ${auto_promote} # The "auto_revert" parameter specifies if the job should auto-revert to the # last stable job on deployment failure. A job is marked as stable if all the # allocations as part of its deployment were marked healthy. - auto_revert = true + auto_revert = ${auto_revert} %{ endif } } - # The reschedule stanza specifies the group's rescheduling strategy. If - # specified at the job level, the configuration will apply to all groups - # within the job. 
If the reschedule stanza is present on both the job and the - # group, they are merged with the group stanza taking the highest precedence - # and then the job. - reschedule { - delay = "30s" - delay_function = "constant" - unlimited = true + # All groups in this job should be scheduled on different hosts. + constraint { + operator = "distinct_hosts" + value = "true" } # The "group" stanza defines a series of tasks that should be co-located on # the same Nomad client. Any task within a group will be placed on the same # client. # - # For more information and examples on the "group" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/group # - group "prod-group1-${service_name}" { + group "${job_name}-group-1" { # The "count" parameter specifies the number of the task groups that should # be running under this group. This value must be non-negative and defaults # to 1. - count = ${group_count} + count = ${group_count} + + # The volume stanza allows the group to specify that it requires a given + # volume from the cluster. The key of the stanza is the name of the volume + # as it will be exposed to task configuration. + # + # https://www.nomadproject.io/docs/job-specification/volume + %{ if use_host_volume } + volume "${job_name}-volume-1" { + type = "host" + read_only = false + source = "${volume_source}" + } + %{ endif } # The restart stanza configures a tasks's behavior on task failure. Restarts # happen on the client that is running the task. @@ -103,57 +106,75 @@ job "${job_name}" { # https://www.nomadproject.io/docs/job-specification/restart # restart { - interval = "30m" - attempts = 40 - delay = "15s" - mode = "delay" + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" } # The constraint allows restricting the set of eligible nodes. Constraints # may filter on attributes or client metadata. 
# - # For more information and examples on the "volume" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/constraint # constraint { - attribute = "$${attr.cpu.arch}" - operator = "!=" - value = "arm64" + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" } constraint { - attribute = "$${node.class}" - value = "builder" + attribute = "$${node.class}" + value = "builder" + } + + # The network stanza specifies the networking requirements for the task + # group, including the network mode and port allocations. When scheduling + # jobs in Nomad they are provisioned across your fleet of machines along + # with other jobs and services. Because you don't know in advance what host + # your job will be provisioned on, Nomad will provide your tasks with + # network configuration when they start up. + # + # https://www.nomadproject.io/docs/job-specification/network + # + network { + port "${service_name}" { + static = ${port} + to = ${port} + } } # The "task" stanza creates an individual unit of work, such as a Docker # container, web application, or batch processing. # - # For more information and examples on the "task" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/task # - task "prod-task1-${service_name}" { + task "${job_name}-task-1" { # The "driver" parameter specifies the task driver that should be used to # run the task. - driver = "exec" + driver = "exec" + + %{ if use_host_volume } + volume_mount { + volume = "${job_name}-volume-1" + destination = "${volume_destination}" + read_only = false + } + %{ endif } - %{ if use_vault_provider } + %{ if use_vault_provider } vault { - policies = "${vault_kv_policy_name}" + policies = "${vault_kv_policy_name}" } - %{ endif } + %{ endif } # The "config" stanza specifies the driver configuration, which is passed # directly to the driver to start the task. 
The details of configurations # are specific to each driver, so please see specific driver # documentation for more information. config { - command = "local/alertmanager-${version}.linux-amd64/alertmanager" - args = [ + command = "local/alertmanager-${version}.linux-amd64/alertmanager" + args = [ "--config.file=secrets/alertmanager.yml" ] } @@ -163,22 +184,16 @@ job "${job_name}" { # popular go-getter library, which permits downloading artifacts from a # variety of locations using a URL as the input source. # - # For more information and examples on the "artifact" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/artifact # artifact { - source = "${url}" + source = "${url}" } # The "template" stanza instructs Nomad to manage a template, such as # a configuration file or script. This template can optionally pull data # from Consul or Vault to populate runtime configuration data. # - # For more information and examples on the "template" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/template # template { @@ -337,15 +352,15 @@ EOH # https://www.nomadproject.io/docs/job-specification/service # service { - name = "${service_name}" - port = "${service_name}" - tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] + name = "${service_name}" + port = "${service_name}" + tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] check { - name = "Alertmanager Check Live" - type = "http" - path = "/-/healthy" - interval = "10s" - timeout = "2s" + name = "Alertmanager Check Live" + type = "http" + path = "/-/healthy" + interval = "10s" + timeout = "2s" } } @@ -354,32 +369,12 @@ EOH # This ensures the task will execute on a machine that contains enough # resource capacity. 
# - # For more information and examples on the "resources" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/resources # resources { - cpu = ${cpu} - memory = ${mem} - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # For more information and examples on the "template" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${service_name}" { - static = ${port} - } - } + cpu = ${cpu} + memory = ${memory} } } } -} \ No newline at end of file +} diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf new file mode 100644 index 0000000000..745e450a8c --- /dev/null +++ b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf @@ -0,0 +1,14 @@ +module "alertmanager" { + providers = { + nomad = nomad.yul1 + } + source = "../" + + # alertmanager + datacenters = ["yul1"] + slack_jenkins_api_key = "TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9" + slack_jenkins_channel = "fdio-jobs-monitoring" + slack_default_api_key = "TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi" + slack_default_channel = "fdio-infra-monitoring" + am_version = "0.23.0" +} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf new file mode 100644 index 0000000000..42a6a45ce0 --- /dev/null +++ b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf @@ -0,0 +1,13 @@ +provider "nomad" { + address = 
var.nomad_provider_address + alias = "yul1" + # ca_file = var.nomad_provider_ca_file + # cert_file = var.nomad_provider_cert_file + # key_file = var.nomad_provider_key_file +} + +provider "vault" { + address = var.vault_provider_address + skip_tls_verify = var.vault_provider_skip_tls_verify + token = var.vault_provider_token +} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf new file mode 100644 index 0000000000..7d5be09d21 --- /dev/null +++ b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf @@ -0,0 +1,47 @@ +variable "nomad_acl" { + description = "Nomad ACLs enabled/disabled." + type = bool + default = false +} + +variable "nomad_provider_address" { + description = "FD.io Nomad cluster address." + type = string + default = "http://10.32.8.14:4646" +} + +variable "nomad_provider_ca_file" { + description = "A local file path to a PEM-encoded certificate authority." + type = string + default = "/etc/nomad.d/ssl/nomad-ca.pem" +} + +variable "nomad_provider_cert_file" { + description = "A local file path to a PEM-encoded certificate." + type = string + default = "/etc/nomad.d/ssl/nomad-cli.pem" +} + +variable "nomad_provider_key_file" { + description = "A local file path to a PEM-encoded private key." + type = string + default = "/etc/nomad.d/ssl/nomad-cli-key.pem" +} + +variable "vault_provider_address" { + description = "Vault cluster address." + type = string + default = "http://10.30.51.28:8200" +} + +variable "vault_provider_skip_tls_verify" { + description = "Verification of the Vault server's TLS certificate." + type = bool + default = false +} + +variable "vault_provider_token" { + description = "Vault root token." 
+ type = string + sensitive = true +} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf new file mode 100644 index 0000000000..385c5c3f18 --- /dev/null +++ b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf @@ -0,0 +1,17 @@ +terraform { + backend "consul" { + address = "10.32.8.14:8500" + scheme = "http" + path = "terraform/alertmanager" + } + required_providers { + nomad = { + source = "hashicorp/nomad" + version = ">= 1.4.16" + } + vault = { + version = ">= 3.2.1" + } + } + required_version = ">= 1.1.4" +} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/main.tf b/fdio.infra.terraform/1n_nmd/alertmanager/main.tf index b7ab5dce92..e8a1389150 100644 --- a/fdio.infra.terraform/1n_nmd/alertmanager/main.tf +++ b/fdio.infra.terraform/1n_nmd/alertmanager/main.tf @@ -1,40 +1,48 @@ locals { - datacenters = join(",", var.nomad_datacenters) - - alertmanager_url = join("", + datacenters = join(",", var.datacenters) + url = join("", [ "https://github.com", "/prometheus/alertmanager/releases/download/", - "v${var.alertmanager_version}/", - "alertmanager-${var.alertmanager_version}.linux-amd64.tar.gz" + "v${var.am_version}/", + "alertmanager-${var.am_version}.linux-amd64.tar.gz" ] ) } -data "template_file" "nomad_job_alertmanager" { - template = file("${path.module}/conf/nomad/alertmanager.hcl") - vars = { - datacenters = local.datacenters - url = local.alertmanager_url - job_name = var.alertmanager_job_name - use_canary = var.alertmanager_use_canary - group_count = var.alertmanager_group_count - service_name = var.alertmanager_service_name - use_vault_provider = var.alertmanager_vault_secret.use_vault_provider - version = var.alertmanager_version - cpu = var.alertmanager_cpu - mem = var.alertmanager_mem - port = var.alertmanager_port - slack_jenkins_api_key = var.alertmanager_slack_jenkins_api_key - slack_jenkins_channel = 
var.alertmanager_slack_jenkins_channel - slack_jenkins_receiver = var.alertmanager_slack_jenkins_receiver - slack_default_api_key = var.alertmanager_slack_default_api_key - slack_default_channel = var.alertmanager_slack_default_channel - slack_default_receiver = var.alertmanager_slack_default_receiver - } -} - resource "nomad_job" "nomad_job_alertmanager" { - jobspec = data.template_file.nomad_job_alertmanager.rendered - detach = false -} \ No newline at end of file + jobspec = templatefile( + "${path.module}/conf/nomad/alertmanager.hcl.tftpl", + { + auto_promote = var.auto_promote, + auto_revert = var.auto_revert, + canary = var.canary, + cpu = var.cpu, + datacenters = local.datacenters, + group_count = var.group_count, + job_name = var.job_name, + max_parallel = var.max_parallel, + memory = var.memory + port = var.port, + region = var.region, + service_name = var.service_name, + slack_jenkins_api_key = var.slack_jenkins_api_key, + slack_jenkins_channel = var.slack_jenkins_channel, + slack_jenkins_receiver = var.slack_jenkins_receiver, + slack_default_api_key = var.slack_default_api_key, + slack_default_channel = var.slack_default_channel, + slack_default_receiver = var.slack_default_receiver, + url = local.url, + use_canary = var.use_canary, + use_host_volume = var.use_host_volume, + use_vault_provider = var.vault_secret.use_vault_provider, + vault_kv_policy_name = var.vault_secret.vault_kv_policy_name, + vault_kv_path = var.vault_secret.vault_kv_path, + vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key, + vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key, + version = var.am_version, + volume_destination = var.volume_destination, + volume_source = var.volume_source + }) + detach = false +} diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/variables.tf b/fdio.infra.terraform/1n_nmd/alertmanager/variables.tf index e24ceb64c6..e452598fa6 100644 --- a/fdio.infra.terraform/1n_nmd/alertmanager/variables.tf +++ 
b/fdio.infra.terraform/1n_nmd/alertmanager/variables.tf @@ -1,43 +1,102 @@ # Nomad -variable "nomad_datacenters" { - description = "Nomad data centers" +variable "datacenters" { + description = "Specifies the list of DCs to be considered placing this task" type = list(string) default = ["dc1"] } -# Alermanager -variable "alertmanager_job_name" { - description = "Job name" +variable "region" { + description = "Specifies the region in which to execute the job" type = string - default = "alertmanager" + default = "global" +} + +variable "volume_source" { + description = "The name of the volume to request" + type = string + default = "persistence" +} + +# Alertmanager +variable "am_version" { + description = "Alertmanager version" + type = string + default = "0.21.0" +} + +variable "auto_promote" { + description = "Specifies if the job should auto-promote to the canary version" + type = bool + default = true +} + +variable "auto_revert" { + description = "Specifies if the job should auto-revert to the last stable job" + type = bool + default = true } -variable "alertmanager_group_count" { - description = "Number of group instances" +variable "canary" { + description = "Equal to the count of the task group allows blue/green depl." 
type = number default = 1 } -variable "alertmanager_service_name" { - description = "Service name" +variable "cpu" { + description = "CPU allocation" + type = number + default = 1000 +} + +variable "group_count" { + description = "Specifies the number of the task groups running under this one" + type = number + default = 1 +} + +variable "job_name" { + description = "Specifies a name for the job" type = string default = "alertmanager" } -variable "alertmanager_version" { - description = "Version" +variable "max_parallel" { + description = "Specifies the maximum number of updates to perform in parallel" + type = number + default = 1 +} + +variable "memory" { + description = "Specifies the memory required in MB" + type = number + default = 1024 +} + +variable "port" { + description = "Specifies the static TCP/UDP port to allocate" + type = number + default = 9093 +} + +variable "service_name" { + description = "Specifies the name this service will be advertised in Consul" type = string - default = "0.21.0" + default = "alertmanager" } -variable "alertmanager_use_canary" { +variable "use_canary" { description = "Uses canary deployment" type = bool + default = true +} + +variable "use_host_volume" { + description = "Use Nomad host volume feature" + type = bool default = false } -variable "alertmanager_vault_secret" { - description = "Set of properties to be able to fetch secret from vault" +variable "vault_secret" { type = object({ use_vault_provider = bool, vault_kv_policy_name = string, @@ -45,57 +104,53 @@ variable "alertmanager_vault_secret" { vault_kv_field_access_key = string, vault_kv_field_secret_key = string }) + description = "Set of properties to be able to fetch secret from vault." 
+ default = { + use_vault_provider = false + vault_kv_policy_name = "kv" + vault_kv_path = "secret/data/alertmanager" + vault_kv_field_access_key = "access_key" + vault_kv_field_secret_key = "secret_key" + } } -variable "alertmanager_cpu" { - description = "CPU allocation" - type = number - default = 1000 -} - -variable "alertmanager_mem" { - description = "RAM allocation" - type = number - default = 1024 -} - -variable "alertmanager_port" { - description = "TCP allocation" - type = number - default = 9093 +variable "volume_destination" { + description = "Specifies where the volume should be mounted inside the task" + type = string + default = "/data/" } -variable "alertmanager_slack_jenkins_api_key" { +variable "slack_jenkins_api_key" { description = "Alertmanager jenkins slack API key" type = string default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" } -variable "alertmanager_slack_jenkins_receiver" { +variable "slack_jenkins_receiver" { description = "Alertmanager jenkins slack receiver" type = string default = "jenkins-slack-receiver" } -variable "alertmanager_slack_jenkins_channel" { +variable "slack_jenkins_channel" { description = "Alertmanager jenkins slack channel" type = string default = "jenkins-channel" } -variable "alertmanager_slack_default_api_key" { +variable "slack_default_api_key" { description = "Alertmanager default slack API key" type = string default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" } -variable "alertmanager_slack_default_receiver" { +variable "slack_default_receiver" { description = "Alertmanager default slack receiver" type = string default = "default-slack-receiver" } -variable "alertmanager_slack_default_channel" { +variable "slack_default_channel" { description = "Alertmanager default slack channel" type = string default = "default-channel" diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf b/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf index b80610a525..5f283ed4ea 100644 --- 
a/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf +++ b/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf @@ -2,12 +2,8 @@ terraform { required_providers { nomad = { source = "hashicorp/nomad" - version = "~> 1.4.15" - } - template = { - source = "hashicorp/template" - version = "~> 2.2.0" + version = ">= 1.4.16" } } - required_version = ">= 1.0.3" -} + required_version = ">= 1.1.4" +} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/main.tf b/fdio.infra.terraform/1n_nmd/main.tf index ed4f2b5ac3..7cdd245b7a 100644 --- a/fdio.infra.terraform/1n_nmd/main.tf +++ b/fdio.infra.terraform/1n_nmd/main.tf @@ -4,36 +4,6 @@ # and downstream modules can simply declare resources for that provider # and have them automatically associated with the root provider # configurations. -module "alertmanager" { - source = "./alertmanager" - providers = { - nomad = nomad.yul1 - } - - # nomad - nomad_datacenters = ["yul1"] - - # alertmanager - alertmanager_job_name = "prod-alertmanager" - alertmanager_use_canary = true - alertmanager_group_count = 1 - alertmanager_vault_secret = { - use_vault_provider = false, - vault_kv_policy_name = "kv-secret", - vault_kv_path = "secret/data/prometheus", - vault_kv_field_access_key = "access_key", - vault_kv_field_secret_key = "secret_key" - } - alertmanager_version = "0.21.0" - alertmanager_cpu = 1000 - alertmanager_mem = 1024 - alertmanager_port = 9093 - alertmanager_slack_jenkins_api_key = "TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9" - alertmanager_slack_jenkins_channel = "fdio-jobs-monitoring" - alertmanager_slack_default_api_key = "TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi" - alertmanager_slack_default_channel = "fdio-infra-monitoring" -} - module "grafana" { source = "./grafana" providers = { -- 2.16.6