From 9481aad815189d6251d36c11e3f901f9179dab40 Mon Sep 17 00:00:00 2001 From: pmikus Date: Sat, 6 Mar 2021 21:03:37 +0000 Subject: [PATCH] Infra: Add reschedule policy Add reschedule policy to jobs so in case of failure they will respawn in an endless loop Signed-off-by: pmikus Change-Id: I15698d9e147644e68bec549fc53474d421b25d9a --- .../alertmanager/conf/nomad/alertmanager.hcl | 23 +++++++ .../1n_nmd/grafana/conf/nomad/grafana.hcl | 24 ++++++- .../1n_nmd/minio/conf/nomad/minio.hcl | 49 ++++++-------- .../1n_nmd/nginx/conf/nomad/nginx.hcl | 21 +++--- .../1n_nmd/prometheus/conf/nomad/prometheus.hcl | 23 +++++++ terraform-ci-infra/1n_nmd/terraform.tfstate | 78 ++++++++++++---------- terraform-ci-infra/1n_nmd/terraform.tfstate.backup | 47 +++++++------ .../1n_nmd/tools/artifacts_download.py | 47 +++++++++++++ 8 files changed, 217 insertions(+), 95 deletions(-) create mode 100755 terraform-ci-infra/1n_nmd/tools/artifacts_download.py diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl index 40d84e337a..6b0d669d0e 100644 --- a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl +++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl @@ -71,6 +71,17 @@ job "${job_name}" { %{ endif } } + # The reschedule stanza specifies the group's rescheduling strategy. If + # specified at the job level, the configuration will apply to all groups + # within the job. If the reschedule stanza is present on both the job and the + # group, they are merged with the group stanza taking the highest precedence + # and then the job. + reschedule { + delay = "30s" + delay_function = "constant" + unlimited = true + } + # The "group" stanza defines a series of tasks that should be co-located on # the same Nomad client. Any task within a group will be placed on the same # client. @@ -86,6 +97,18 @@ job "${job_name}" { # to 1. 
count = ${group_count} + # The restart stanza configures a tasks's behavior on task failure. Restarts + # happen on the client that is running the task. + # + # https://www.nomadproject.io/docs/job-specification/restart + # + restart { + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" + } + # The constraint allows restricting the set of eligible nodes. Constraints # may filter on attributes or client metadata. # diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl index 7325c6aef4..a759abc4f7 100644 --- a/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl +++ b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl @@ -71,6 +71,17 @@ job "${job_name}" { %{ endif } } + # The reschedule stanza specifies the group's rescheduling strategy. If + # specified at the job level, the configuration will apply to all groups + # within the job. If the reschedule stanza is present on both the job and the + # group, they are merged with the group stanza taking the highest precedence + # and then the job. + reschedule { + delay = "30s" + delay_function = "constant" + unlimited = true + } + # The "group" stanza defines a series of tasks that should be co-located on # the same Nomad client. Any task within a group will be placed on the same # client. @@ -86,6 +97,17 @@ job "${job_name}" { # to 1. count = ${group_count} + # The restart stanza configures a tasks's behavior on task failure. Restarts + # happen on the client that is running the task. + # + # https://www.nomadproject.io/docs/job-specification/restart + # + restart { + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" + } # The constraint allows restricting the set of eligible nodes. Constraints # may filter on attributes or client metadata. @@ -120,7 +142,7 @@ job "${job_name}" { # documentation for more information. 
config { image = "${image}" - dns_servers = [ "$${attr.unique.network.ip-address}" ] + dns_servers = [ "172.17.0.1" ] volumes = [ "secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml", "secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml", diff --git a/terraform-ci-infra/1n_nmd/minio/conf/nomad/minio.hcl b/terraform-ci-infra/1n_nmd/minio/conf/nomad/minio.hcl index a2df44f666..3889b51a9f 100644 --- a/terraform-ci-infra/1n_nmd/minio/conf/nomad/minio.hcl +++ b/terraform-ci-infra/1n_nmd/minio/conf/nomad/minio.hcl @@ -12,9 +12,7 @@ job "${job_name}" { # "service". For a full list of job types and their differences, please see # the online documentation. # - # For more information, please see the online documentation at: - # - # https://www.nomadproject.io/docs/jobspec/schedulers.html + # https://www.nomadproject.io/docs/jobspec/schedulers # type = "service" @@ -81,32 +79,38 @@ job "${job_name}" { # the same Nomad client. Any task within a group will be placed on the same # client. # - # For more information and examples on the "group" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/group.html + # https://www.nomadproject.io/docs/job-specification/group # group "prod-group1-minio" { # The "count" parameter specifies the number of the task groups that should # be running under this group. This value must be non-negative and defaults # to 1. - count = ${group_count} + count = ${group_count} # https://www.nomadproject.io/docs/job-specification/volume %{ if use_host_volume } volume "prod-volume1-minio" { - type = "host" - read_only = false - source = "${host_volume}" + type = "host" + read_only = false + source = "${host_volume}" } %{ endif } + # The restart stanza configures a tasks's behavior on task failure. Restarts + # happen on the client that is running the task. 
+ # + # https://www.nomadproject.io/docs/job-specification/restart + # + restart { + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" + } + # The "task" stanza creates an individual unit of work, such as a Docker # container, web application, or batch processing. # - # For more information and examples on the "task" stanza, please see - # the online documentation at: - # # https://www.nomadproject.io/docs/job-specification/task.html # task "prod-task1-minio" { @@ -134,7 +138,7 @@ job "${job_name}" { # documentation for more information. config { image = "${image}" - dns_servers = [ "$${attr.unique.network.ip-address}" ] + dns_servers = [ "172.17.0.1" ] network_mode = "host" command = "server" args = [ "${host}:${port}${data_dir}" ] @@ -161,10 +165,7 @@ job "${job_name}" { # The service stanza instructs Nomad to register a service with Consul. # - # For more information and examples on the "task" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/service.html + # https://www.nomadproject.io/docs/job-specification/service # service { name = "${service_name}" @@ -197,10 +198,7 @@ job "${job_name}" { # This ensures the task will execute on a machine that contains enough # resource capacity. # - # For more information and examples on the "resources" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/resources.html + # https://www.nomadproject.io/docs/job-specification/resources # resources { cpu = ${cpu} @@ -212,10 +210,7 @@ job "${job_name}" { # your job will be provisioned on, Nomad will provide your tasks with # network configuration when they start up. 
# - # For more information and examples on the "template" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/network.html + # https://www.nomadproject.io/docs/job-specification/network # network { port "http" { diff --git a/terraform-ci-infra/1n_nmd/nginx/conf/nomad/nginx.hcl b/terraform-ci-infra/1n_nmd/nginx/conf/nomad/nginx.hcl index 3c4761c567..0775a498da 100644 --- a/terraform-ci-infra/1n_nmd/nginx/conf/nomad/nginx.hcl +++ b/terraform-ci-infra/1n_nmd/nginx/conf/nomad/nginx.hcl @@ -98,11 +98,14 @@ job "${job_name}" { # The restart stanza configures a tasks's behavior on task failure. Restarts # happen on the client that is running the task. + # + # https://www.nomadproject.io/docs/job-specification/restart + # restart { - interval = "10m" - attempts = 2 + interval = "30m" + attempts = 40 delay = "15s" - mode = "fail" + mode = "delay" } # The "task" stanza creates an individual unit of work, such as a Docker @@ -124,7 +127,6 @@ job "${job_name}" { # documentation for more information. config { image = "nginx:stable" - dns_servers = [ "$${attr.unique.network.ip-address}" ] port_map { https = 443 } @@ -150,10 +152,9 @@ job "${job_name}" { template { data = <