restarting server and client results in failing Consul health checks #16453

Closed
suikast42 opened this issue Mar 13, 2023 · 3 comments

Comments

@suikast42
Contributor

suikast42 commented Mar 13, 2023

Nomad version

Nomad v1.5.1
BuildDate 2023-03-10T22:05:57Z
Revision 6c118dd

Consul version

Consul v1.15.1
Revision 7c04b6a0
Build Date 2023-03-07T20:35:33Z

Operating system and Environment details

Ubuntu 22.04 in VMware Workstation
Single-node master
Single-node worker

Issue

If I deploy the job below for the first time, everything works fine.
If I reboot only the worker machine, it also works fine.

But if I (re)boot the master and the worker at the same time, the Nomad UI shows a healthy deployment while Consul shows a failing health check.

A restart of the worker deploys the minio job as expected again.
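
For reference, a rough sketch of the reproduction as shell commands (the job file name and SSH access are assumptions; master-01 and worker-01 are the hostnames from the logs below):

# Deploy the job (file name is hypothetical; the spec is listed below).
nomad job run minio.nomad.hcl

# Reboot server and client at roughly the same time.
ssh master-01 'sudo reboot'
ssh worker-01 'sudo reboot'

# After both are back up: Nomad reports the deployment as healthy ...
nomad job status minio

# ... while Consul reports the minio checks as failing
# (assumes a local Consul agent listening on port 8500).
curl -s http://127.0.0.1:8500/v1/health/checks/minio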

Actual Result

[screenshots: Nomad UI showing a healthy deployment; Consul UI showing the failing health check]

Job file (if appropriate)

job "minio" {
  datacenters = ["nomadder1"]
  type = "service"
  reschedule {
    delay          = "10s"
    delay_function = "constant"
    unlimited      = true
  }
  update {
      max_parallel      = 1
      health_check      = "checks"
      # Alloc is marked as unhealthy after this time
      healthy_deadline  = "2m"
      min_healthy_time  = "10s"
  }
  group "minio" {

    count = 1
    volume "stack_core_minio_volume" {
      type      = "host"
      source    = "core_minio_volume"
      read_only = false
    }
   volume "ca_cert" {
      type      = "host"
      source    = "ca_cert"
      read_only = true
    }
    restart {
      attempts = 10
      interval = "2m"
      delay = "5s"
      mode = "fail"
    }
    network {
      mode = "bridge"
      port "http" {
        to = 9000
      }
      port "console" {
        to = 9001
      }
    }

    task "minio" {
      volume_mount {
         volume      = "stack_core_minio_volume"
         destination = "/data"
      }
      volume_mount {
        volume      = "ca_cert"
        # the server searches in the /CAs path at that specified directory.
        # Do not change the sub folder name CAs
        destination = "/certs/CAs"
      }
      driver = "docker"

      config {
        image = "registry.cloud.private/minio/minio:RELEASE.2023-03-09T23-16-13Z"
        command = "server"
        args = [
          "/data",
          "--console-address",
          ":9001",
          "--certs-dir",
          "/certs"
          ]
         ports = ["http","console"]
      }

     env {
        HOSTNAME = "${NOMAD_ALLOC_NAME}"
        MINIO_SERVER_URL = "https://minio.cloud.private"
        MINIO_PROMETHEUS_AUTH_TYPE = "public"
        MINIO_PROMETHEUS_URL     = "http://mimir.service.consul:9009/prometheus"
        MINIO_PROMETHEUS_JOB_ID  = "integrations/minio"

      }
    template {
       destination = "${NOMAD_SECRETS_DIR}/env.vars"
       env         = true
       change_mode = "restart"
       data        = <<EOF
       {{- with nomadVar "nomad/jobs/minio" -}}
          MINIO_ROOT_USER      = {{.minio_root_user}}
          MINIO_ROOT_PASSWORD  =  {{.minio_root_password}}
        {{- end -}}
       EOF
    }

      resources {
        cpu        = 500
        memory = 512
        memory_max = 4096
      }

      service {
        port = "http"
        name = "minio"
        tags = [
         "prometheus_minio",
         "frontend",
         "minio",
         "prometheus:server=${NOMAD_ALLOC_NAME}",
         "prometheus:version=RELEASE.2023-03-09T23-16-13Z",
         "traefik.enable=true",
         "traefik.consulcatalog.connect=false",
         "traefik.http.routers.minio.tls=true",
         "traefik.http.routers.minio.rule=Host(`minio.cloud.private`)",
        ]

      check {
        name      = "minio-live"
        type     = "http"
        path     = "/minio/health/live"
        port     = "http"
        interval = "10s"
        timeout  = "2s"
      }
      check {
          name      = "minio-ready"
          type     = "http"
          path     = "/minio/health/ready"
          port     = "http"
          interval = "10s"
          timeout  = "4s"
          check_restart {
            limit = 3
            grace = "60s"
            ignore_warnings = false
          }
       }
     }
      service {
           port = "console"
           name = "minio-console"
           tags = [
             "console",
             "minio",
             "traefik.enable=true",
             "traefik.consulcatalog.connect=false",
             "traefik.http.routers.minio-console.tls=true",
             "traefik.http.routers.minio-console.rule=Host(`minio.console.cloud.private`)",
           ]
           check {
             type     = "http"
             path     = "/"
             port     = "console"
             interval = "30s"
             timeout  = "2s"
           }
         }
      }
  }
}

Nomad Server logs (if appropriate)

hod=GET path=/v1/job/minio/allocations?index=2651 duration=8.98017897s
Mar 13 17:15:35 master-01 nomad[862]: 2023-03-13T17:15:35.724Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=2.135934ms
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/allocations?index=2654 duration=646.380027ms
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/summary?index=2651 duration=2.973866356s
Mar 13 17:15:36 master-01 nomad[862]: 2023-03-13T17:15:36.361Z [DEBUG] http: request complete: method=GET path=/v1/job/minio/summary?index=2651 duration=9.604144736s
Mar 13 17:15:39 master-01 nomad[862]: 2023-03-13T17:15:39.584Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:56384
Mar 13 17:15:39 master-01 nomad[862]: 2023-03-13T17:15:39.677Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.050135ms
Mar 13 17:15:41 master-01 nomad[862]: 2023-03-13T17:15:41.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=2.096954ms
Mar 13 17:15:43 master-01 nomad[862]: 2023-03-13T17:15:43.676Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.192653ms
Mar 13 17:15:47 master-01 nomad[862]: 2023-03-13T17:15:47.678Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.810706ms
Mar 13 17:15:49 master-01 nomad[862]: 2023-03-13T17:15:49.585Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:52056
Mar 13 17:15:51 master-01 nomad[862]: 2023-03-13T17:15:51.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.126754ms
Mar 13 17:15:51 master-01 nomad[862]: 2023-03-13T17:15:51.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.35715ms
Mar 13 17:15:55 master-01 nomad[862]: 2023-03-13T17:15:55.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.214397ms
Mar 13 17:15:59 master-01 nomad[862]: 2023-03-13T17:15:59.588Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:52266
Mar 13 17:15:59 master-01 nomad[862]: 2023-03-13T17:15:59.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.349902ms
Mar 13 17:16:01 master-01 nomad[862]: 2023-03-13T17:16:01.985Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.264898ms
Mar 13 17:16:03 master-01 nomad[862]: 2023-03-13T17:16:03.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.765445ms
Mar 13 17:16:07 master-01 nomad[862]: 2023-03-13T17:16:07.674Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.560067ms
Mar 13 17:16:09 master-01 nomad[862]: 2023-03-13T17:16:09.590Z [DEBUG] nomad: memberlist: Stream connection from=127.0.0.1:35512
Mar 13 17:16:11 master-01 nomad[862]: 2023-03-13T17:16:11.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.72123ms
Mar 13 17:16:11 master-01 nomad[862]: 2023-03-13T17:16:11.990Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=4.115019ms
Mar 13 17:16:15 master-01 nomad[862]: 2023-03-13T17:16:15.675Z [DEBUG] http: request complete: method=GET path=/v1/client/allocation/8306cc25-592b-9230-ee60-5d973045bac2/stats duration=1.512785ms

Nomad Client logs (if appropriate)

Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: starting plugin: driver=docker path=/usr/local/bin/nomad args=["/usr/local/bin/nomad", "docker_logger"]
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: plugin started: driver=docker path=/usr/local/bin/nomad pid=15936
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.196Z [DEBUG] client.driver_mgr.docker.docker_logger: waiting for RPC address: driver=docker path=/usr/local/bin/nomad
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.221Z [DEBUG] client.driver_mgr.docker.docker_logger: using plugin: driver=docker version=2
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.221Z [DEBUG] client.driver_mgr.docker.docker_logger.nomad: plugin address: driver=docker address=/tmp/plugin4213717586 network=unix @module=docker_logger timestamp=2023-03-13T17:15:42.221Z
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.222Z [DEBUG] client.driver_mgr.docker.docker_logger.nomad: using client connection initialized from environment: driver=docker @module=docker_logger timestamp=2023-03-13T17:15:42.222Z
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.223Z [INFO] client.alloc_runner.task_runner: Task event: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 task=minio type=Started msg="Task started by client" failed=false
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.240Z [DEBUG] consul.sync: sync complete: registered_services=2 deregistered_services=0 registered_checks=0 deregistered_checks=0
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.422Z [DEBUG] client: updated allocations: index=2656 total=24 pulled=3 filtered=21
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.422Z [DEBUG] client: allocation updates: added=0 removed=0 updated=3 ignored=21
Mar 13 17:15:42 worker-01 nomad[927]: 2023-03-13T17:15:42.425Z [DEBUG] client: allocation updates applied: added=0 removed=0 updated=3 ignored=21 errors=0
Mar 13 17:15:46 worker-01 nomad[927]: 2023-03-13T17:15:46.507Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.675172ms
Mar 13 17:15:56 worker-01 nomad[927]: 2023-03-13T17:15:56.504Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.059369ms
Mar 13 17:16:06 worker-01 nomad[927]: 2023-03-13T17:16:06.311Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/minio?index=1886&namespace=default&stale=&wait=60000ms" duration=1m0.84551494s
Mar 13 17:16:06 worker-01 nomad[927]: 2023-03-13T17:16:06.505Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.112804ms
Mar 13 17:16:16 worker-01 nomad[927]: 2023-03-13T17:16:16.505Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration="988.982µs"
Mar 13 17:16:16 worker-01 nomad[927]: 2023-03-13T17:16:16.723Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m3.404017411s
Mar 13 17:16:19 worker-01 nomad[927]: 2023-03-13T17:16:19.944Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m0.406644411s
Mar 13 17:16:26 worker-01 nomad[927]: 2023-03-13T17:16:26.508Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.542904ms
Mar 13 17:16:31 worker-01 nomad[927]: 2023-03-13T17:16:31.131Z [DEBUG] http: request complete: method=GET path="/v1/var/nomad/jobs/security?index=1885&namespace=default&stale=&wait=60000ms" duration=1m0.295490132s
Mar 13 17:16:36 worker-01 nomad[927]: 2023-03-13T17:16:36.506Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.148015ms
Mar 13 17:16:42 worker-01 nomad[927]: 2023-03-13T17:16:42.711Z [DEBUG] watch.checks: check became unhealthy. Will restart if check doesn't become healthy: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 check=minio-ready task=minio time_limit=20s
Mar 13 17:16:46 worker-01 nomad[927]: 2023-03-13T17:16:46.506Z [DEBUG] http: request complete: method=GET path=/v1/metrics?format=prometheus duration=1.506482ms

@suikast42
Contributor Author

That's strange.

If I shut down both the master and the worker and then boot only the master, Nomad shows me the allocations as running:
[screenshot: Nomad UI showing the allocations as running]

@tgross
Member

tgross commented Mar 24, 2023

Hi @suikast42! Here's the relevant bit from your logs:

Mar 13 17:16:42 worker-01 nomad[927]: 2023-03-13T17:16:42.711Z [DEBUG] watch.checks: check became unhealthy. Will restart if check doesn't become healthy: alloc_id=8306cc25-592b-9230-ee60-5d973045bac2 check=minio-ready task=minio time_limit=20s

The first thing I'd look at in this case is whether the health check should be passing at all. Just because the client's handle to the allocation has been restored doesn't mean that it's actually able to serve the health check. You should check (example commands are sketched after this list):

  • The minio process is actually running.
  • The allocation logs don't show errors.
  • The HTTP port is still reachable.
  • The error messages in Consul's UI for the health check.
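
For example, a minimal sketch of commands for those checks (the alloc ID prefix comes from the logs above; the Consul agent address and the mapped host:port are assumptions to fill in from your environment):

# Is the task still running, and what do its logs say?
nomad alloc status 8306cc25
nomad alloc logs -stderr 8306cc25 minio

# Is the mapped http port reachable from the client host?
# (Replace host:port with the address shown in `nomad alloc status`.)
curl -sv http://<client-ip>:<mapped-port>/minio/health/ready

# What does Consul itself report for the checks, including their error output?
curl -s http://127.0.0.1:8500/v1/health/checks/minio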

It's also a little unclear whether you've restarted the client agent or the entire host the client is on. If you reboot the host, the allocation will not be restored when the Nomad client starts, and the client will have to get a rescheduled allocation from the server. That might explain why restarting the server caused the problem.

If I shut down both the master and the worker and then boot only the master, Nomad shows me the allocations as running

The server will wait until the client misses a heartbeat before marking the client as down, and only then will the allocation be marked lost. How long this takes varies with the configuration and size of the cluster. See the Client Heartbeats documentation.
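
One way to watch that sequence from the server while the worker is powered off (a sketch; the 5-second polling interval is arbitrary, and the timing window is governed by the server's documented heartbeat settings such as heartbeat_grace and min_heartbeat_ttl):

# The node is reported "ready" until heartbeats are missed,
# then flips to "down", and only afterwards is the allocation marked "lost".
watch -n 5 'nomad node status && nomad job status minio'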

@tgross changed the title from "Deployments hangs on (re)start master and and work at same time" to "restarting server and client results in failing Consul health checks" on Mar 24, 2023
@suikast42
Contributor Author

I redefined the reschedule, restart, and update configs as follows and no longer have the described issue.

job "minio" {
  datacenters = ["nomadder1"]
  type = "service"
   reschedule {
    delay          = "10s"
    delay_function = "constant"
    unlimited      = true
  }
  update {
      max_parallel      = 1
      health_check      = "checks"
      # Alloc is marked as unhealthy after this time
      healthy_deadline  = "5m"
      auto_revert  = true
      # Mark the task as healthy after 10s positive check
      min_healthy_time  = "10s"
      # Task is dead after failed checks in 1h
      progress_deadline = "1h"
  }
  group "minio" {

    count = 1
    volume "stack_core_minio_volume" {
      type      = "host"
      source    = "core_minio_volume"
      read_only = false
    }
   volume "ca_cert" {
      type      = "host"
      source    = "ca_cert"
      read_only = true
    }
    restart {
      attempts = 1
      interval = "1h"
      delay = "5s"
      mode = "fail"
    }
    network {
      mode = "bridge"
      port "http" {
        to = 9000
      }
      port "console" {
        to = 9001
      }
    }

    task "minio" {
      volume_mount {
         volume      = "stack_core_minio_volume"
         destination = "/data"
      }
      volume_mount {
        volume      = "ca_cert"
        # the server searches in the /CAs path at that specified directory.
        # Do not change the sub folder name CAs
        destination = "/certs/CAs"
      }
      driver = "docker"

      config {
        image = "registry.cloud.private/minio/minio:RELEASE.2023-03-09T23-16-13Z"
        command = "server"
        args = [
          "/data",
          "--console-address",
          ":9001",
          "--certs-dir",
          "/certs"
          ]
         ports = ["http","console"]
      }

     env {
        HOSTNAME = "${NOMAD_ALLOC_NAME}"
        MINIO_SERVER_URL = "https://minio.cloud.private"
        #MINIO_IDENTITY_OPENID_CONFIG_URL="https://security.cloud.private/realms/nomadder/.well-known/openid-configuration"
        MINIO_PROMETHEUS_AUTH_TYPE = "public"
        MINIO_PROMETHEUS_URL     = "http://mimir.service.consul:9009/prometheus"
        MINIO_PROMETHEUS_JOB_ID  = "integrations/minio"
      }
    template {
       destination = "${NOMAD_SECRETS_DIR}/env.vars"
       env         = true
       change_mode = "restart"
       data        = <<EOF
       {{- with nomadVar "nomad/jobs/minio" -}}
          MINIO_ROOT_USER      = {{.minio_root_user}}
          MINIO_ROOT_PASSWORD  =  {{.minio_root_password}}
        {{- end -}}
       EOF
    }

      resources {
        cpu        = 500
        memory = 512
        memory_max = 4096
      }

      service {
        port = "http"
        name = "minio"
        tags = [
         "prometheus_minio",
         "frontend",
         "minio",
         "prometheus:server=${NOMAD_ALLOC_NAME}",
         "prometheus:version=RELEASE.2023-03-09T23-16-13Z",
         "traefik.enable=true",
         "traefik.consulcatalog.connect=false",
         "traefik.http.routers.minio.tls=true",
         "traefik.http.routers.minio.rule=Host(`minio.cloud.private`)",
        ]

      check {
        name      = "minio-live"
        type     = "http"
        path     = "/minio/health/live"
        port     = "http"
        interval = "10s"
        timeout  = "2s"
      }
      check {
          name      = "minio-ready"
          type     = "http"
          path     = "/minio/health/ready"
          port     = "http"
          interval = "10s"
          timeout  = "2s"
          check_restart {
            limit = 3
            grace = "60s"
            ignore_warnings = false
          }
       }
     }
      service {
           port = "console"
           name = "minio-console"
           tags = [
             "console",
             "minio",
             "traefik.enable=true",
             "traefik.consulcatalog.connect=false",
             "traefik.http.routers.minio-console.tls=true",
             "traefik.http.routers.minio-console.rule=Host(`minio.console.cloud.private`)",
           ]
           check {
             type     = "http"
             path     = "/"
             port     = "console"
             interval = "10s"
             timeout  = "2s"
             check_restart {
               limit = 3
               grace = "60s"
               ignore_warnings = false
             }
           }
         }
      }
  }
}
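
One way to confirm the new settings hold up after another simultaneous reboot (a sketch, assuming a local Consul agent on port 8500 and the same hypothetical job file name as above):

nomad job run minio.nomad.hcl
# All minio checks should report "passing" once the client has re-registered them.
curl -s http://127.0.0.1:8500/v1/health/checks/minio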
