diff --git a/production/README.md b/production/README.md
index c90f83f5f103..030ae6b3ec49 100644
--- a/production/README.md
+++ b/production/README.md
@@ -1,9 +1,10 @@
 # Running Loki

-Currently there are five ways to try out Loki, in order from easier to hardest:
+Currently there are six ways to try out Loki, in order from easiest to hardest:

 - [Grafana Cloud: Hosted Logs](#grafana-cloud-logs)
 - [Run Loki locally with Docker](#run-locally-using-docker)
+- [Run Loki with Nomad](#run-with-nomad)
 - [Use Helm to deploy on Kubernetes](#using-helm-to-deploy-on-kubernetes)
 - [Build Loki from source](#build-and-run-from-source)
 - [Get inspired by our production setup](#get-inspired-by-our-production-setup)
@@ -43,6 +44,12 @@ For instructions on how to query Loki, see [our usage docs](https://grafana.com/

 To deploy a cluster of Loki locally, please refer to this [doc](./docker/)

+## Run with Nomad
+
+There are example [Nomad jobs](./nomad) that can be used to deploy Loki with
+[Nomad](https://www.nomadproject.io/), a simple and powerful workload
+orchestrator from HashiCorp.
+
 ## Using Helm to deploy on Kubernetes

 There is a [Helm chart](helm) to deploy Loki and Promtail to Kubernetes.

diff --git a/production/nomad/README.md b/production/nomad/README.md
new file mode 100644
index 000000000000..14c3d9db286c
--- /dev/null
+++ b/production/nomad/README.md
@@ -0,0 +1,232 @@
# Loki Nomad examples

## Requirements

### Hard requirements

- a recent version of Nomad [installed](https://www.nomadproject.io/docs/install)
  with a healthy Docker driver
- [Consul integration](https://www.nomadproject.io/docs/integrations/consul-integration)
  enabled in Nomad
- access to S3 storage

### Optional requirements

- [Vault integration](https://www.nomadproject.io/docs/integrations/vault-integration)
  for providing S3 credentials securely
- Traefik configured to use the
  [Consul provider](https://doc.traefik.io/traefik/providers/consul-catalog/) to
  load balance between Loki instances

### Production use

For use in production it is recommended to:

- secure HTTP endpoints with
  [Consul Connect](https://www.nomadproject.io/docs/integrations/consul-connect)
- set up authentication - this can be achieved with
  [Traefik](https://doc.traefik.io/traefik/middlewares/http/basicauth/)
- secure gRPC communication with mTLS - this can be achieved with Vault's
  [PKI secrets engine](https://www.vaultproject.io/docs/secrets/pki)

See the [loki-distributed](./loki-distributed) README for more info.

## Service discovery when scaling

When using multiple Loki instances memberlist advertises the wrong address (see
this [issue](https://github.com/grafana/loki/issues/5610)), which is why these
examples use a Consul ring for service discovery.

If you are using Nomad then you are probably also using Consul, so this
shouldn't be an issue.

## Run Loki behind a load balancer

When running multiple instances of Loki, incoming requests should be load
balanced.
Register Loki in Traefik:

```hcl
tags = [
  "traefik.enable=true",
  "traefik.http.routers.loki.entrypoints=https",
  "traefik.http.routers.loki.rule=Host(`loki.service.consul`)",
]
```

## Set up basicauth

Generate basicauth credentials:

```shell
> docker run --rm httpd:alpine htpasswd -nb promtail password123
promtail:$apr1$Lr55BanK$BV/rE2POaOolkFz8kIfY4/
```

Register Loki in Traefik:

```hcl
tags = [
  "traefik.enable=true",
  "traefik.http.routers.loki.entrypoints=https",
  "traefik.http.routers.loki.rule=Host(`loki.service.consul`)",
  "traefik.http.middlewares.loki.basicauth.users=promtail:$apr1$Lr55BanK$BV/rE2POaOolkFz8kIfY4/",
  "traefik.http.routers.loki.middlewares=loki@consulcatalog",
]
```

Update the Promtail config:

```yaml
clients:
  - url: https://loki.service.consul/loki/api/v1/push
    basic_auth:
      username: promtail
      password: password123
```

## Use HashiCorp Vault to provide S3 credentials

To provide static credentials:

```hcl
template {
  data = <<-EOH
    {{ with secret "secret/minio/loki" }}
    S3_ACCESS_KEY_ID={{ .Data.data.access_key }}
    S3_SECRET_ACCESS_KEY={{ .Data.data.secret_key }}
    {{- end }}
    EOH

  destination = "secrets/s3.env"
  env         = true
}
```

It is better to provide dynamic credentials using the
[AWS secrets engine](https://www.vaultproject.io/docs/secrets/aws) when using AWS
S3:

```hcl
template {
  data = <<-EOH
    {{ with secret "aws/creds/loki" -}}
    S3_ACCESS_KEY_ID={{ .Data.access_key }}
    S3_SECRET_ACCESS_KEY={{ .Data.secret_key }}
    {{- end }}
    EOH

  destination = "secrets/s3.env"
  env         = true
}
```

## Supply alerting rules to the Loki ruler with `local` ruler storage

### Using the [`artifact` stanza](https://www.nomadproject.io/docs/job-specification/artifact)

Alert rules can be downloaded from remote storage using the artifact stanza. It
supports:

- Git
- Mercurial
- HTTP
- Amazon S3
- Google GCS

Example with git:

```hcl
artifact {
  source      = "git::github.com////"
  destination = "local/rules/"
}
```

### Using local files

Alert rules can be stored locally (beside the job definition) and provided to
the Loki ruler container with the
[`template`](https://www.nomadproject.io/docs/job-specification/template) stanza
and some HCL magic, namely:

- [fileset](https://www.nomadproject.io/docs/job-specification/hcl2/functions/file/fileset) -
  to generate a list of files
- [file](https://www.nomadproject.io/docs/job-specification/hcl2/functions/file/file) -
  to get the content of a file
- [dynamic](https://www.nomadproject.io/docs/job-specification/hcl2/expressions#dynamic-blocks) -
  to dynamically generate a `template` stanza for each file found

Example:

```shell
> tree rules/
rules/
└── fake
    └── some-alerts.yml

1 directory, 1 file
```

```hcl
dynamic "template" {
  for_each = fileset(".", "rules/**")

  content {
    data            = file(template.value)
    destination     = "local/${template.value}"
    left_delimiter  = "[["
    right_delimiter = "]]"
  }
}
```

Each file will end up in `/local/rules/` inside the ruler container.
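For reference, the rule files themselves are ordinary Prometheus-style rule
groups with LogQL expressions. A minimal sketch of what
`rules/fake/some-alerts.yml` might contain (the `fake` directory is the tenant
ID Loki uses when `auth_enabled: false`; the `myapp` job label is a
placeholder):

```yaml
groups:
  - name: example
    rules:
      - alert: HighLogErrorRate
        # LogQL metric query: rate of log lines containing "error"
        expr: |
          sum(rate({job="myapp"} |= "error" [5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: high error rate in myapp logs
```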
### Using Consul K/V and Terraform

```shell
> tree loki-rules/
loki-rules/
└── fake
    └── some-alerts.yml

1 directory, 1 file
```

Using the Terraform
[Consul provider](https://registry.terraform.io/providers/hashicorp/consul/latest/docs/resources/keys),
put all files from `loki-rules/` into Consul K/V:

```hcl
resource "consul_keys" "loki-rules" {
  dynamic "key" {
    for_each = fileset("${path.module}/loki-rules", "**")
    content {
      path   = "configs/loki-rules/${trimsuffix(key.value, ".yml")}"
      value  = file("${path.module}/loki-rules/${key.value}")
      delete = true
    }
  }
}
```

Provide the rules from K/V to the Loki ruler container inside Nomad with
[`safeTree`](https://github.com/hashicorp/consul-template/blob/main/docs/templating-language.md#safetree):

```hcl
template {
  data = <<-EOF
    {{- range safeTree "configs/loki-rules" }}
    ---
    {{ .Value | indent 2 }}
    {{ end -}}
    EOF

  destination   = "local/rules/fake/rules.yml"
  change_mode   = "signal"
  change_signal = "SIGINT"
}
```

When the rules are updated in Consul K/V, they will be automatically updated in
the Loki ruler.

diff --git a/production/nomad/loki-distributed/README.md b/production/nomad/loki-distributed/README.md
new file mode 100644
index 000000000000..c966bb091ba5
--- /dev/null
+++ b/production/nomad/loki-distributed/README.md
@@ -0,0 +1,209 @@
# Microservices mode

This Nomad job will deploy Loki in
[microservices mode](https://grafana.com/docs/loki/latest/fundamentals/architecture/deployment-modes/#microservices-mode)
using boltdb-shipper and an S3 backend.

## Usage

Have a look at the job file and Loki configuration file and change them to suit
your environment.

### Run job

Inside the directory with the job file, run:

```shell
nomad run job.nomad.hcl
```

To deploy a different version, change the `variable.version` default value or
specify it on the command line:

```shell
nomad job run -var="version=2.5.0" job.nomad.hcl
```

### Scale Loki

Change `count` of the desired group in the job file and run:

```shell
nomad run job.nomad.hcl
```

or use the Nomad CLI:

```shell
nomad job scale loki distributor
```

## Recommendations for running in production

### Gather metrics

To collect metrics from all components, use this Prometheus scrape config:

```yaml
- job_name: "loki"
  consul_sd_configs:
    - services:
        - "loki-compactor"
        - "loki-ruler"
        - "loki-distributor"
        - "loki-ingester"
        - "loki-querier"
        - "loki-index-gateway"
        - "loki-query-frontend"
        - "loki-query-scheduler"
  relabel_configs:
    - source_labels: ["__meta_consul_service_metadata_alloc_id"]
      target_label: "instance"
    - source_labels: ["__meta_consul_service_metadata_component"]
      target_label: "component"
```

### Secure HTTP endpoints with Consul Connect

Set the network to `bridge` mode and add a `health` port, which will be used by
the Consul health check:

```hcl
  network {
    mode = "bridge"

    port "http" {}
    port "health" {}
    port "grpc" {}
  }
```

```hcl
  task "distributor" {
    driver       = "docker"
    user         = "nobody"
    kill_timeout = "90s"

    config {
      image = "grafana/loki:${var.version}"
      ports = [
        "http",
        "health", # do not forget to publish the health port
        "grpc",
      ]
```

Bind the HTTP endpoint to `127.0.0.1:80` so it is not accessible from outside:

```yaml
server:
  http_listen_address: 127.0.0.1
  http_listen_port: 80
```

Add a service registration with Consul Connect enabled, the `/metrics` and
`/ready` endpoints
[exposed](https://www.nomadproject.io/docs/job-specification/expose), and the
API accessible with basicauth through
Traefik with Consul Connect integration:

```hcl
  service {
    name = "loki-distributor"
    port = "http"

    meta {
      alloc_id  = NOMAD_ALLOC_ID
      component = "distributor"
    }

    tags = [
      "traefik.enable=true",
      "traefik.consulcatalog.connect=true",

      "traefik.http.routers.loki-distributor.entrypoints=https",
      "traefik.http.routers.loki-distributor.rule=Host(`loki-distributor.service.consul`)",
      "traefik.http.middlewares.loki-distributor.basicauth.users=promtail:$$apr1$$wnih40yf$$vcxJYiqcEQLknQAZcpy/I1",
      "traefik.http.routers.loki-distributor.middlewares=loki-distributor@consulcatalog",

      "traefik.http.routers.loki-distributor-ring.entrypoints=https",
      "traefik.http.routers.loki-distributor-ring.rule=Host(`loki-distributor.service.consul`) && Path(`/distributor/ring`)",
      "traefik.http.middlewares.loki-distributor-ring.basicauth.users=devops:$apr1$bNIZL02A$QrOgT3NAOx.koXWnqfXbo0",
      "traefik.http.routers.loki-distributor-ring.middlewares=loki-distributor-ring@consulcatalog",
    ]

    check {
      name     = "Loki distributor"
      port     = "health"
      type     = "http"
      path     = "/ready"
      interval = "20s"
      timeout  = "1s"
    }

    connect {
      sidecar_service {
        proxy {
          local_service_port = 80

          expose {
            path {
              path            = "/metrics"
              protocol        = "http"
              local_path_port = 80
              listener_port   = "http"
            }

            path {
              path            = "/ready"
              protocol        = "http"
              local_path_port = 80
              listener_port   = "health"
            }
          }
        }
      }
    }
  }
```

## Secure gRPC endpoints with mTLS

Unfortunately Consul Connect cannot be used to secure gRPC communication between
Loki components, since some components must be able to connect to all instances
of other components. We can secure the components' gRPC communication with
Vault's [PKI engine](https://www.vaultproject.io/docs/secrets/pki).

Certificate generation can be made less verbose with the following HCL trick:

1. Add the following to `locals`:

```hcl
locals {
  certs = {
    "CA"   = "issuing_ca",
    "cert" = "certificate",
    "key"  = "private_key",
  }
}
```

2. Add a dynamic template per service:

```hcl
  dynamic "template" {
    for_each = local.certs
    content {
      data = <<-EOH
        {{- with secret "pki/issue/internal" "ttl=10d" "common_name=loki-.service.consul" (env "attr.unique.network.ip-address" | printf "ip_sans=%s") -}}
        {{ .Data.${template.value} }}
        {{- end -}}
        EOH

      destination = "secrets/certs/${template.key}.pem"
      change_mode = "restart"
      splay       = "5m"
    }
  }
```

3.
Update the config to use the generated certificates (see the commented-out TLS
sections in `config.yml` below for where they plug in).

diff --git a/production/nomad/loki-distributed/config.yml b/production/nomad/loki-distributed/config.yml
new file mode 100644
index 000000000000..d4b7cf87efdb
--- /dev/null
+++ b/production/nomad/loki-distributed/config.yml
@@ -0,0 +1,128 @@
auth_enabled: false

server:
  log_level: info
  http_listen_port: {{ env "NOMAD_PORT_http" }}
  grpc_listen_port: {{ env "NOMAD_PORT_grpc" }}
  # grpc_tls_config:
  #   client_auth_type: "RequireAndVerifyClientCert"
  #   client_ca_file: "/secrets/certs/CA.pem"
  #   cert_file: "/secrets/certs/cert.pem"
  #   key_file: "/secrets/certs/key.pem"

common:
  replication_factor: 2
  # Tell Loki which address to advertise
  instance_addr: {{ env "NOMAD_IP_grpc" }}
  # Failure domain
  # Must be the same as specified in job constraints
  instance_availability_zone: {{ env "node.unique.name" }}
  zone_awareness_enabled: true
  ring:
    # Tell Loki which address to advertise in ring
    instance_addr: {{ env "NOMAD_IP_grpc" }}
    kvstore:
      store: consul
      prefix: loki/
      consul:
        host: {{ env "attr.unique.network.ip-address" }}:8500

# ingester_client:
#   grpc_client_config:
#     grpc_compression: snappy
#     tls_enabled: true
#     tls_ca_path: "/secrets/certs/CA.pem"
#     tls_cert_path: "/secrets/certs/cert.pem"
#     tls_key_path: "/secrets/certs/key.pem"

ingester:
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/wal
    flush_on_shutdown: true
    replay_memory_ceiling: "1G"

# query_scheduler:
#   grpc_client_config:
#     grpc_compression: snappy
#     tls_enabled: true
#     tls_ca_path: "/secrets/certs/CA.pem"
#     tls_cert_path: "/secrets/certs/cert.pem"
#     tls_key_path: "/secrets/certs/key.pem"

frontend:
  scheduler_address: loki-query-scheduler.service.consul:9096
  compress_responses: true
  log_queries_longer_than: 5s
  # grpc_client_config:
  #   grpc_compression: snappy
  #   tls_enabled: true
  #   tls_ca_path: "/secrets/certs/CA.pem"
  #   tls_cert_path: "/secrets/certs/cert.pem"
  #   tls_key_path: "/secrets/certs/key.pem"

frontend_worker:
  scheduler_address: loki-query-scheduler.service.consul:9096
  # grpc_client_config:
  #   grpc_compression: snappy
  #   tls_enabled: true
  #   tls_ca_path: "/secrets/certs/CA.pem"
  #   tls_cert_path: "/secrets/certs/cert.pem"
  #   tls_key_path: "/secrets/certs/key.pem"

schema_config:
  configs:
    - from: 2022-05-15
      store: boltdb-shipper
      object_store: s3
      schema: v12
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    # Nomad ephemeral disk is used to store index and cache;
    # it will try to preserve /alloc/data between job updates
    active_index_directory: {{ env "NOMAD_ALLOC_DIR" }}/data/index
    cache_location: {{ env "NOMAD_ALLOC_DIR" }}/data/index-cache
    shared_store: s3
    index_gateway_client:
      server_address: loki-index-gateway.service.consul:9097
      # grpc_client_config:
      #   grpc_compression: snappy
      #   tls_enabled: true
      #   tls_ca_path: "/secrets/certs/CA.pem"
      #   tls_cert_path: "/secrets/certs/cert.pem"
      #   tls_key_path: "/secrets/certs/key.pem"

  aws:
    endpoint: https://minio.service.consul
    bucketnames: loki
    region: us-west-1
    access_key_id: ${S3_ACCESS_KEY_ID}
    secret_access_key: ${S3_SECRET_ACCESS_KEY}
    s3forcepathstyle: true

compactor:
  working_directory: {{ env "NOMAD_ALLOC_DIR" }}/compactor
  shared_store: s3
  compaction_interval: 24h
  retention_enabled: true

ruler:
  alertmanager_url: https://alertmanager.service.consul
  enable_alertmanager_v2: true
  enable_api: true
  external_url: https://loki-ruler.service.consul
  rule_path: {{ env "NOMAD_ALLOC_DIR" }}/tmp/rules
  storage:
    type: local
    local:
      directory: {{ env "NOMAD_TASK_DIR" }}/rules
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/ruler

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

diff --git a/production/nomad/loki-distributed/job.nomad.hcl b/production/nomad/loki-distributed/job.nomad.hcl
new file mode 100644
index 000000000000..ef85828172fc
--- /dev/null
+++ b/production/nomad/loki-distributed/job.nomad.hcl
@@ -0,0 +1,615 @@
variable "version" {
  type        = string
  description = "Loki version"
  default     = "2.5.0"
}

job "loki" {
  datacenters = ["dc1"]

  group "compactor" {
    count = 1

    ephemeral_disk {
      size   = 1000
      sticky = true
    }

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-compactor"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "compactor"
      }

      tags = [
        "traefik.enable=true",

        "traefik.http.routers.loki-compactor-ring.entrypoints=https",
        "traefik.http.routers.loki-compactor-ring.rule=Host(`loki-compactor.service.consul`) && Path(`/compactor/ring`)",
      ]

      check {
        name     = "Loki compactor"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "compactor" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=compactor",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      resources {
        cpu        = 3000
        memory     = 256
        memory_max = 1024
      }
    }
  }

  group "ruler" {
    count = 1

    ephemeral_disk {
      size   = 1000
      sticky = true
    }

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-ruler"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "ruler"
      }

      tags = [
        "traefik.enable=true",

        "traefik.http.routers.loki-ruler.entrypoints=https",
        "traefik.http.routers.loki-ruler.rule=Host(`loki-query-frontend.service.consul`) && (PathPrefix(`/loki/api/v1/rules`) || PathPrefix(`/api/prom/rules`) || PathPrefix(`/prometheus/api/v1`))",

        "traefik.http.routers.loki-ruler-ring.entrypoints=https",
        "traefik.http.routers.loki-ruler-ring.rule=Host(`loki-ruler.service.consul`) && Path(`/ruler/ring`)",
      ]

      check {
        name     = "Loki ruler"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "ruler" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=ruler",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      dynamic "template" {
        for_each = fileset(".", "rules/**")

        content {
          data            = file(template.value)
          destination     = "local/${template.value}"
          left_delimiter  = "[["
          right_delimiter = "]]"
        }
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      resources {
        cpu        = 1000
        memory     = 256
        memory_max = 512
      }
    }
  }

  group "distributor" {
    count = 2

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-distributor"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "distributor"
      }

      tags = [
        "traefik.enable=true",

        "traefik.http.routers.loki-distributor.entrypoints=https",
        "traefik.http.routers.loki-distributor.rule=Host(`loki-distributor.service.consul`)",

        "traefik.http.routers.loki-distributor-ring.entrypoints=https",
        "traefik.http.routers.loki-distributor-ring.rule=Host(`loki-distributor.service.consul`) && Path(`/distributor/ring`)",
      ]

      check {
        name     = "Loki distributor"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "distributor" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=distributor",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      resources {
        cpu        = 200
        memory     = 128
        memory_max = 1024
      }
    }
  }

  group "ingester" {
    count = 2

    constraint {
      # choose your failure domain
      # must be the same as `instance_availability_zone` in the config file
      distinct_property = node.unique.name
      # distinct_property = node.datacenter
      # distinct_property = attr.platform.aws.placement.availability-zone
    }

    ephemeral_disk {
      size   = 4000
      sticky = true
    }

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-ingester"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "ingester"
      }

      tags = [
        "traefik.enable=true",

        "traefik.http.routers.loki-ingester-ring.entrypoints=https",
        "traefik.http.routers.loki-ingester-ring.rule=Host(`loki-ingester.service.consul`) && Path(`/ring`)",
      ]

      check {
        name     = "Loki ingester"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "ingester" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=ingester",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      resources {
        cpu        = 300
        memory     = 128
        memory_max = 2048
      }
    }
  }

  group "querier" {
    count = 2

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-querier"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "querier"
      }

      check {
        name     = "Loki querier"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "50s"
        timeout  = "1s"
      }
    }

    task "querier" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=querier",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      resources {
        cpu        = 200
        memory     = 128
        memory_max = 2048
      }
    }
  }

  group "query-scheduler" {
    count = 2

    network {
      port "http" {}
      port "grpc" {
        to     = 9096
        static = 9096
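        # pinned to 9096 because config.yml points frontend and frontend_worker
        # at loki-query-scheduler.service.consul:9096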
      }
    }

    service {
      name = "loki-query-scheduler"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "query-scheduler"
      }

      check {
        name     = "Loki query-scheduler"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "query-scheduler" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=query-scheduler",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      resources {
        cpu        = 100
        memory     = 64
        memory_max = 128
      }
    }
  }

  group "query-frontend" {
    count = 2

    network {
      port "http" {}
      port "grpc" {}
    }

    service {
      name = "loki-query-frontend"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "query-frontend"
      }

      tags = [
        "traefik.enable=true",

        "traefik.http.routers.loki-query-frontend.entrypoints=https",
        "traefik.http.routers.loki-query-frontend.rule=Host(`loki-query-frontend.service.consul`)",
      ]

      check {
        name     = "Loki query-frontend"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "query-frontend" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=query-frontend",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      resources {
        cpu        = 100
        memory     = 64
        memory_max = 128
      }
    }
  }

  group "index-gateway" {
    count = 1

    ephemeral_disk {
      size   = 1000
      sticky = true
    }

    network {
      port "http" {}
      port "grpc" {
        to     = 9097
        static = 9097
      }
    }

    service {
      name = "loki-index-gateway"
      port = "http"

      meta {
        alloc_id  = NOMAD_ALLOC_ID
        component = "index-gateway"
      }

      check {
        name     = "Loki index-gateway"
        port     = "http"
        type     = "http"
        path     = "/ready"
        interval = "20s"
        timeout  = "1s"
      }
    }

    task "index-gateway" {
      driver       = "docker"
      user         = "nobody"
      kill_timeout = "90s"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=index-gateway",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      resources {
        cpu        = 200
        memory     = 128
        memory_max = 1024
      }
    }
  }
}

diff --git a/production/nomad/loki-simple/README.md b/production/nomad/loki-simple/README.md
new file mode 100644
index 000000000000..4270bbde8b6b
--- /dev/null
+++ b/production/nomad/loki-simple/README.md
@@ -0,0 +1,40 @@
# Simple scalable deployment mode

This Nomad job will deploy Loki in
[simple scalable deployment mode](https://grafana.com/docs/loki/latest/fundamentals/architecture/deployment-modes/#simple-scalable-deployment-mode)
with minimal dependencies, using boltdb-shipper, an S3 backend, and the ability
to scale.

## Usage

Have a look at the job file and Loki configuration file and change them to suit
your environment.
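Since reads and writes are handled by separate groups, log shippers should push
to the write service. A minimal Promtail client config, assuming the
`loki-write.service.consul` Traefik rule from the job file below, might look
like:

```yaml
clients:
  # push logs to the write path; the read path only serves queries
  - url: https://loki-write.service.consul/loki/api/v1/push
```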
### Run job

Inside the directory with the job file, run:

```shell
nomad run job.nomad.hcl
```

To deploy a different version, change the `variable.version` default value or
specify it on the command line:

```shell
nomad job run -var="version=2.5.0" job.nomad.hcl
```

### Scale Loki

Change `count` of the `read` or `write` group in the job file and run:

```shell
nomad run job.nomad.hcl
```

or use the Nomad CLI:

```shell
nomad job scale loki write
```

diff --git a/production/nomad/loki-simple/config.yml b/production/nomad/loki-simple/config.yml
new file mode 100644
index 000000000000..dc57f2e80b0e
--- /dev/null
+++ b/production/nomad/loki-simple/config.yml
@@ -0,0 +1,75 @@
auth_enabled: false

server:
  log_level: info
  http_listen_port: {{ env "NOMAD_PORT_http" }}
  grpc_listen_port: {{ env "NOMAD_PORT_grpc" }}

common:
  replication_factor: 1
  # Tell Loki which address to advertise
  instance_addr: {{ env "NOMAD_IP_grpc" }}
  ring:
    # Tell Loki which address to advertise in ring
    instance_addr: {{ env "NOMAD_IP_grpc" }}
    kvstore:
      store: consul
      prefix: loki/
      consul:
        host: {{ env "attr.unique.network.ip-address" }}:8500

ingester:
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/wal
    flush_on_shutdown: true
    replay_memory_ceiling: "1G"

schema_config:
  configs:
    - from: 2022-05-15
      store: boltdb-shipper
      object_store: s3
      schema: v12
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    # Nomad ephemeral disk is used to store index and cache;
    # it will try to preserve /alloc/data between job updates
    active_index_directory: {{ env "NOMAD_ALLOC_DIR" }}/data/index
    cache_location: {{ env "NOMAD_ALLOC_DIR" }}/data/index-cache
    shared_store: s3

  aws:
    endpoint: https://minio.service.consul
    bucketnames: loki
    region: us-west-1
    access_key_id: ${S3_ACCESS_KEY_ID}
    secret_access_key: ${S3_SECRET_ACCESS_KEY}
    s3forcepathstyle: true

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: {{ env "NOMAD_ALLOC_DIR" }}/compactor
  shared_store: s3
  compaction_interval: 5m
  retention_enabled: true

ruler:
  alertmanager_url: https://alertmanager.service.consul
  enable_alertmanager_v2: true
  enable_api: true
  external_url: https://loki.service.consul
  rule_path: {{ env "NOMAD_ALLOC_DIR" }}/tmp/rules
  storage:
    type: local
    local:
      directory: {{ env "NOMAD_TASK_DIR" }}/rules
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/ruler

diff --git a/production/nomad/loki-simple/job.nomad.hcl b/production/nomad/loki-simple/job.nomad.hcl
new file mode 100644
index 000000000000..6e14b4db0f17
--- /dev/null
+++ b/production/nomad/loki-simple/job.nomad.hcl
@@ -0,0 +1,161 @@
variable "version" {
  type        = string
  description = "Loki version"
  default     = "2.5.0"
}

job "loki" {
  datacenters = ["dc1"]

  group "read" {
    count = 1

    ephemeral_disk {
      size   = 1000
      sticky = true
    }

    network {
      port "http" {}
      port "grpc" {}
    }

    task "read" {
      driver = "docker"
      user   = "nobody"

      config {
        image = "grafana/loki:${var.version}"

        ports = [
          "http",
          "grpc",
        ]

        args = [
          "-target=read",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }

      service {
        name = "loki-read"
        port = "http"
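        # use Traefik to load balance between Loki read instances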
"traefik.enable=true", + "traefik.http.routers.loki-read.entrypoints=https", + "traefik.http.routers.loki-read.rule=Host(`loki-read.service.consul`)", + ] + + check { + name = "Loki read" + port = "http" + type = "http" + path = "/ready" + interval = "20s" + timeout = "1s" + + initial_status = "passing" + } + } + + resources { + cpu = 500 + memory = 256 + } + } + } + + group "write" { + count = 2 + + ephemeral_disk { + size = 1000 + sticky = true + } + + network { + port "http" {} + port "grpc" {} + } + + task "write" { + driver = "docker" + user = "nobody" + + config { + image = "grafana/loki:${var.version}" + + ports = [ + "http", + "grpc", + ] + + args = [ + "-target=write", + "-config.file=/local/config.yml", + "-config.expand-env=true", + ] + } + + template { + data = file("config.yml") + destination = "local/config.yml" + } + + template { + data = <<-EOH + S3_ACCESS_KEY_ID= + S3_SECRET_ACCESS_KEY= + EOH + + destination = "secrets/s3.env" + env = true + } + + service { + name = "loki-write" + port = "http" + + tags = [ + "traefik.enable=true", + "traefik.http.routers.loki-write.entrypoints=https", + "traefik.http.routers.loki-write.rule=Host(`loki-write.service.consul`)", + ] + + check { + name = "Loki write" + port = "http" + type = "http" + path = "/ready" + interval = "20s" + timeout = "1s" + + initial_status = "passing" + } + } + + resources { + cpu = 500 + memory = 256 + } + } + } +} diff --git a/production/nomad/loki/README.md b/production/nomad/loki/README.md new file mode 100644 index 000000000000..1692cba4d569 --- /dev/null +++ b/production/nomad/loki/README.md @@ -0,0 +1,40 @@ +# Monolithic mode + +This Nomad job will deploy Loki in +[monolithic mode](https://grafana.com/docs/loki/latest/fundamentals/architecture/deployment-modes/#monolithic-mode) +with minimum dependencies, using boltdb-shipper and S3 backend and with the +ability to scale. + +## Usage + +Have a look at the job file and Loki configuration file and change to suite your +environment. 
### Run job

Inside the directory with the job file, run:

```shell
nomad run job.nomad.hcl
```

To deploy a different version, change the `variable.version` default value or
specify it on the command line:

```shell
nomad job run -var="version=2.5.0" job.nomad.hcl
```

### Scale Loki

Change `count` in the `group "loki"` stanza of the job file and run:

```shell
nomad run job.nomad.hcl
```

or use the Nomad CLI:

```shell
nomad job scale loki loki
```

diff --git a/production/nomad/loki/config.yml b/production/nomad/loki/config.yml
new file mode 100644
index 000000000000..535da397854d
--- /dev/null
+++ b/production/nomad/loki/config.yml
@@ -0,0 +1,75 @@
auth_enabled: false

server:
  log_level: info
  http_listen_port: {{ env "NOMAD_PORT_http" }}
  grpc_listen_port: {{ env "NOMAD_PORT_grpc" }}

common:
  replication_factor: 1
  # Tell Loki which address to advertise
  instance_addr: {{ env "NOMAD_IP_grpc" }}
  ring:
    # Tell Loki which address to advertise in ring
    instance_addr: {{ env "NOMAD_IP_grpc" }}
    kvstore:
      store: consul
      prefix: loki/
      consul:
        host: {{ env "attr.unique.network.ip-address" }}:8500

ingester:
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/wal
    flush_on_shutdown: true
    replay_memory_ceiling: "1G"

schema_config:
  configs:
    - from: 2022-05-15
      store: boltdb-shipper
      object_store: s3
      schema: v12
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    # Nomad ephemeral disk is used to store index and cache;
    # it will try to preserve /alloc/data between job updates
    active_index_directory: {{ env "NOMAD_ALLOC_DIR" }}/data/index
    cache_location: {{ env "NOMAD_ALLOC_DIR" }}/data/index-cache
    shared_store: s3

  aws:
    endpoint: https://s3.endpoint.com
    bucketnames: loki
    region: us-west-1
    access_key_id: ${S3_ACCESS_KEY_ID}
    secret_access_key: ${S3_SECRET_ACCESS_KEY}
    s3forcepathstyle: true

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

compactor:
  working_directory: {{ env "NOMAD_ALLOC_DIR" }}/compactor
  shared_store: s3
  compaction_interval: 5m
  retention_enabled: true

ruler:
  alertmanager_url: https://alertmanager.service.consul
  enable_alertmanager_v2: true
  enable_api: true
  external_url: https://loki.service.consul
  rule_path: {{ env "NOMAD_ALLOC_DIR" }}/tmp/rules
  storage:
    type: local
    local:
      directory: {{ env "NOMAD_TASK_DIR" }}/rules
  wal:
    dir: {{ env "NOMAD_ALLOC_DIR" }}/data/ruler

diff --git a/production/nomad/loki/job.nomad.hcl b/production/nomad/loki/job.nomad.hcl
new file mode 100644
index 000000000000..ba721a634283
--- /dev/null
+++ b/production/nomad/loki/job.nomad.hcl
@@ -0,0 +1,104 @@
variable "version" {
  type        = string
  description = "Loki version"
  default     = "2.5.0"
}

job "loki" {
  datacenters = ["dc1"]

  group "loki" {
    count = 1

    ephemeral_disk {
      # Used to store index, cache, WAL
      # Nomad will try to preserve the disk between job updates
      size   = 1000
      sticky = true
    }

    network {
      port "http" {
        to     = 3100
        static = 3100
      }
      port "grpc" {}
    }

    task "loki" {
      driver = "docker"
      user   = "nobody"

      config {
        image = "grafana/loki:${var.version}"
        ports = [
          "http",
          "grpc",
        ]
        args = [
          "-target=all",
          "-config.file=/local/config.yml",
          "-config.expand-env=true",
        ]
      }

      template {
        data        = file("config.yml")
        destination = "local/config.yml"
      }

      template {
        data = <<-EOH
          S3_ACCESS_KEY_ID=
          S3_SECRET_ACCESS_KEY=
          EOH

        destination = "secrets/s3.env"
        env         = true
      }
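      # Render every file under rules/ (beside the job file) into the task's
      # local/ directory so the ruler can load them. The delimiters are changed
      # to [[ ]] so Nomad's template engine does not interpret {{ }} sequences
      # inside the rule files.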
      dynamic "template" {
        for_each = fileset(".", "rules/**")

        content {
          data            = file(template.value)
          destination     = "local/${template.value}"
          left_delimiter  = "[["
          right_delimiter = "]]"
        }
      }

      service {
        name = "loki"
        port = "http"

        # use Traefik to load balance between Loki instances
        tags = [
          "traefik.enable=true",
          "traefik.http.routers.loki.entrypoints=https",
          "traefik.http.routers.loki.rule=Host(`loki.service.consul`)",
        ]

        check {
          name     = "Loki"
          port     = "http"
          type     = "http"
          path     = "/ready"
          interval = "20s"
          timeout  = "1s"

          initial_status = "passing"
        }
      }

      resources {
        # adjust to suit your load
        cpu    = 500
        memory = 256
        # requires memory_oversubscription
        # https://www.nomadproject.io/api-docs/operator/scheduler#update-scheduler-configuration
        # memory_max = 512
      }
    }
  }
}

diff --git a/production/nomad/loki/rules/fake/alerts.yml b/production/nomad/loki/rules/fake/alerts.yml
new file mode 100644
index 000000000000..2c9f20ff79ef
--- /dev/null
+++ b/production/nomad/loki/rules/fake/alerts.yml
@@ -0,0 +1,11 @@
groups:
  - name: always-firing
    rules:
      - alert: fire
        expr: |
          1 > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: test