Commit

Merge pull request #101 from deefreak/deepakfork/ossDocs
Metrics docs added and few changes to installation doc
bharathguvvala authored Jan 12, 2024
2 parents 37879fb + f4b5a77 commit 199c7f4
Showing 6 changed files with 73 additions and 23 deletions.
15 changes: 9 additions & 6 deletions INSTALLATION.md
@@ -63,16 +63,16 @@ their default values.
| `image.repository` | string | `""` | Image name of ottoscalr deployment |
| `image.tag` | string | `""` | Image tag of ottoscalr deployment |
| `replicaCount` | int | `1` | Number of replicas for the ottoscalr operator. Only one instance is elected leader and serves traffic at a time, so additional replicas do not improve performance; they only reduce downtime during a failover. |
| `resources` | object | `{"limits":{"cpu":1,"memory":"2Gi"},"requests":{"cpu":"1","memory":"2Gi"}}` | Manage [resource request & limits] of ottoscalr operator pod |
| `resources` | object | `{"limits":{"cpu":2,"memory":"4Gi"},"requests":{"cpu":"2","memory":"4Gi"}}` | Manage resource request & limits of ottoscalr operator pod |

#### Operations

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `ottoscalr.config.metricsScraper.prometheusUrl` | string | `""` | URL where prometheus for the kubernetes cluster is running. |
| `ottoscalr.config.metricsScraper.prometheusUrl` | string | `""` | URL where Prometheus for the kubernetes cluster is running. Fetching metrics from a single Prometheus instance or from multiple instances (give comma-separated URLs) is supported; metrics from multiple instances are aggregated. If you have 2 instances named `p8s1` and `p8s2`, set this to `"p8s1,p8s2"` (see the values override sketch after this table). |
| `ottoscalr.config.metricsScraper.queryTimeoutSec` | int | `300` | Time in seconds within which the response for any query should be served by Prometheus |
| `ottoscalr.config.metricsScraper.querySplitIntervalHr` | int | `24` | The shortest period in hour for which data will be fetched from prometheus |
| `ottoscalr.config.policyRecommendationController.maxConcurrentReconciles` | int | `1` | Maximum number of concurrent Reconciles which can be run. |
| `ottoscalr.config.metricsScraper.querySplitIntervalHr` | int | `24` | The shortest period, in hours, for which data is fetched from Prometheus in a single query. If data is needed for 28 days, it is fetched this many hours at a time and the results are merged. |
| `ottoscalr.config.policyRecommendationController.maxConcurrentReconciles` | int | `1` | Maximum number of concurrent reconciles the policy recommendation controller can run. |
| `ottoscalr.config.policyRecommendationController.minRequiredReplicas` | int | `3` | The hpa.spec.minReplicas recommended by the controller will never be lower than this value. |
| `ottoscalr.config.policyRecommendationController.policyExpiryAge` | string | `3h` | The target recommendation is reached over multiple iterations through different policies. This is the time after which a policy expires and the next policy in the list can be applied. |
| `ottoscalr.config.breachMonitor.pollingIntervalSec` | int | `300` | Interval in seconds at which the breach monitor checks for breaches that occurred during that period |
@@ -87,9 +87,12 @@ their default values.
| `ottoscalr.config.hpaEnforcer.maxConcurrentReconciles` | int | `1` | Maximum number of concurrent reconciles the HPA enforcement controller can run |
| `ottoscalr.config.hpaEnforcer.excludedNamespaces` | string | `""` | Comma-separated namespaces where ottoscalr will not create HPAs. Example: "namespace1,namespace2,namespace3" |
| `ottoscalr.config.hpaEnforcer.includedNamespaces` | string | `""` | If provided, ottoscalr will create HPAs only for these namespaces. If empty, all namespaces are included except the excluded ones. Example: "namespace1,namespace2,namespace3" |
| `ottoscalr.config.hpaEnforcer.minRequiredReplicas` | int | `2` | The HPA will be created only if the min replicas recommended by the policy recommendation controller is greater than this value. |
| `ottoscalr.config.hpaEnforcer.whitelistMode` | bool | `true` | The HPA controller will act only on deployments that have the `ottoscalr.io/enable-hpa-enforcement: true` annotation and create HPAs for them (see the example Deployment after this table). If false, the HPA enforcer runs in blacklist mode and creates HPAs for every workload except those with `ottoscalr.io/disable-hpa-enforcement: true`. Running with whitelistMode set to true is recommended. |
| `ottoscalr.config.hpaEnforcer.isDryRun` | bool | `false` | If true, the HPA controller will not create any HPAs. |
| `ottoscalr.config.hpaEnforcer.enableMetricsTransformer` | bool | `true` | This metrics transformer can be used to interpolate any known period of data that should not be used for generating recommendation. |
| `ottoscalr.config.autoscalerClient.scaledObjectConfigs.enableScaledObject` | bool | `false` | Flag whether to use KEDA ScaledObjects or HPA for autoscaling. KEDA needs to be deployed on your cluster for enabling it. |
| `ottoscalr.config.enableMetricsTransformer` | bool | `true` | This metrics transformer can be used to interpolate over any known period of data that should not be used for generating recommendations. |
| `ottoscalr.config.eventCallIntegration.customEventDataConfigMapName` | string | `custom-event-data-config` | This configmap is deployed as part of the helm chart; please do not change the name. You can add any period of data to be interpolated to the configmap in the following format: `7f8b9c83: '{"eventId":"7f8b9c83","eventName":"Outlier","startTime":"2023-07-27 04:00","endTime":"2023-07-27 05:00"}'`. Keep the startTime and endTime in the `YYYY-MM-DD HH:MM` format. Multiple such events can be added the same way. To add one, edit the data in the configmap template `customeventdataconfig.yaml` in the helm templates (an example appears after the `customeventdataconfig.yaml` diff below). |
| `ottoscalr.config.autoscalerClient.scaledObjectConfigs.enableScaledObject` | bool | `false` | Flag to use KEDA ScaledObjects instead of HPAs for autoscaling. If false, the HPA client is used. KEDA must be deployed on your cluster before enabling this. |
| `ottoscalr.config.autoscalerClient.hpaConfigs.hpaAPIVersion` | string | `"v2"` | Set this if using HPA for autoscaling. By default, the `autoscaling/v2` API is used. If you wish to use the `autoscaling/v1` API for HPA, change this to `"v1"`. |
| `ottoscalr.config.enableArgoRolloutsSupport` | bool | `false` | Change this to true if your cluster has Argo Rollouts support. |
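
Taken together, the scraper and enforcer parameters above are ordinary Helm values and can be supplied through an override file at install time. The following is a minimal sketch, not a shipped default: the Prometheus hostnames, namespaces, image repository, and tag are placeholders that must be replaced with values from your own cluster.

```yaml
# values-override.yaml -- illustrative only; all hostnames and names below are placeholders
image:
  repository: registry.example.com/ottoscalr   # placeholder image repository
  tag: "v0.1.0"                                 # placeholder tag

ottoscalr:
  config:
    metricsScraper:
      # Two Prometheus instances, comma separated; metrics from both are aggregated.
      prometheusUrl: "http://p8s1.monitoring.svc:9090,http://p8s2.monitoring.svc:9090"
      queryTimeoutSec: 300
      querySplitIntervalHr: 24
    hpaEnforcer:
      whitelistMode: true                       # only annotated workloads get HPAs
      excludedNamespaces: "kube-system,monitoring"
      isDryRun: false
```

Such a file would be applied with something along the lines of `helm install ottoscalr charts/ottoscalr -f values-override.yaml` (chart path assumed from this repository layout).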

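With `whitelistMode` enabled, a workload opts in to HPA enforcement through the annotation described above. A minimal sketch of an opted-in Deployment follows; the workload name, namespace, and image are placeholders.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: checkout-service            # placeholder workload name
  namespace: default
  annotations:
    ottoscalr.io/enable-hpa-enforcement: "true"   # opt this workload in to HPA enforcement
spec:
  replicas: 3
  selector:
    matchLabels:
      app: checkout-service
  template:
    metadata:
      labels:
        app: checkout-service
    spec:
      containers:
        - name: app
          image: registry.example.com/checkout-service:1.0   # placeholder image
```
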
50 changes: 50 additions & 0 deletions OTTOSCALR_METRICS.md
@@ -0,0 +1,50 @@
## Ottoscalr metrics

| Metric name | Metric type | Description | Labels/tags |
|-----------|------|---------|-------------|
| `ottoscalr_policyreco_current_policy_max` | gauge | Current Max replica count to be applied to the HPA | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_current_policy_min` | gauge | Current Min replica count to be applied to the HPA | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_current_policy_utilization` | gauge | Current CPU utilization threshold to be applied to the HPA | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_target_policy_max` | gauge | Max replica count recommended to be applied to the HPA. | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_target_policy_min` | gauge | Min replica count recommended to be applied to the HPA. | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_target_policy_utilization` | gauge | CPU utilization threshold recommended to be applied to the HPA. | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_workload_info` | gauge | Information about the policyrecommendation | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `workload`=&lt;policyrecommendation-workload&gt; <br> `workloadKind`=&lt;workloadType(Deployment,Rollout)&gt; |
| `ottoscalr_policyreco_reconciler_conditions` | gauge | Metric for checking the status of different conditions of `.policyrecommendation.status` | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `status`=&lt;True,False&gt; <br> `type`=&lt;RecoTaskProgress,TargetRecoAchieved&gt; |
| `ottoscalr_policyreco_reconciler_task_progress_reason` | gauge | Metric for checking the reason for condition type `RecoTaskProgress`. | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `reason`=&lt;RecoTaskErrored,RecoTaskRecommendationGenerated&gt; |
| `ottoscalr_policyreco_reconciled_count` | counter | Number of times a policyrecommendation has been reconciled | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_reconciler_errored_count` | counter | Number of times a policyrecommendation's reconciliation has errored | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_policyreco_reconciler_targetreco_slo_days` | histogram | Time taken for a policy reco to achieve the target recommendation in days | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_minimum_percentage_of_datapoints_present` | gauge | Whether the minimum percentage of datapoints needed to generate a recommendation is present. | `workload`=&lt;deployment-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `reason`=&lt;RecoTaskErrored,RecoTaskRecommendationGenerated&gt; |
| `ottoscalr_p8s_query_error_count` | counter | Error counter for a query made to prometheus | `query`=&lt;query-type&gt; <br> `p8sinstance`=&lt;prometheusinstance-name&gt; |
| `ottoscalr_p8s_query_success_count` | counter | Success counter for a query made to prometheus | `query`=&lt;query-type&gt; <br> `p8sinstance`=&lt;prometheusinstance-name&gt; |
| `ottoscalr_p8s_concurrent_queries` | gauge | Number of concurrent p8s api calls for a query | `query`=&lt;query-type&gt; <br> `p8sinstance`=&lt;prometheusinstance-name&gt; |
| `ottoscalr_datapoints_fetched_by_p8s_instance` | gauge | Number of datapoints fetched for a query for a workload from a prometheus instance | `query`=&lt;query-type&gt; <br> `p8sinstance`=&lt;prometheusinstance-name&gt; <br> `workload`=&lt;deployment-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_total_datapoints_fetched` | gauge | Total Number of datapoints fetched for a query for a workload after aggregating from all the prometheus instances | `query`=&lt;query-type&gt; <br> `workload`=&lt;deployment-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_prometheus_scraper_query_latency` | histogram | Time to execute prometheus scraper query in seconds | `query`=&lt;query-type&gt; <br> `p8sinstance`=&lt;prometheusinstance-name&gt; <br> `workload`=&lt;deployment-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
| `ottoscalr_get_avg_cpu_utilization_query_latency_seconds` | histogram | Total time to execute the utilization datapoint query in seconds | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `workload`=&lt;deployment-name&gt; <br> `workloadKind`=&lt;workloadType(Deployment,Rollout)&gt; |
| `ottoscalr_get_reco_generation_latency_seconds` | histogram | Total time to generate a policyrecommendation for a workload once its execution has started | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `workload`=&lt;deployment-name&gt; <br> `workloadKind`=&lt;workloadType(Deployment,Rollout)&gt; |
| `ottoscalr_breachmonitor_breached` | gauge | Whether a particular workload has breached the CPU red line | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `workload`=&lt;deployment-name&gt; <br> `workloadKind`=&lt;workloadType(Deployment,Rollout)&gt; |
| `ottoscalr_breachmonitor_execution_rate` | gauge | Rate of breachmonitor executions for the workloads | |
| `ottoscalr_concurrent_breachmonitor_executions` | counter | Number of concurrent breachmonitor executions for the workloads | |
| `ottoscalr_breachmonitor_mitigation_latency_seconds` | histogram | Time to mitigate breach in seconds for a workload | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; <br> `workload`=&lt;deployment-name&gt; <br> `workloadKind`=&lt;workloadType(Deployment,Rollout)&gt; |
| `ottoscalr_hpaenforcer_reconciled_count` | counter | Number of times a policyrecommendation has been reconciled by HPAEnforcer | `policyreco`=&lt;policyrecommendation-name&gt; <br> `namespace`=&lt;policyrecommendation-namespace&gt; |
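
As a sketch of how these metrics might be consumed, the following Prometheus Operator `PrometheusRule` alerts on repeated reconciliation errors and on sustained CPU red-line breaches. The rule names, thresholds, and durations are illustrative assumptions, not part of ottoscalr; only the metric names come from the table above.

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: ottoscalr-alerts            # illustrative name
  namespace: monitoring             # assumes the Prometheus Operator watches this namespace
spec:
  groups:
    - name: ottoscalr.rules
      rules:
        - alert: OttoscalrPolicyRecoErrors
          # Fires when a policyrecommendation keeps erroring over a 15 minute window.
          expr: increase(ottoscalr_policyreco_reconciler_errored_count[15m]) > 0
          for: 15m
          labels:
            severity: warning
        - alert: OttoscalrWorkloadBreached
          # ottoscalr_breachmonitor_breached reports whether a workload is over the CPU red line.
          expr: ottoscalr_breachmonitor_breached == 1
          for: 10m
          labels:
            severity: warning
```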


### Ottoscalr Controller Metrics

All the kubebuilder controller metrics listed at https://book.kubebuilder.io/reference/metrics-reference are available for ottoscalr controllers.
All of these metrics are prefixed with `ottoscalr_`.

Following are the names of the controllers:
<b>PolicyRecommendationRegistrar</b>,
<b>RecoWorkflowController</b>,
<b>HPAEnforcementController</b>,
<b>PolicyWatcher</b>,
<b>DeploymentTriggerController</b>
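
As an illustration of the prefixing, a controller-runtime counter such as `controller_runtime_reconcile_errors_total` would be exposed as `ottoscalr_controller_runtime_reconcile_errors_total`, with the controller name as a label. Below is a sketch of a recording rule built on that assumption; verify the exact exposed metric name against your own deployment.

```yaml
groups:
  - name: ottoscalr-controller.rules
    rules:
      # Illustrative recording rule; the metric name assumes the ottoscalr_ prefix described above.
      - record: ottoscalr:hpa_enforcement_reconcile_error_rate_5m
        expr: rate(ottoscalr_controller_runtime_reconcile_errors_total{controller="HPAEnforcementController"}[5m])
```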

8 changes: 4 additions & 4 deletions charts/ottoscalr/envconfigs/ottoscalr_config.yaml
@@ -6,19 +6,19 @@ enableLeaderElection: false
leaderElectionID: "85d48caf.fcp.ottoscalr.io"
metricsScraper:
  prometheusUrl: {{ .Values.ottoscalr.config.metricsScraper.prometheusUrl }}
  queryTimeoutSec: {{ .Values.ottoscalr.config.metricsScraper.queryTimeoutSec | default "30" }}
  queryTimeoutSec: {{ .Values.ottoscalr.config.metricsScraper.queryTimeoutSec | default "300" }}
  querySplitIntervalHr: {{ .Values.ottoscalr.config.metricsScraper.querySplitIntervalHr | default "24" }}
breachMonitor:
  pollingIntervalSec: {{ .Values.ottoscalr.config.breachMonitor.pollingIntervalSec | default "300" }}
  cpuRedLine: {{ .Values.ottoscalr.config.breachMonitor.cpuRedLine | default "0.75" }}
  stepSec: 30
  concurrentExecutions: {{ .Values.ottoscalr.config.breachMonitor.concurrentExecutions | default "50" }}
periodicTrigger:
  pollingIntervalMin: {{ .Values.ottoscalr.config.periodicTrigger.pollingIntervalMin | default "1440" }}
  pollingIntervalMin: {{ .Values.ottoscalr.config.periodicTrigger.pollingIntervalMin | default "180" }}
policyRecommendationController:
  maxConcurrentReconciles: {{ .Values.ottoscalr.config.policyRecommendationController.maxConcurrentReconciles | default "1" }}
  minRequiredReplicas: {{ .Values.ottoscalr.config.policyRecommendationController.minRequiredReplicas | default "3" }}
  policyExpiryAge: {{ .Values.ottoscalr.config.policyRecommendationController.policyExpiryAge | default "48h" }}
  policyExpiryAge: {{ .Values.ottoscalr.config.policyRecommendationController.policyExpiryAge | default "3h" }}
policyRecommendationRegistrar:
  requeueDelayMs: 500
  excludedNamespaces: {{ .Values.ottoscalr.config.policyRecommendationRegistrar.excludedNamespaces | default "kube-system,monitoring,gatekeeper-system,webhook" }}
Expand All @@ -28,7 +28,7 @@ cpuUtilizationBasedRecommender:
  stepSec: 30
  minTarget: {{ .Values.ottoscalr.config.cpuUtilizationBasedRecommender.minTarget | default "5" }}
  maxTarget: {{ .Values.ottoscalr.config.cpuUtilizationBasedRecommender.maxTarget | default "60" }}
  metricsPercentageThreshold: {{ .Values.ottoscalr.config.cpuUtilizationBasedRecommender.metricsPercentageThreshold | default "0" }}
  metricsPercentageThreshold: {{ .Values.ottoscalr.config.cpuUtilizationBasedRecommender.metricsPercentageThreshold | default "25" }}
  metricIngestionTime: 15.0
  metricProbeTime: 15.0
enableMetricsTransformer: {{ .Values.ottoscalr.config.enableMetricsTransformer | default false }}
4 changes: 1 addition & 3 deletions charts/ottoscalr/templates/customeventdataconfig.yaml
@@ -10,7 +10,5 @@ metadata:
{{- include "ottoscalr.labels" . | nindent 4 }}
namespace: {{ .Release.Namespace }}
data:
{{- if .Values.addCustomEventData }}
7f8b9c83: '{"eventId":"7f8b9c83","eventName":"Outlier","startTime":"2023-07-27 04:00","endTime":"2023-07-27 05:00"}'
7f8b9c84: '{"eventId":"7f8b9c84","eventName":"Outlier","startTime":"2023-08-02 21:00","endTime":"2023-08-02 23:59"}'
{{- end }}
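
For reference, adding another interpolation window to this configmap just means appending one more key under `data:`, keeping the same JSON shape and `YYYY-MM-DD HH:MM` timestamps. A sketch with a made-up event id and time window:

```yaml
data:
  7f8b9c83: '{"eventId":"7f8b9c83","eventName":"Outlier","startTime":"2023-07-27 04:00","endTime":"2023-07-27 05:00"}'
  # Hypothetical additional event -- id and time window are only illustrative.
  9a1c2d3e: '{"eventId":"9a1c2d3e","eventName":"Outlier","startTime":"2023-09-15 10:00","endTime":"2023-09-15 12:30"}'
```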

17 changes: 8 additions & 9 deletions charts/ottoscalr/values.yaml
@@ -5,10 +5,10 @@
replicaCount: 1

image:
  repository: jfrog.fkinternal.com/alm/ottoscalr-oss
  repository: ""
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: "v.10"
  tag: ""

imagePullSecrets: []
nameOverride: ""
@@ -56,11 +56,11 @@ ingress:

resources:
  limits:
    cpu: 100m
    memory: 500Mi
    cpu: 2
    memory: 4Gi
  requests:
    cpu: 100m
    memory: 500Mi
    cpu: 2
    memory: 4Gi

autoscaling:
  enabled: false
@@ -84,8 +84,7 @@ ottoscalrConfigPath: "/etc/ottoscalr/config/ottoscalr_config.yaml"
ottoscalr:
  config:
    metricsScraper:
      prometheusUrl: http://prometheus-k8s-0.prometheus-operated.monitoring.svc.cluster.local:9090,http://prometheus-k8s-1.prometheus-operated.monitoring.svc.cluster.local:9090,http://prometheus-k8s-2.prometheus-operated.monitoring.svc.cluster.local:9090
      queryTimeoutSec: 30
      prometheusUrl:
      querySplitIntervalHr: 24
    policyRecommendationController:
      maxConcurrentReconciles: 1
@@ -124,7 +123,7 @@ ottoscalr:
        enableScaledObject: false
        enableEventAutoscaler: false
      hpaConfigs:
        hpaAPIVersion: v1
        hpaAPIVersion: v2
    enableArgoRolloutsSupport: false


2 changes: 1 addition & 1 deletion pkg/metrics/queries_test.go
@@ -24,7 +24,7 @@ var _ = Describe("Queries", func() {
"label1": "value1",
"label2": "value2",
}
Expect(qc.Render(labels)).To(Equal("test_metric{label1=\"value1\",label2=\"value2\"}"))
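// Go map iteration order is not deterministic, so the rendered query may list
// the labels in either order; the matcher below accepts both serializations.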
Expect(qc.Render(labels)).To(Or(Equal("test_metric{label1=\"value1\",label2=\"value2\"}"), Equal("test_metric{label2=\"value2\",label1=\"value1\"}")))
})
})

