diff --git a/cmd/descheduler/app/descheduler.go b/cmd/descheduler/app/descheduler.go
index d4525a45a551..ed2940942135 100644
--- a/cmd/descheduler/app/descheduler.go
+++ b/cmd/descheduler/app/descheduler.go
@@ -43,6 +43,22 @@ const (
 	// References:
 	// - https://en.wikipedia.org/wiki/Slowloris_(computer_security)
 	ReadHeaderTimeout = 32 * time.Second
+	// WriteTimeout is the amount of time allowed to write the
+	// response data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	WriteTimeout = 5 * time.Minute
+	// ReadTimeout is the amount of time allowed to read
+	// request data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	ReadTimeout = 5 * time.Minute
 )
 
 // NewDeschedulerCommand creates a *cobra.Command object with default parameters
@@ -174,6 +190,8 @@ func serveHealthzAndMetrics(address string) {
 		Addr:              address,
 		Handler:           mux,
 		ReadHeaderTimeout: ReadHeaderTimeout,
+		WriteTimeout:      WriteTimeout,
+		ReadTimeout:       ReadTimeout,
 	}
 	if err := httpServer.ListenAndServe(); err != nil {
 		klog.Errorf("Failed to serve healthz and metrics: %v", err)
diff --git a/cmd/scheduler-estimator/app/scheduler-estimator.go b/cmd/scheduler-estimator/app/scheduler-estimator.go
index 5e91a705846c..20bbefad6059 100644
--- a/cmd/scheduler-estimator/app/scheduler-estimator.go
+++ b/cmd/scheduler-estimator/app/scheduler-estimator.go
@@ -40,6 +40,22 @@ const (
 	// References:
 	// - https://en.wikipedia.org/wiki/Slowloris_(computer_security)
 	ReadHeaderTimeout = 32 * time.Second
+	// WriteTimeout is the amount of time allowed to write the
+	// response data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	WriteTimeout = 5 * time.Minute
+	// ReadTimeout is the amount of time allowed to read
+	// request data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	ReadTimeout = 5 * time.Minute
 )
 
 // NewSchedulerEstimatorCommand creates a *cobra.Command object with default parameters
@@ -121,6 +137,8 @@ func serveHealthzAndMetrics(address string) {
 		Addr:              address,
 		Handler:           mux,
 		ReadHeaderTimeout: ReadHeaderTimeout,
+		WriteTimeout:      WriteTimeout,
+		ReadTimeout:       ReadTimeout,
 	}
 	if err := httpServer.ListenAndServe(); err != nil {
 		klog.Errorf("Failed to serve healthz and metrics: %v", err)
diff --git a/cmd/scheduler/app/scheduler.go b/cmd/scheduler/app/scheduler.go
index ed1c8b58dd14..c779ee542906 100644
--- a/cmd/scheduler/app/scheduler.go
+++ b/cmd/scheduler/app/scheduler.go
@@ -45,6 +45,22 @@ const (
 	// References:
 	// - https://en.wikipedia.org/wiki/Slowloris_(computer_security)
 	ReadHeaderTimeout = 32 * time.Second
+	// WriteTimeout is the amount of time allowed to write the
+	// response data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	WriteTimeout = 5 * time.Minute
+	// ReadTimeout is the amount of time allowed to read
+	// request data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	ReadTimeout = 5 * time.Minute
 )
 
 // Option configures a framework.Registry.
@@ -207,6 +223,8 @@ func serveHealthzAndMetrics(address string) {
 		Addr:              address,
 		Handler:           mux,
 		ReadHeaderTimeout: ReadHeaderTimeout,
+		WriteTimeout:      WriteTimeout,
+		ReadTimeout:       ReadTimeout,
 	}
 	if err := httpServer.ListenAndServe(); err != nil {
 		klog.Errorf("Failed to serve healthz and metrics: %v", err)
diff --git a/docs/proposals/mcs/additional-dns-server.jpg b/docs/proposals/mcs/additional-dns-server.jpg
new file mode 100644
index 000000000000..c848b52f0bec
Binary files /dev/null and b/docs/proposals/mcs/additional-dns-server.jpg differ
diff --git a/docs/proposals/mcs/remove-derived-in-controller.jpg b/docs/proposals/mcs/remove-derived-in-controller.jpg
new file mode 100644
index 000000000000..a8cd83170c73
Binary files /dev/null and b/docs/proposals/mcs/remove-derived-in-controller.jpg differ
diff --git a/docs/proposals/mcs/remove-derived-mcs.md b/docs/proposals/mcs/remove-derived-mcs.md
new file mode 100644
index 000000000000..4d988c41e2e7
--- /dev/null
+++ b/docs/proposals/mcs/remove-derived-mcs.md
@@ -0,0 +1,355 @@
+---
+title: Remove "derived-" prefix of MCS
+
+authors:
+- "@zishen"
+
+reviewers:
+- "@kevin-wangzefeng"
+- "@RainbowMango"
+- "@XiShanYongYe-Chang"
+- "@Garrybest"
+- "@GitHubxsy"
+
+creation-date: 2023-06-30
+---
+
+
+# Remove "derived-" prefix of MCS
+
+## Summary
+
+As mentioned in [issue-2384](https://github.com/karmada-io/karmada/issues/2384), Karmada currently uses ServiceExport, ServiceImport and other CRs to build the mcs-controller and manage multi-cluster services. However, in a cluster where the service is not located, we have to use a different IP, or a service name with a "derived-" prefix, to access it. We hope to reduce this difference so that every cluster can use the same domain name to access the service.
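+
+To make the current difference concrete, here is a minimal illustrative Go snippet (not code from this repository; the service name `foo` is only an example) of the naming rule that clients have to deal with today:
+
+```go
+package main
+
+import "fmt"
+
+// derivedName mirrors the "derived-" naming rule described above; it is an
+// illustrative helper, not the actual mcs-controller implementation.
+func derivedName(service string) string {
+	return "derived-" + service
+}
+
+func main() {
+	// In member1 clients can use "foo" directly; in member2 they currently
+	// have to use the shadow name printed below.
+	fmt.Println(derivedName("foo")) // Output: derived-foo
+}
+```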
+
+## Motivation
+
+Currently, if you need to access a service across Karmada member clusters, you have to use a different IP or a domain name with a "derived-" prefix. This increases complexity for users. We hope that users can use the same domain name for cross-cluster access.
+
+### Goals
+
+* Keep the original access method, so that customers do not need to modify their systems after upgrading.
+* Add a unified domain name for cross-cluster access. The domain name is the one used to access the service in the cluster where it resides.
+
+### Non-Goals
+
+none
+
+## Proposal
+
+### User Stories
+
+According to the current Karmada [mcs configuration](https://karmada.io/docs/userguide/service/multi-cluster-service), if a service resides in member1, a user in member1 accesses it with the domain name "serve.default.svc.cluster.local", while in member2 the domain name becomes "derived-serve.default.svc.cluster.local". As a result, customers have to add the "derived-" prefix to the service's domain name, which is an extra inconvenience. So I hope to be able to:
+
+1. Provide a unified access method across clusters.
+
+2. Keep the original access method unchanged, so that existing workloads continue to work normally.
+
+### Notes/Constraints/Caveats (Optional)
+
+1. The `coreDNS` version must be v1.9.3 or above, and the matching Kubernetes version must be v1.21.0 or above; otherwise the multicluster feature is not supported.
+
+### Risks and Mitigations
+
+
+## Design Details
+
+### API change
+
+none
+
+### Components change
+
+Here is the implementation plan: `ServiceImport` + Karmada.
+
+#### 1) Use ServiceImport to generate rr records
+
+This solution is mainly realized by combining the `multicluster` plug-in of `coreDNS`. It is divided into two parts: the `coreDNS` change and the `karmada` change. For `coreDNS`, there are two implementation plans.
+
+##### (1) coreDNS+multicluster
+
+This solution is mainly realized by the `multicluster` plug-in of `coreDNS`.
+
+###### **Principle Description**
+
+In order to support MCS, `coreDNS` introduced the [`multicluster`](https://github.com/coredns/multicluster) plug-in. Once the plug-in is configured and deployed, it parses `ServiceImport` objects, reads their `name`, `namespace` and `ips` fields, and generates rr records accordingly. Client pods can then access services across clusters.
+
+The workflow of the `multicluster` plugin is as follows:
+
+![diagrammatic drawing of remove mcs](./remove-derived-mcs.png)
+
+The diagram is explained below:
+
+* Assume a service named `foo` in `member1`. Within `member1` we use the full domain name `foo.default.svc.cluster.local` to access this service, but we cannot use the same domain name in `member2`.
+
+* `Karmada` exports the service through `ServiceExport` and imports it into `member2` through `ServiceImport`. At this point, the shadow service `derived-foo` appears in `member2`. Users in `member2` can access the `foo` service in `member1` by using `derived-foo.default.svc.cluster.local`.
+
+* Once a `coreDNS` built with `multicluster` finds that the `ServiceImport` has been created, it analyzes the `name`, `namespace`, and `ips` fields of the `ServiceImport` and generates the rr records. In this example, the `ips` in the `ServiceImport` can be the `clusterIP` of `derived-foo`.
+
+* When the client accesses the service across clusters (the suffix may differ from the one used for local access), `coreDNS` resolves it according to the corresponding rr record.
+
+###### Compilation-instructions
+
+Follow the documentation of the [multicluster plugin](https://github.com/coredns/multicluster). Download `coreDNS`; the version matching Kubernetes 1.26 is v1.9.3 (the latest v1.10.1 also works):
+
+```shell
+git clone https://github.com/coredns/coredns
+cd coredns
+git checkout v1.9.3
+```
+
+* Add multicluster plugin
+
+Open the `plugin.cfg` file and add `multicluster` in it as follows:
+
+```shell
+...
+kubernetes:kubernetes
+multicluster:github.com/coredns/multicluster
+...
+```
+
+* Execute compilation
+
+```shell
+make
+```
+
+After the compilation succeeds, the corresponding binary is generated in the root directory.
+
+For image building, installation, and upgrade, please refer to the [coreDNS GitHub repository](https://github.com/coredns/coredns).
+
+###### Configuration-instructions
+
+The suffix `clusterset.local` is used as an example in the following.
+
+* Configure coredns permissions in member2
+
+```shell
+kubectl edit clusterrole system:coredns
+```
+
+Add the following to the end:
+
+```yaml
+...
+- apiGroups:
+  - multicluster.x-k8s.io
+  resources:
+  - serviceimports
+  verbs:
+  - list
+  - watch
+```
+
+* Configure multicluster zone rules
+
+Add `multicluster` processing rules to the Corefile in the `coredns` ConfigMap:
+
+```shell
+kubectl edit configmap coredns -n kube-system
+```
+
+```yaml
+....
+  Corefile: |
+    .:53 {
+        errors
+        health {
+           lameduck 5s
+        }
+        ready
+        multicluster clusterset.local # Add this line.
+        kubernetes cluster.local in-addr.arpa ip6.arpa {
+           pods insecure
+           fallthrough in-addr.arpa ip6.arpa
+           ttl 30
+        }
+        prometheus :9153
+        forward . /etc/resolv.conf {
+           max_concurrent 1000
+        }
+        cache 30
+        loop
+        reload
+        loadbalance
+    }
+...
+```
+
+* Configure dns search item "clusterset.local"
+
+**Configure node /etc/resolv.conf file**
+
+Edit the `/etc/resolv.conf` file and add a "clusterset.local" entry so that local services take priority when the domain name `foo.default.svc` is used. Note that the remote service is only accessed when the local service does not exist, not when it is unavailable.
+
+```shell
+root@btg:/# cat /etc/resolv.conf
+search default.svc.cluster.local svc.cluster.local cluster.local clusterset.local
+...
+```
+
+This method requires configuring the `/etc/resolv.conf` file on the client node. If you don't want to change the node's `resolv.conf` file, you can also use the following method.
+
+**Configure pod's dnsConfig in yaml**
+
+That is, add the `searches` item under `dnsConfig` in the pod YAML and set the value to `clusterset.local`, as shown below:
+
+```yaml
+...
+  dnsPolicy: ClusterFirst
+  dnsConfig:
+    searches:
+    - clusterset.local
+...
+```
+
+###### Access method
+
+Since the DNS search function is used, the `foo` service can be accessed in two ways:
+
+1. Access via `foo.default.svc`. This method gives priority to local access; if no local service with the same name exists, it falls back to the remote cluster's service.
+
+2. Access via the full domain name `foo.default.svc.clusterset.local`. The request goes directly to the remote cluster for processing.
+
+###### Advantages and disadvantages
+
+Shortcomings:
+
+1. Access via `foo.default.svc` cannot achieve load balancing. The remote service is only accessed when no local service with the same name exists. Therefore, if you need to load-balance between local and remote access, you need to combine this with other solutions.
+
+2. To use `foo.default.svc`, you need to modify the DNS configuration. With the `resolv.conf` method, you need to modify the `resolv.conf` file on every node a client may run on. With the dnsConfig method, you need to modify the YAML (or API request) of every delivered pod.
+
+3. The `multicluster` plug-in cannot be loaded dynamically. Customers need to recompile `coredns` and modify the `coredns` configuration of member clusters.
+
+4. The original access method changes. This solution uses different suffixes to distinguish single-cluster and multi-cluster Services, so the access domain name `foo.default.svc.cluster.local` becomes `foo.default.svc.clusterset.local`. Customers need to modify their code to adapt.
+
+Advantages:
+
+1. The impact is controllable. No new components are added. Adding `multicluster` to `coredns` follows the official `coredns` proposal, and the code is open.
+
+2. The amount of modification is small. It can be done with the existing `multicluster` code and scheme.
+
+##### (2) Additional dns server
+
+This is similar to submariner's [SERVICE DISCOVERY](https://submariner.io/getting-started/architecture/service-discovery/). It requires developing an extended DNS resolver and installing a corresponding plug-in in `coreDNS` that forwards domain names with the specified suffix (such as `clusterset.local`) to this additional resolver, which then resolves them.
+
+The workflow of the additional DNS server is as follows:
+
+![additional dns server](./additional-dns-server.jpg)
+
+The advantages and disadvantages of this method are as follows:
+
+Advantages:
+
+1. Load balancing can be achieved. Since the extended DNS resolver is custom-built, customers can implement load balancing in it.
+
+2. The original access method can be kept. Rules for local and remote services can be decided in the custom plug-in, so the original access method, in this case `foo.default.svc.cluster.local`, is preserved.
+
+3. No extra DNS configuration is required. Because there is a custom DNS resolver, the DNS search configuration required in solution (1) is unnecessary.
+
+Shortcomings:
+
+1. The `coreDNS` component needs to be modified: `coreDNS` has to be recompiled to install the corresponding plug-in, and its configuration has to be updated.
+
+2. New components need to be deployed in member clusters. This solution requires installing a DNS extension resolver.
+
+### 2) Karmada adaptation
+
+On the `karmada` side, what needs to be done is to fill in the `ips` field of the `ServiceImport` that is imported into the member cluster, setting it to the `clusterIP` of `derived-foo`. The flow diagram is as follows:
+
+![remove-derived-in-controller](./remove-derived-in-controller.jpg)
+
+As shown in the figure above, still taking the `foo` service as an example, after `coreDNS` is configured we add logic to the control plane's ServiceImport controller to obtain the `derived-foo` service from the member cluster:
+
+1. Wait for the member cluster to assign the `clusterIP` of the `derived-foo` service.
+
+2. The control-plane controller obtains that value and fills it into the Work corresponding to the `ServiceImport` on the control plane.
+
+3. Following `Karmada`'s synchronization mechanism, the `ServiceImport` object in the member cluster is then filled with the `clusterIP` value of `derived-foo`.
+
+That is, the `ServiceImport` after completion needs to look like this:
+
+```yaml
+apiVersion: multicluster.x-k8s.io/v1alpha1
+kind: ServiceImport
+metadata:
+  name: foo
+  namespace: default
+spec:
+  type: ClusterSetIP
+  ports:
+  - name: "http"
+    port: 80
+    protocol: TCP
+  ips:
+  - "10.10.0.5" # clusterIP for cross-cluster access
+```
+
+### 3) Suggested operation
+
+The recommended configuration steps for the `coreDNS`+`multicluster` plug-in solution are given below.
+
+#### Environment preparation:
+
+A working Karmada environment that can already perform the existing MCS operations.
+
+The following uses cluster member2, where the client resides, as an example of accessing the `foo` service with the suffix `clusterset.local`. This solution needs to be configured as follows:
+
+#### coredns compile
+
+Refer to subsection [`Compilation-instructions`](#compilation-instructions).
+
+#### coredns configuration
+
+Refer to subsection [`Configuration-instructions`](#configuration-instructions).
+
+#### Client pod configuration
+
+Taking YAML as an example, add the `searches` item under `dnsConfig` and set the value to `clusterset.local`. A simple, complete example is as follows:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: dns-test
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: dns-test
+  template:
+    metadata:
+      labels:
+        app: dns-test
+    spec:
+      containers:
+      - name: dns-test
+        image: ubuntu:18.04
+        command:
+        - "/bin/bash"
+        - "-c"
+        args: [ "while true; do sleep 3; cat /etc/resolv.conf; done" ]
+      dnsPolicy: ClusterFirst
+      dnsConfig:
+        searches:
+        - clusterset.local
+        options:
+        - name: timeout
+          value: "2"
+```
+
+### Test Plan
+
+- All current tests should pass; no breaking change is introduced by this feature.
+
+## Alternatives
+
+Here are two other approaches I know of:
+
+First, use the same service name on the other clusters instead of a service with the `derived-` prefix. For more details, see **[bivas](https://github.com/bivas)**'s PR [proposal for native service discovery](https://github.com/karmada-io/karmada/pull/3694).
+
+Second, install and use [submariner](https://submariner.io/).
\ No newline at end of file
diff --git a/docs/proposals/mcs/remove-derived-mcs.png b/docs/proposals/mcs/remove-derived-mcs.png
new file mode 100644
index 000000000000..5e52a18cf62d
Binary files /dev/null and b/docs/proposals/mcs/remove-derived-mcs.png differ
diff --git a/docs/proposals/service-discovery/README.md b/docs/proposals/service-discovery/README.md
new file mode 100644
index 000000000000..e51436c732c3
--- /dev/null
+++ b/docs/proposals/service-discovery/README.md
@@ -0,0 +1,194 @@
+---
+title: Service discovery with native Kubernetes naming and resolution
+authors:
+- "@bivas"
+reviewers:
+- "@XiShanYongYe-Chang"
+- TBD
+approvers:
+- "@robot"
+- TBD
+
+creation-date: 2023-06-22
+
+---
+
+# Service discovery with native Kubernetes naming and resolution
+
+## Summary
+
+With the current `ServiceImportController`, when a `ServiceImport` object is reconciled, the derived service is created with a `derived-` prefix in its name.
+ +## Motivation + + +Having a `derived-` prefix for `Service` resources seems counterintuitive when thinking about service discovery: +- Assuming the pod is exported as the service `foo` +- Another pod that wishes to access it on the same cluster will simply call `foo` and Kubernetes will bind to the correct one +- If that pod is scheduled to another cluster, the original service discovery will fail as there's no service by the name `foo` +- To find the original pod, the other pod is required to know it is in another cluster and use `derived-foo` to work properly + +### Goals + +- Remove the "derived-" prefix from the service +- User-friendly and native service discovery + +### Non-Goals + +- Multi cluster connectivity + +## Proposal + +Following are flows to support the service import proposal: + +1. `Deployment` and `Service` are created on cluster member1 and the `Service` imported to cluster member2 using `ServiceImport` (described below as [user story 1](#story-1)) +2. `Deployment` and `Service` are created on cluster member1 and both propagated to cluster member2. `Service` from cluster member1 is imported to cluster member2 using `ServiceImport` (described below as [user story 2](#story-2)) + +The proposal for this flow is what can be referred to as local-and-remote service discovery. The options could be: + +1. **Local** only - In case there's a local service by the name `foo` Karmada never attempts to import the remote service and doesn't create an `EndPointSlice` +2. **Local** and **Remote** - Users accessing the `foo` service will reach either member1 or member2 +3. **Remote** only - in case there's a local service by the name `foo` Karmada will remove the local `EndPointSlice` and will create an `EndPointSlice` pointing to the other cluster (e.g. instead of resolving member2 cluster is will reach member1) + +This proposal suggests to distinguish the services type by the extending the API `multicluster.x-k8s.io/v1alpha1` of `ServiceImport` with Karmada specific annotations: +```yaml +apiVersion: multicluster.x-k8s.io/v1alpha1 +kind: ServiceImport +metadata: + name: serve + namespace: demo + annotations: + discovery.karmada.io/strategy: LocalAndRemote +spec: + type: ClusterSetIP + ports: + - port: 80 + protocol: TCP +``` + +The default could be `LocalAndRemote`. + + +### User Stories (Optional) + + + +#### Story 1 + +As a Kubernetes cluster member, +I want to access a service from another cluster member, +So that I can communicate with the service using its original name. + +**Background**: The Service named `foo` is created on cluster member1 and imported to cluster member2 using `ServiceImport`. + +**Scenario**: + +1. Given that the `Service` named `foo` exists on cluster member1 +2. And the `ServiceImport` resource is created on cluster member2, specifying the import of `foo` +3. When I try to access the service inside member2 +4. Then I can access the service using the name `foo.myspace.svc.cluster.local` + +#### Story 2 + +As a Kubernetes cluster member, +I want to handle conflicts when importing a service from another cluster member, +So that I can access the service without collisions and maintain high availability. + +**Background**: The Service named `foo` is created on cluster member1 and has a conflict when attempting to import to cluster member2. +Conflict refers to the situation where there is already a `Service` `foo` existing on the cluster (e.g. 
propagated with `PropagationPolicy`), but we still need to import `Service` `foo` from other clusters onto this cluster (using `ServiceImport`) + +**Scenario**: + +1. Given that the `Service` named `foo` exists on cluster member1 +2. And there is already a conflicting `Service` named foo on cluster member2 +3. When I attempt to access the service in cluster member2 using `foo.myspace.svc.cluster.local` +4. Then the requests round-robin between the local `foo` service and the imported `foo` service (member1 and member2) + +### Notes/Constraints/Caveats (Optional) + + + +### Risks and Mitigations + + +Adding a `Service` that resolve to a remote cluster will add a network latency of communication between clusters. + +## Design Details + + + +To achieve service discovery with native Kubernetes naming and resolution, the following design details should be considered: + +1. Service Import Controller Changes: The Service Import Controller needs to be updated to remove the `"derived-"` prefix from the derived service. This ensures that the imported service retains its original name when accessed within the importing cluster. +2. Local-Only Service Discovery: To support local-only service discovery, Karmada should be enhanced to check if there is a local service with the same name as the imported service. If a local service exists, the remote service import should be skipped, and no `EndPointSlice` should be created for the remote service. This allows local services to be accessed directly without going through the remote service. +3. Local and Remote Service Discovery: When a service is imported from another cluster, both local and remote services should be accessible. Users accessing the service should be able to reach either the local or remote member based on their location and cluster routing. This enables high availability and load balancing between the local and remote instances of the service. +4. Remote-Only Service Discovery: In cases where there is local service with the same name as the imported service, Karmada should remove the local `EndPointSlice` and create an `EndPointSlice` pointing to the remote cluster. This allows users to access the service through the imported `EndPointSlice`, effectively reaching the remote cluster. + +Implementing the service registration flow, the following should be considered: +1. Given that the `Service` named `foo` exists on cluster member1 +2. The `ServiceImport` resource is created on cluster member2, specifying the import of `foo` +3. A `Service` resource is created in cluster member2, but does not prefix the name with `derived-` + 1. If there is already an existing `Service` named `foo` on cluster member2 + 2. The `EndPointSlice` associated with the local `foo` service points to the local `Deployment` running on member2 + 3. The controller will ignore the conflict of existing `Service` and will not create a new `Service` object +4. And the imported `EndPointSlice` is created, pointing to the remote `Deployment` running on member1 +5. And the `EndPointSlice` associated with the imported service is prefixed with `import-` +6. Then the requests round-robin between the local `foo` service and the imported `foo` service (member1 and member2) + +### Test Plan + + + +## Alternatives + + + +One alternative approach to service discovery with native Kubernetes naming and resolution is to rely on external DNS-based service discovery mechanisms. 
However, this approach may require additional configuration and management overhead, as well as potential inconsistencies between different DNS implementations. By leveraging the native Kubernetes naming and resolution capabilities, the proposed solution simplifies service discovery and provides a consistent user experience. + +Another alternative approach could be to enforce a strict naming convention for imported services, where a specific prefix or suffix is added to the service name to differentiate it from local services. However, this approach may introduce complexity for users and require manual handling of naming collisions. The proposed solution aims to provide a more user-friendly experience by removing the "derived-" prefix and allowing services to be accessed using their original names. + + \ No newline at end of file diff --git a/operator/pkg/karmadaresource/webhookconfiguration/webhookconfiguration.go b/operator/pkg/karmadaresource/webhookconfiguration/webhookconfiguration.go index 8a0f797ac985..6345819e9dee 100644 --- a/operator/pkg/karmadaresource/webhookconfiguration/webhookconfiguration.go +++ b/operator/pkg/karmadaresource/webhookconfiguration/webhookconfiguration.go @@ -12,8 +12,8 @@ import ( "github.com/karmada-io/karmada/operator/pkg/util/apiclient" ) -// EnsureWebhookconfiguration creates karmada webhook mutatingWebhookConfiguration and validatingWebhookConfiguration -func EnsureWebhookconfiguration(client clientset.Interface, namespace, name, caBundle string) error { +// EnsureWebhookConfiguration creates karmada webhook mutatingWebhookConfiguration and validatingWebhookConfiguration +func EnsureWebhookConfiguration(client clientset.Interface, namespace, name, caBundle string) error { if err := mutatingWebhookConfiguration(client, namespace, name, caBundle); err != nil { return err } diff --git a/operator/pkg/tasks/init/apiserver.go b/operator/pkg/tasks/init/apiserver.go index 69dbebfa1783..05d77fb9e20b 100644 --- a/operator/pkg/tasks/init/apiserver.go +++ b/operator/pkg/tasks/init/apiserver.go @@ -3,7 +3,6 @@ package tasks import ( "errors" "fmt" - "time" "k8s.io/klog/v2" @@ -104,7 +103,7 @@ func runWaitKarmadaAPIServer(r workflow.RunData) error { return errors.New("wait-KarmadaAPIServer task invoked with an invalid data struct") } - waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), time.Second*30) + waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), componentBeReadyTimeout) err := waiter.WaitForSomePods(karmadaApiserverLabels.String(), data.GetNamespace(), 1) if err != nil { @@ -147,11 +146,11 @@ func runWaitKarmadaAggregatedAPIServer(r workflow.RunData) error { return errors.New("wait-KarmadaAggregatedAPIServer task invoked with an invalid data struct") } - waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), time.Second*30) + waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), componentBeReadyTimeout) err := waiter.WaitForSomePods(karmadaAggregatedAPIServerLabels.String(), data.GetNamespace(), 1) if err != nil { - return fmt.Errorf("waiting for karmada-apiserver to ready timeout, err: %w", err) + return fmt.Errorf("waiting for karmada-aggregated-apiserver to ready timeout, err: %w", err) } klog.V(2).InfoS("[wait-KarmadaAggregatedAPIServer] the karmada-aggregated-apiserver is ready", "karmada", klog.KObj(data)) diff --git a/operator/pkg/tasks/init/etcd.go b/operator/pkg/tasks/init/etcd.go index 39346decf35c..59ce284ec3af 100644 --- 
a/operator/pkg/tasks/init/etcd.go +++ b/operator/pkg/tasks/init/etcd.go @@ -3,7 +3,6 @@ package tasks import ( "errors" "fmt" - "time" "k8s.io/klog/v2" @@ -72,12 +71,12 @@ func runWaitEtcd(r workflow.RunData) error { return errors.New("wait-etcd task invoked with an invalid data struct") } - waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), time.Second*30) + waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), componentBeReadyTimeout) // wait etcd, karmada apiserver and aggregated apiserver to ready // as long as a replica of pod is ready, we consider the service available. if err := waiter.WaitForSomePods(etcdLabels.String(), data.GetNamespace(), 1); err != nil { - return fmt.Errorf("waiting for etcd to ready timeout, err: %w", err) + return fmt.Errorf("waiting for karmada-etcd to ready timeout, err: %w", err) } klog.V(2).InfoS("[wait-etcd] the etcd pods is ready", "karmada", klog.KObj(data)) diff --git a/operator/pkg/tasks/init/karmadaresource.go b/operator/pkg/tasks/init/karmadaresource.go index 082e71ad9ed4..b11b6d858767 100644 --- a/operator/pkg/tasks/init/karmadaresource.go +++ b/operator/pkg/tasks/init/karmadaresource.go @@ -8,7 +8,6 @@ import ( "path" "regexp" "strings" - "time" corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" @@ -176,7 +175,7 @@ func runWebhookConfiguration(r workflow.RunData) error { } caBase64 := base64.StdEncoding.EncodeToString(cert.CertData()) - return webhookconfiguration.EnsureWebhookconfiguration( + return webhookconfiguration.EnsureWebhookConfiguration( data.KarmadaClient(), data.GetNamespace(), data.GetName(), @@ -200,7 +199,7 @@ func runAPIService(r workflow.RunData) error { return fmt.Errorf("failed to apply aggregated APIService resource to karmada controlplane, err: %w", err) } - waiter := apiclient.NewKarmadaWaiter(config, nil, time.Second*20) + waiter := apiclient.NewKarmadaWaiter(config, nil, componentBeReadyTimeout) if err := apiclient.TryRunCommand(waiter.WaitForAPIService, 3); err != nil { return fmt.Errorf("the APIService is unhealthy, err: %w", err) } diff --git a/operator/pkg/tasks/init/wait.go b/operator/pkg/tasks/init/wait.go index 794171e92453..89f412aa006a 100644 --- a/operator/pkg/tasks/init/wait.go +++ b/operator/pkg/tasks/init/wait.go @@ -14,12 +14,16 @@ import ( ) var ( + // The timeout for wait each component be ready + // It includes the time for pulling the component image. 
+ componentBeReadyTimeout = 120 * time.Second + etcdLabels = labels.Set{"karmada-app": constants.Etcd} karmadaApiserverLabels = labels.Set{"karmada-app": constants.KarmadaAPIServer} karmadaAggregatedAPIServerLabels = labels.Set{"karmada-app": constants.KarmadaAggregatedAPIServer} kubeControllerManagerLabels = labels.Set{"karmada-app": constants.KubeControllerManager} karmadaControllerManagerLabels = labels.Set{"karmada-app": constants.KarmadaControllerManager} - karmadaSchedulerLablels = labels.Set{"karmada-app": constants.KarmadaScheduler} + karmadaSchedulerLabels = labels.Set{"karmada-app": constants.KarmadaScheduler} karmadaWebhookLabels = labels.Set{"karmada-app": constants.KarmadaWebhook} ) @@ -38,7 +42,7 @@ func runWaitApiserver(r workflow.RunData) error { } klog.V(4).InfoS("[check-apiserver-health] Running task", "karmada", klog.KObj(data)) - waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), time.Second*30) + waiter := apiclient.NewKarmadaWaiter(data.ControlplaneConfig(), data.RemoteClient(), componentBeReadyTimeout) // check whether the karmada apiserver is health. if err := apiclient.TryRunCommand(waiter.WaitForAPI, 3); err != nil { @@ -57,7 +61,7 @@ func NewWaitControlPlaneTask() workflow.Task { Tasks: []workflow.Task{ newWaitControlPlaneSubTask("KubeControllerManager", kubeControllerManagerLabels), newWaitControlPlaneSubTask("KarmadaControllerManager", karmadaControllerManagerLabels), - newWaitControlPlaneSubTask("KarmadaScheduler", karmadaSchedulerLablels), + newWaitControlPlaneSubTask("KarmadaScheduler", karmadaSchedulerLabels), newWaitControlPlaneSubTask("KarmadaWebhook", karmadaWebhookLabels), }, } @@ -87,7 +91,7 @@ func runWaitControlPlaneSubTask(component string, lables labels.Set) func(r work return errors.New("wait-controlPlane task invoked with an invalid data struct") } - waiter := apiclient.NewKarmadaWaiter(nil, data.RemoteClient(), time.Second*120) + waiter := apiclient.NewKarmadaWaiter(nil, data.RemoteClient(), componentBeReadyTimeout) if err := waiter.WaitForSomePods(lables.String(), data.GetNamespace(), 1); err != nil { return fmt.Errorf("waiting for %s to ready timeout, err: %w", component, err) } diff --git a/operator/pkg/util/apiclient/idempotency.go b/operator/pkg/util/apiclient/idempotency.go index 749513e0cab4..9ec70328e1f2 100644 --- a/operator/pkg/util/apiclient/idempotency.go +++ b/operator/pkg/util/apiclient/idempotency.go @@ -25,7 +25,7 @@ import ( var errAllocated = errors.New("provided port is already allocated") -// NewCRDsClient is to create a clientset ClientSet +// NewCRDsClient is to create a Clientset func NewCRDsClient(c *rest.Config) (*crdsclient.Clientset, error) { return crdsclient.NewForConfig(c) } @@ -207,26 +207,26 @@ func PatchCustomResourceDefinition(client *crdsclient.Clientset, name string, da } // CreateOrUpdateStatefulSet creates a StatefulSet if the target resource doesn't exist. If the resource exists already, this function will update the resource instead. 
-func CreateOrUpdateStatefulSet(client clientset.Interface, statefuleSet *appsv1.StatefulSet) error { - _, err := client.AppsV1().StatefulSets(statefuleSet.GetNamespace()).Create(context.TODO(), statefuleSet, metav1.CreateOptions{}) +func CreateOrUpdateStatefulSet(client clientset.Interface, statefulSet *appsv1.StatefulSet) error { + _, err := client.AppsV1().StatefulSets(statefulSet.GetNamespace()).Create(context.TODO(), statefulSet, metav1.CreateOptions{}) if err != nil { if !apierrors.IsAlreadyExists(err) { return err } - older, err := client.AppsV1().StatefulSets(statefuleSet.GetNamespace()).Get(context.TODO(), statefuleSet.GetName(), metav1.GetOptions{}) + older, err := client.AppsV1().StatefulSets(statefulSet.GetNamespace()).Get(context.TODO(), statefulSet.GetName(), metav1.GetOptions{}) if err != nil { return err } - statefuleSet.ResourceVersion = older.ResourceVersion - _, err = client.AppsV1().StatefulSets(statefuleSet.GetNamespace()).Update(context.TODO(), statefuleSet, metav1.UpdateOptions{}) + statefulSet.ResourceVersion = older.ResourceVersion + _, err = client.AppsV1().StatefulSets(statefulSet.GetNamespace()).Update(context.TODO(), statefulSet, metav1.UpdateOptions{}) if err != nil { return err } } - klog.V(5).InfoS("Successfully created or updated statefulset", "statefulset", statefuleSet.GetName) + klog.V(5).InfoS("Successfully created or updated statefulset", "statefulset", statefulSet.GetName) return nil } @@ -248,7 +248,7 @@ func DeleteDeploymentIfHasLabels(client clientset.Interface, name, namespace str return client.AppsV1().Deployments(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) } -// DeleteStatefulSetIfHasLabels deletes a StatefuleSet that exists the given labels. +// DeleteStatefulSetIfHasLabels deletes a StatefulSet that exists the given labels. 
 func DeleteStatefulSetIfHasLabels(client clientset.Interface, name, namespace string, ls labels.Set) error {
 	sts, err := client.AppsV1().StatefulSets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
 	if err != nil {
diff --git a/operator/pkg/util/endpoint.go b/operator/pkg/util/endpoint.go
index f24e2cce2f5d..7defaf2a66eb 100644
--- a/operator/pkg/util/endpoint.go
+++ b/operator/pkg/util/endpoint.go
@@ -19,8 +19,7 @@ func GetControlplaneEndpoint(address, port string) (string, error) {
 	if ip == nil {
 		return "", fmt.Errorf("invalid value `%s` given for address", address)
 	}
-	url := formatURL(ip.String(), port)
-	return url.String(), nil
+	return formatURL(ip.String(), port).String(), nil
 }
 
 // formatURL takes a host and a port string and creates a net.URL using https scheme
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
index f797ca51c614..7572cd50a15a 100644
--- a/pkg/controllers/status/cluster_status_controller.go
+++ b/pkg/controllers/status/cluster_status_controller.go
@@ -367,8 +367,6 @@ func (c *ClusterStatusController) initLeaseController(cluster *clusterv1alpha1.C
 	ctx, cancelFunc := context.WithCancel(context.TODO())
 	c.ClusterLeaseControllers.Store(cluster.Name, cancelFunc)
 
-	var wg sync.WaitGroup
-	wg.Add(1)
 	// start syncing lease
 	go func() {
 		klog.Infof("Starting syncing lease for cluster: %s", cluster.Name)
@@ -378,9 +376,7 @@ func (c *ClusterStatusController) initLeaseController(cluster *clusterv1alpha1.C
 		klog.Infof("Stop syncing lease for cluster: %s", cluster.Name)
 		c.ClusterLeaseControllers.Delete(cluster.Name) // ensure the cache is clean
-		wg.Done()
 	}()
-	wg.Wait()
 }
 
 func getClusterHealthStatus(clusterClient *util.ClusterClient) (online, healthy bool) {
diff --git a/pkg/modeling/modeling.go b/pkg/modeling/modeling.go
index 71eabeb2a74a..82cc6ed150ce 100644
--- a/pkg/modeling/modeling.go
+++ b/pkg/modeling/modeling.go
@@ -162,6 +162,10 @@ func safeChangeNum(num *int, change int) {
 // AddToResourceSummary add resource node into modeling summary
 func (rs *ResourceSummary) AddToResourceSummary(crn ClusterResourceNode) {
 	index := rs.getIndex(crn)
+	if index == -1 {
+		klog.Error("ClusterResource can not add to resource summary: index is invalid.")
+		return
+	}
 	modeling := &(*rs)[index]
 	if rs.GetNodeNumFromModel(modeling) <= 5 {
 		root := modeling.linkedlist
@@ -269,6 +273,9 @@ func (rs *ResourceSummary) GetNodeNumFromModel(model *resourceModels) int {
 // DeleteFromResourceSummary dalete resource node into modeling summary
 func (rs *ResourceSummary) DeleteFromResourceSummary(crn ClusterResourceNode) error {
 	index := rs.getIndex(crn)
+	if index == -1 {
+		return errors.New("ClusterResource can not delete from resource summary: index is invalid.")
+	}
 	modeling := &(*rs)[index]
 	if rs.GetNodeNumFromModel(modeling) >= 6 {
 		root := modeling.redblackTree
diff --git a/pkg/sharedcli/profileflag/profileflag.go b/pkg/sharedcli/profileflag/profileflag.go
index 0a57866b5507..2e30a5fc754f 100644
--- a/pkg/sharedcli/profileflag/profileflag.go
+++ b/pkg/sharedcli/profileflag/profileflag.go
@@ -21,6 +21,22 @@ const (
 	// References:
 	// - https://en.wikipedia.org/wiki/Slowloris_(computer_security)
 	ReadHeaderTimeout = 32 * time.Second
+	// WriteTimeout is the amount of time allowed to write the
+	// response data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	WriteTimeout = 5 * time.Minute
+	// ReadTimeout is the amount of time allowed to read
+	// request data.
+	// HTTP timeouts are necessary to expire inactive connections
+	// and failing to do so might make the application vulnerable
+	// to attacks like slowloris which work by sending data very slowly,
+	// which in case of no timeout will keep the connection active
+	// eventually leading to a denial-of-service (DoS) attack.
+	ReadTimeout = 5 * time.Minute
 )
 
 // Options are options for pprof.
@@ -57,6 +73,8 @@ func ListenAndServe(opts Options) {
 		Addr:              opts.ProfilingBindAddress,
 		Handler:           mux,
 		ReadHeaderTimeout: ReadHeaderTimeout,
+		WriteTimeout:      WriteTimeout,
+		ReadTimeout:       ReadTimeout,
 	}
 	if err := httpServer.ListenAndServe(); err != nil {
 		klog.Errorf("Failed to enable profiling: %v", err)
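
For reference, here is a minimal, self-contained sketch of how the three timeouts introduced by this patch fit together on a `net/http` server. It is illustrative only: the address and handler are placeholders, and the constant values simply mirror the ones added above.

```go
package main

import (
	"net/http"
	"time"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write([]byte("ok"))
	})
	server := &http.Server{
		Addr:              ":8080", // placeholder address
		Handler:           mux,
		ReadHeaderTimeout: 32 * time.Second, // limit on reading the request headers
		ReadTimeout:       5 * time.Minute,  // limit on reading the entire request
		WriteTimeout:      5 * time.Minute,  // limit on writing the response
	}
	_ = server.ListenAndServe()
}
```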