diff --git a/.github/workflows/auto-assign.yaml b/.github/workflows/auto-assign.yaml index fce7665e5f6..907e702af40 100644 --- a/.github/workflows/auto-assign.yaml +++ b/.github/workflows/auto-assign.yaml @@ -11,7 +11,8 @@ jobs: runs-on: ubuntu-latest steps: - name: take the issue - uses: bdougie/take-action@main + # yamllint disable-line rule:line-length + uses: bdougie/take-action@1439165ac45a7461c2d89a59952cd7d941964b87 # main with: message: > Thanks for taking this issue! diff --git a/.github/workflows/build-multi-stage.yaml b/.github/workflows/build-multi-stage.yaml index 333bcc13bab..55549c6e657 100644 --- a/.github/workflows/build-multi-stage.yaml +++ b/.github/workflows/build-multi-stage.yaml @@ -13,7 +13,8 @@ jobs: name: multi-arch-build runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: multi-arch-build # yamllint disable-line rule:line-length if: ${{ ! 
contains(github.event.pull_request.labels.*.name, 'ci/skip/multi-arch-build') }} diff --git a/.github/workflows/codespell.yaml b/.github/workflows/codespell.yaml index ece48c9f122..2b0de44297f 100644 --- a/.github/workflows/codespell.yaml +++ b/.github/workflows/codespell.yaml @@ -15,6 +15,7 @@ jobs: name: codespell runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: codespell run: make containerized-test TARGET=codespell diff --git a/.github/workflows/commitlint.yaml b/.github/workflows/commitlint.yaml index 7b7b653ba8c..877151a0f1b 100644 --- a/.github/workflows/commitlint.yaml +++ b/.github/workflows/commitlint.yaml @@ -14,7 +14,8 @@ jobs: if: ${{ github.event.pull_request.user.login != 'dependabot[bot]' }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: ref: ${{ github.event.pull_request.head.sha }} - name: commitlint diff --git a/.github/workflows/dependency-review.yaml b/.github/workflows/dependency-review.yaml index fd204ebee64..867c2674d34 100644 --- a/.github/workflows/dependency-review.yaml +++ b/.github/workflows/dependency-review.yaml @@ -15,8 +15,10 @@ jobs: runs-on: ubuntu-latest steps: - name: 'Checkout Repository' - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' - uses: actions/dependency-review-action@v4 + # yamllint disable-line rule:line-length + uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 with: allow-ghsas: GHSA-f4w6-3rh6-6q4q diff --git a/.github/workflows/go-test.yaml b/.github/workflows/go-test.yaml index da1adca2d28..1818375de48 100644 --- a/.github/workflows/go-test.yaml +++ 
b/.github/workflows/go-test.yaml @@ -14,7 +14,8 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout the repo - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Check generated deploy code run: make generate-deploy @@ -29,20 +30,23 @@ jobs: name: e2e-build-test runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: e2e-build-test run: make containerized-build TARGET=e2e.test go-test: name: go-test runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: go-test run: make containerized-test TARGET=go-test go-test-api: name: go-test-api runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: go-test-api run: make containerized-test TARGET=go-test-api diff --git a/.github/workflows/golangci-lint.yaml b/.github/workflows/golangci-lint.yaml index 7d18eafb00f..f0473ef8d09 100644 --- a/.github/workflows/golangci-lint.yaml +++ b/.github/workflows/golangci-lint.yaml @@ -13,6 +13,7 @@ jobs: name: golangci-lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: golangci-lint run: make containerized-test TARGET=go-lint diff --git a/.github/workflows/lint-extras.yaml b/.github/workflows/lint-extras.yaml index de6fcd363fa..8c04d7eea6c 100644 --- a/.github/workflows/lint-extras.yaml +++ b/.github/workflows/lint-extras.yaml @@ -13,6 +13,7 @@ jobs: name: lint-extras runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line 
rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: lint-extras run: make containerized-test TARGET=lint-extras diff --git a/.github/workflows/mergify-copy-labels.yaml b/.github/workflows/mergify-copy-labels.yaml index 3323b438c14..dbcdd5c44d8 100644 --- a/.github/workflows/mergify-copy-labels.yaml +++ b/.github/workflows/mergify-copy-labels.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Copying labels - uses: Mergifyio/gha-mergify-merge-queue-labels-copier@main + uses: Mergifyio/gha-mergify-merge-queue-labels-copier@1d2b277f94d52987008ec05b571fb68f2357e63f # main with: additional-labels: 'ok-to-test' token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} diff --git a/.github/workflows/mod-check.yaml b/.github/workflows/mod-check.yaml index b94460ad38f..3809d9e393e 100644 --- a/.github/workflows/mod-check.yaml +++ b/.github/workflows/mod-check.yaml @@ -13,6 +13,7 @@ jobs: name: mod-check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: mod-check run: make containerized-test TARGET=mod-check diff --git a/.github/workflows/publish-artifacts.yaml b/.github/workflows/publish-artifacts.yaml index b29f7cf2f4b..066a9681d57 100644 --- a/.github/workflows/publish-artifacts.yaml +++ b/.github/workflows/publish-artifacts.yaml @@ -18,10 +18,12 @@ jobs: runs-on: ubuntu-latest if: github.repository == 'ceph/ceph-csi' steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Login to Quay - uses: docker/login-action@v3 + # yamllint disable-line rule:line-length + uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 with: registry: quay.io username: ${{ secrets.QUAY_IO_USERNAME }} diff --git a/.github/workflows/pull-request-commentor.yaml 
b/.github/workflows/pull-request-commentor.yaml index 4941651a53f..d64d74cdd0d 100644 --- a/.github/workflows/pull-request-commentor.yaml +++ b/.github/workflows/pull-request-commentor.yaml @@ -51,7 +51,8 @@ jobs: Add comment to trigger external storage tests for Kubernetes ${{ matrix.k8s }} if: ${{ github.base_ref == matrix.branch }} - uses: peter-evans/create-or-update-comment@v4 + # yamllint disable-line rule:line-length + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 with: token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} issue-number: ${{ github.event.pull_request.number }} @@ -62,7 +63,8 @@ jobs: Add comment to trigger helm E2E tests for Kubernetes ${{ matrix.k8s }} if: ${{ github.base_ref == matrix.branch }} - uses: peter-evans/create-or-update-comment@v4 + # yamllint disable-line rule:line-length + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 with: token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} issue-number: ${{ github.event.pull_request.number }} @@ -70,7 +72,8 @@ jobs: /test ci/centos/mini-e2e-helm/k8s-${{ matrix.k8s }} - name: Add comment to trigger E2E tests for Kubernetes ${{ matrix.k8s }} - uses: peter-evans/create-or-update-comment@v4 + # yamllint disable-line rule:line-length + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 if: ${{ github.base_ref == matrix.branch }} with: token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} @@ -87,7 +90,8 @@ jobs: steps: - name: Add comment to trigger cephfs upgrade tests - uses: peter-evans/create-or-update-comment@v4 + # yamllint disable-line rule:line-length + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 with: token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} issue-number: ${{ github.event.pull_request.number }} @@ -95,7 +99,8 @@ jobs: /test ci/centos/upgrade-tests-cephfs - name: Add comment to trigger rbd upgrade tests - uses: 
peter-evans/create-or-update-comment@v4 + # yamllint disable-line rule:line-length + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 with: token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} issue-number: ${{ github.event.pull_request.number }} @@ -116,7 +121,8 @@ jobs: steps: - name: remove ok-to-test-label after commenting - uses: actions/github-script@v7 + # yamllint disable-line rule:line-length + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.CEPH_CSI_BOT_TOKEN }} script: | diff --git a/.github/workflows/retest.yaml b/.github/workflows/retest.yaml index 81879627590..f99d3f59ee1 100644 --- a/.github/workflows/retest.yaml +++ b/.github/workflows/retest.yaml @@ -15,7 +15,8 @@ jobs: runs-on: ubuntu-latest steps: # path to the retest action - - uses: ceph/ceph-csi/actions/retest@devel + # yamllint disable-line rule:line-length + - uses: ceph/ceph-csi/actions/retest@28dc64dcae3cec8d11d84bdf525bda0ef757c688 # devel with: GITHUB_TOKEN: ${{ secrets.CEPH_CSI_BOT_TOKEN }} required-label: "ci/retry/e2e" diff --git a/.github/workflows/snyk-container-image.yaml b/.github/workflows/snyk-container-image.yaml index 74b9b55d2b6..eabfca77b86 100644 --- a/.github/workflows/snyk-container-image.yaml +++ b/.github/workflows/snyk-container-image.yaml @@ -26,18 +26,21 @@ jobs: if: github.repository == 'ceph/ceph-csi' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Build a Docker image run: make image-cephcsi - name: Run Snyk to check Docker image for vulnerabilities continue-on-error: true - uses: snyk/actions/docker@master + # yamllint disable-line rule:line-length + uses: snyk/actions/docker@cdb760004ba9ea4d525f2e043745dfe85bb9077e # master env: SNYK_TOKEN: ${{ secrets.SYNK_TOKEN }} with: image: quay.io/cephcsi/cephcsi:${{ github.base_ref }} args: 
--file=Dockerfilei - name: Upload result to GitHub Code Scanning - uses: github/codeql-action/upload-sarif@v3 + # yamllint disable-line rule:line-length + uses: github/codeql-action/upload-sarif@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: sarif_file: snyk.sarif diff --git a/.github/workflows/snyk.yaml b/.github/workflows/snyk.yaml index dd1ed75f7e7..8ee96f14d5b 100644 --- a/.github/workflows/snyk.yaml +++ b/.github/workflows/snyk.yaml @@ -20,11 +20,13 @@ jobs: runs-on: ubuntu-latest steps: - name: checkout - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: fetch-depth: 0 - name: run Snyk to check for code vulnerabilities - uses: snyk/actions/golang@master + # yamllint disable-line rule:line-length + uses: snyk/actions/golang@cdb760004ba9ea4d525f2e043745dfe85bb9077e # master env: SNYK_TOKEN: ${{ secrets.SYNK_TOKEN }} diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 01fa31fe3e9..629919591e8 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -18,7 +18,8 @@ jobs: runs-on: ubuntu-latest if: github.repository == 'ceph/ceph-csi' steps: - - uses: actions/stale@v9 + # yamllint disable-line rule:line-length + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-issue-stale: 30 diff --git a/.github/workflows/test-retest-action.yaml b/.github/workflows/test-retest-action.yaml index e8578179bce..183bda6336e 100644 --- a/.github/workflows/test-retest-action.yaml +++ b/.github/workflows/test-retest-action.yaml @@ -15,7 +15,8 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Docker build # Run cd to avoid loading complete cephcsi directory in docker context diff --git 
a/.github/workflows/tickgit.yaml b/.github/workflows/tickgit.yaml index 2b49b48eb1d..106e380fdbb 100644 --- a/.github/workflows/tickgit.yaml +++ b/.github/workflows/tickgit.yaml @@ -14,5 +14,6 @@ jobs: name: tickgit runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + # yamllint disable-line rule:line-length + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - run: make containerized-test TARGET=tickgit diff --git a/PendingReleaseNotes.md b/PendingReleaseNotes.md index a77eb58d405..43c886283ef 100644 --- a/PendingReleaseNotes.md +++ b/PendingReleaseNotes.md @@ -12,5 +12,7 @@ - deploy: radosNamespaceCephFS can be configured for ceph-csi-cephfs chart in [PR](https://github.com/ceph/ceph-csi/pull/4652) - build: update ceph release to squid in [PR](https://github.com/ceph/ceph-csi/pull/4735) - build: CentOS Stream 9 is used as OS in the container-images [PR](https://github.com/ceph/ceph-csi/pull/4735) +- util: a log message "Slow GRPC" is now emitted when + CSI GRPC call outlives its deadline [PR](https://github.com/ceph/ceph-csi/pull/4847) ## NOTE diff --git a/charts/ceph-csi-cephfs/README.md b/charts/ceph-csi-cephfs/README.md index 2033ca3ad09..1be352f3f65 100644 --- a/charts/ceph-csi-cephfs/README.md +++ b/charts/ceph-csi-cephfs/README.md @@ -118,6 +118,7 @@ charts and their default values. | `commonLabels` | Labels to apply to all resources | `{}` | | `logLevel` | Set logging level for csi containers. Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity. | `5` | | `sidecarLogLevel` | Set logging level for csi sidecar containers. Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity. | `1` | +| `logSlowOperationInterval` | Log slow operations at the specified rate. Operation is considered slow if it outlives its deadline. | `30s` | | `nodeplugin.name` | Specifies the nodeplugin name | `nodeplugin` | | `nodeplugin.updateStrategy` | Specifies the update Strategy. 
If you are using ceph-fuse client set this value to OnDelete | `RollingUpdate` | | `nodeplugin.priorityClassName` | Set user created priorityClassName for csi plugin pods. default is system-node-critical which is highest priority | `system-node-critical` | diff --git a/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml b/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml index b91b8047a00..5be4632799a 100644 --- a/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml +++ b/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml @@ -72,6 +72,7 @@ spec: {{- if and .Values.readAffinity .Values.readAffinity.enabled }} - "--crush-location-labels={{ .Values.readAffinity.crushLocationLabels | join "," }}" {{- end }} + - "--logslowopinterval={{ .Values.logSlowOperationInterval }}" env: - name: POD_IP valueFrom: diff --git a/charts/ceph-csi-cephfs/templates/provisioner-deployment.yaml b/charts/ceph-csi-cephfs/templates/provisioner-deployment.yaml index 3257705af59..14b0f2c0cdf 100644 --- a/charts/ceph-csi-cephfs/templates/provisioner-deployment.yaml +++ b/charts/ceph-csi-cephfs/templates/provisioner-deployment.yaml @@ -92,6 +92,7 @@ spec: - "--clustername={{ .Values.provisioner.clustername }}" {{- end }} - "--setmetadata={{ .Values.provisioner.setmetadata }}" + - "--logslowopinterval={{ .Values.logSlowOperationInterval }}" env: - name: POD_IP valueFrom: diff --git a/charts/ceph-csi-cephfs/values.yaml b/charts/ceph-csi-cephfs/values.yaml index 4b52084641c..7d73851e61b 100644 --- a/charts/ceph-csi-cephfs/values.yaml +++ b/charts/ceph-csi-cephfs/values.yaml @@ -40,6 +40,9 @@ commonLabels: {} logLevel: 5 # sidecarLogLevel is the variable for Kubernetes sidecar container's log level sidecarLogLevel: 1 +# Log slow operations at the specified rate. +# Operation is considered slow if it outlives its deadline. 
+logSlowOperationInterval: 30s # Set fsGroupPolicy for CSI Driver object spec # https://kubernetes-csi.github.io/docs/support-fsgroup.html diff --git a/charts/ceph-csi-rbd/README.md b/charts/ceph-csi-rbd/README.md index c516d410664..ba5fe684be8 100644 --- a/charts/ceph-csi-rbd/README.md +++ b/charts/ceph-csi-rbd/README.md @@ -120,6 +120,7 @@ charts and their default values. | `commonLabels` | Labels to apply to all resources | `{}` | | `logLevel` | Set logging level for csi containers. Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity. | `5` | | `sidecarLogLevel` | Set logging level for csi sidecar containers. Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity. | `1` | +| `logSlowOperationInterval` | Log slow operations at the specified rate. Operation is considered slow if it outlives its deadline. | `30s` | | `nodeplugin.name` | Specifies the nodeplugins name | `nodeplugin` | | `nodeplugin.updateStrategy` | Specifies the update Strategy. If you are using ceph-fuse client set this value to OnDelete | `RollingUpdate` | | `nodeplugin.priorityClassName` | Set user created priorityclassName for csi plugin pods. 
default is system-node-critical which is highest priority | `system-node-critical` | diff --git a/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml b/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml index cf9b201cedc..e640031b7e2 100644 --- a/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml +++ b/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml @@ -70,6 +70,7 @@ spec: {{- if and .Values.readAffinity .Values.readAffinity.enabled }} - "--crush-location-labels={{ .Values.readAffinity.crushLocationLabels | join "," }}" {{- end }} + - "--logslowopinterval={{ .Values.logSlowOperationInterval }}" env: - name: POD_IP valueFrom: diff --git a/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml b/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml index 70c393c00cb..fe61b56bff8 100644 --- a/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml +++ b/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml @@ -97,6 +97,7 @@ spec: - "--clustername={{ .Values.provisioner.clustername }}" {{- end }} - "--setmetadata={{ .Values.provisioner.setmetadata }}" + - "--logslowopinterval={{ .Values.logSlowOperationInterval }}" env: - name: POD_IP valueFrom: diff --git a/charts/ceph-csi-rbd/values.yaml b/charts/ceph-csi-rbd/values.yaml index 88bad96dd69..40f6f84ff0a 100644 --- a/charts/ceph-csi-rbd/values.yaml +++ b/charts/ceph-csi-rbd/values.yaml @@ -69,6 +69,9 @@ commonLabels: {} logLevel: 5 # sidecarLogLevel is the variable for Kubernetes sidecar container's log level sidecarLogLevel: 1 +# Log slow operations at the specified rate. +# Operation is considered slow if it outlives its deadline. 
+logSlowOperationInterval: 30s # Set fsGroupPolicy for CSI Driver object spec # https://kubernetes-csi.github.io/docs/support-fsgroup.html diff --git a/cmd/cephcsi.go b/cmd/cephcsi.go index a43ac276362..bc6e1c7a3a0 100644 --- a/cmd/cephcsi.go +++ b/cmd/cephcsi.go @@ -120,6 +120,11 @@ func init() { "path of prometheus endpoint where metrics will be available") flag.DurationVar(&conf.PollTime, "polltime", time.Second*pollTime, "time interval in seconds between each poll") flag.DurationVar(&conf.PoolTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds") + flag.DurationVar( + &conf.LogSlowOpInterval, + "logslowopinterval", + time.Second*30, + "how often to inform about slow gRPC calls") flag.UintVar( &conf.RbdHardMaxCloneDepth, diff --git a/docs/deploy-cephfs.md b/docs/deploy-cephfs.md index da798cbae08..ba052b44b4a 100644 --- a/docs/deploy-cephfs.md +++ b/docs/deploy-cephfs.md @@ -50,6 +50,7 @@ make image-cephcsi | `--enable-read-affinity` | `false` | enable read affinity | | `--crush-location-labels`| _empty_ | Kubernetes node labels that determine the CRUSH location the node belongs to, separated by ','.
`Note: These labels will be replaced if crush location labels are defined in the ceph-csi-config ConfigMap for the specific cluster.` | | `--radosnamespacecephfs`| _empty_ | CephFS RadosNamespace used to store CSI specific objects and keys. | +| `--logslowopinterval` | `30s` | Log slow operations at the specified rate. Operation is considered slow if it outlives its deadline. | **NOTE:** The parameter `-forcecephkernelclient` enables the Kernel CephFS mounter on kernels < 4.17. diff --git a/docs/deploy-rbd.md b/docs/deploy-rbd.md index e493113b99f..f27f32df022 100644 --- a/docs/deploy-rbd.md +++ b/docs/deploy-rbd.md @@ -48,6 +48,7 @@ make image-cephcsi | `--setmetadata` | `false` | Set metadata on volume | | `--enable-read-affinity` | `false` | enable read affinity | | `--crush-location-labels`| _empty_ | Kubernetes node labels that determine the CRUSH location the node belongs to, separated by ','.
`Note: These labels will be replaced if crush location labels are defined in the ceph-csi-config ConfigMap for the specific cluster.` | +| `--logslowopinterval` | `30s` | Log slow operations at the specified rate. Operation is considered slow if it outlives its deadline. | **Available volume parameters:** diff --git a/go.mod b/go.mod index 6b730194448..3c137c4dad5 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,7 @@ require ( golang.org/x/crypto v0.27.0 golang.org/x/net v0.29.0 golang.org/x/sys v0.25.0 - google.golang.org/grpc v1.66.0 + google.golang.org/grpc v1.66.2 google.golang.org/protobuf v1.34.2 // // when updating k8s.io/kubernetes, make sure to update the replace section too diff --git a/go.sum b/go.sum index 49b5410524b..df63d1e416c 100644 --- a/go.sum +++ b/go.sum @@ -3362,8 +3362,8 @@ google.golang.org/grpc v1.63.0/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDom google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= -google.golang.org/grpc v1.66.0 h1:DibZuoBznOxbDQxRINckZcUvnCEvrW9pcWIE2yF9r1c= -google.golang.org/grpc v1.66.0/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/grpc v1.66.2 h1:3QdXkuq3Bkh7w+ywLdLvM56cmGvQHUMZpiCzt6Rqaoo= +google.golang.org/grpc v1.66.2/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/internal/cephfs/driver.go b/internal/cephfs/driver.go index 97509956d77..8023bb1a798 100644 --- a/internal/cephfs/driver.go +++ b/internal/cephfs/driver.go @@ -199,7 
+199,9 @@ func (fs *Driver) Run(conf *util.Config) { NS: fs.ns, GS: fs.cs, } - server.Start(conf.Endpoint, srv) + server.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{ + LogSlowOpInterval: conf.LogSlowOpInterval, + }) if conf.EnableProfiling { go util.StartMetricsServer(conf) @@ -230,7 +232,9 @@ func (fs *Driver) setupCSIAddonsServer(conf *util.Config) error { } // start the server, this does not block, it runs a new go-routine - err = fs.cas.Start() + err = fs.cas.Start(csicommon.MiddlewareServerOptionConfig{ + LogSlowOpInterval: conf.LogSlowOpInterval, + }) if err != nil { return fmt.Errorf("failed to start CSI-Addons server: %w", err) } diff --git a/internal/csi-addons/server/server.go b/internal/csi-addons/server/server.go index e5c6310eec3..c41e27100e0 100644 --- a/internal/csi-addons/server/server.go +++ b/internal/csi-addons/server/server.go @@ -85,9 +85,9 @@ func (cas *CSIAddonsServer) RegisterService(svc CSIAddonsService) { // Start creates the internal gRPC server, and registers the CSIAddonsServices. // The internal gRPC server is started in it's own go-routine when no error is // returned. -func (cas *CSIAddonsServer) Start() error { +func (cas *CSIAddonsServer) Start(middlewareConfig csicommon.MiddlewareServerOptionConfig) error { // create the gRPC server and register services - cas.server = grpc.NewServer(csicommon.NewMiddlewareServerOption()) + cas.server = grpc.NewServer(csicommon.NewMiddlewareServerOption(middlewareConfig)) for _, svc := range cas.services { svc.RegisterService(cas.server) diff --git a/internal/csi-common/server.go b/internal/csi-common/server.go index 727ef57f80f..b758f8d4a10 100644 --- a/internal/csi-common/server.go +++ b/internal/csi-common/server.go @@ -31,7 +31,7 @@ import ( // NonBlockingGRPCServer defines Non blocking GRPC server interfaces. 
type NonBlockingGRPCServer interface { // Start services at the endpoint - Start(endpoint string, srv Servers) + Start(endpoint string, srv Servers, middlewareConfig MiddlewareServerOptionConfig) // Waits for the service to stop Wait() // Stops the service gracefully @@ -60,9 +60,13 @@ type nonBlockingGRPCServer struct { } // Start start service on endpoint. -func (s *nonBlockingGRPCServer) Start(endpoint string, srv Servers) { +func (s *nonBlockingGRPCServer) Start( + endpoint string, + srv Servers, + middlewareConfig MiddlewareServerOptionConfig, +) { s.wg.Add(1) - go s.serve(endpoint, srv) + go s.serve(endpoint, srv, middlewareConfig) } // Wait blocks until the WaitGroup counter. @@ -80,7 +84,11 @@ func (s *nonBlockingGRPCServer) ForceStop() { s.server.Stop() } -func (s *nonBlockingGRPCServer) serve(endpoint string, srv Servers) { +func (s *nonBlockingGRPCServer) serve( + endpoint string, + srv Servers, + middlewareConfig MiddlewareServerOptionConfig, +) { proto, addr, err := parseEndpoint(endpoint) if err != nil { klog.Fatal(err.Error()) @@ -98,7 +106,7 @@ func (s *nonBlockingGRPCServer) serve(endpoint string, srv Servers) { klog.Fatalf("Failed to listen: %v", err) } - server := grpc.NewServer(NewMiddlewareServerOption()) + server := grpc.NewServer(NewMiddlewareServerOption(middlewareConfig)) s.server = server if srv.IS != nil { diff --git a/internal/csi-common/utils.go b/internal/csi-common/utils.go index 91db95f9324..b541e68f6f4 100644 --- a/internal/csi-common/utils.go +++ b/internal/csi-common/utils.go @@ -23,6 +23,7 @@ import ( "runtime/debug" "strings" "sync/atomic" + "time" "github.com/ceph/ceph-csi/internal/util" "github.com/ceph/ceph-csi/internal/util/log" @@ -108,10 +109,35 @@ func NewGroupControllerServiceCapability(ctrlCap csi.GroupControllerServiceCapab } } +// MiddlewareServerOptionConfig contains configuration parameters +// that are passed to the respective middleware interceptors that +// are instantiated when starting gRPC servers. 
+type MiddlewareServerOptionConfig struct {
+	LogSlowOpInterval time.Duration
+}
+
 // NewMiddlewareServerOption creates a new grpc.ServerOption that configures a
 // common format for log messages and other gRPC related handlers.
-func NewMiddlewareServerOption() grpc.ServerOption {
-	middleWare := []grpc.UnaryServerInterceptor{contextIDInjector, logGRPC, panicHandler}
+func NewMiddlewareServerOption(config MiddlewareServerOptionConfig) grpc.ServerOption {
+	middleWare := []grpc.UnaryServerInterceptor{
+		contextIDInjector,
+		logGRPC,
+	}
+
+	if config.LogSlowOpInterval > 0 {
+		middleWare = append(middleWare, func(
+			ctx context.Context,
+			req interface{},
+			info *grpc.UnaryServerInfo,
+			handler grpc.UnaryHandler,
+		) (interface{}, error) {
+			return logSlowGRPC(
+				config.LogSlowOpInterval, ctx, req, info, handler,
+			)
+		})
+	}
+
+	middleWare = append(middleWare, panicHandler)
 
 	return grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(middleWare...))
 }
@@ -250,6 +276,58 @@ func logGRPC(
 	return resp, err
 }
 
+// logSlowGRPC invokes handler and returns its result unmodified. If ctx is
+// done (deadline exceeded or cancelled) while handler is still running, a
+// "Slow GRPC call" message is logged every logInterval until handler returns.
+// logInterval must be positive, time.NewTicker panics otherwise.
+func logSlowGRPC(
+	logInterval time.Duration,
+	ctx context.Context,
+	req interface{},
+	info *grpc.UnaryServerInfo,
+	handler grpc.UnaryHandler,
+) (interface{}, error) {
+	handlerFinished := make(chan struct{})
+	// Closed via defer so that the watcher goroutine below is always released,
+	// even if handler panics; otherwise it could keep logging forever.
+	defer close(handlerFinished)
+
+	callStartTime := time.Now()
+
+	// Ticks at a logInterval rate and logs a slow-call message until handler finishes.
+	// This is called once the handler outlives its context, see below.
+	doLogSlowGRPC := func() {
+		ticker := time.NewTicker(logInterval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case t := <-ticker.C:
+				timePassed := t.Sub(callStartTime).Truncate(time.Second)
+				log.ExtendedLog(ctx,
+					"Slow GRPC call %s (%s)", info.FullMethod, timePassed)
+				log.TraceLog(ctx,
+					"Slow GRPC request: %s", protosanitizer.StripSecrets(req))
+			case <-handlerFinished:
+				return
+			}
+		}
+	}
+
+	go func() {
+		select {
+		case <-ctx.Done():
+			// The call (most likely) outlived its context. Start logging slow messages.
+			doLogSlowGRPC()
+		case <-handlerFinished:
+			// The call finished, exit.
+			return
+		}
+	}()
+
+	return handler(ctx, req)
+}
+
 //nolint:nonamedreturns // named return used to send recovered panic error.
 func panicHandler(
 	ctx context.Context,
diff --git a/internal/nfs/driver/driver.go b/internal/nfs/driver/driver.go
index 51eefc568e4..890fba815b3 100644
--- a/internal/nfs/driver/driver.go
+++ b/internal/nfs/driver/driver.go
@@ -77,7 +77,10 @@ func (fs *Driver) Run(conf *util.Config) {
 		srv.CS = controller.NewControllerServer(cd)
 	}
 
-	server.Start(conf.Endpoint, srv)
+	server.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{
+		LogSlowOpInterval: conf.LogSlowOpInterval,
+	})
+
 	if conf.EnableProfiling {
 		go util.StartMetricsServer(conf)
 		log.DebugLogMsg("Registering profiling handler")
diff --git a/internal/rbd/driver/driver.go b/internal/rbd/driver/driver.go
index cebfc53cf6b..0ad8109f81a 100644
--- a/internal/rbd/driver/driver.go
+++ b/internal/rbd/driver/driver.go
@@ -179,7 +179,9 @@ func (r *Driver) Run(conf *util.Config) {
 		CS: r.cs,
 		NS: r.ns,
 	}
-	s.Start(conf.Endpoint, srv)
+	s.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{
+		LogSlowOpInterval: conf.LogSlowOpInterval,
+	})
 
 	r.startProfiling(conf)
 
@@ -233,7 +235,9 @@ func (r *Driver) setupCSIAddonsServer(conf *util.Config) error {
 	}
 
 	// start the server, this does not block, it runs a new go-routine
-	err = r.cas.Start()
+	err = r.cas.Start(csicommon.MiddlewareServerOptionConfig{
+		LogSlowOpInterval: conf.LogSlowOpInterval,
+	})
 	if err != nil {
 		return fmt.Errorf("failed to start CSI-Addons server: %w", err)
 	}
diff --git a/internal/util/util.go b/internal/util/util.go
index 869df991b76..1d62c650223 100644
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -131,6 +131,9 @@ type Config struct {
 	MetricsPort int           // TCP port for liveness/grpc metrics requests
 	PollTime    time.Duration // time interval in seconds between each poll
PoolTimeout time.Duration // probe timeout in seconds + // Log interval for slow GRPC calls. Calls that outlive their context deadline + // are considered slow. + LogSlowOpInterval time.Duration EnableProfiling bool // flag to enable profiling IsControllerServer bool // if set to true start provisioner server diff --git a/vendor/google.golang.org/grpc/experimental/stats/metricregistry.go b/vendor/google.golang.org/grpc/experimental/stats/metricregistry.go index 930140f57ed..1d827dd5d9d 100644 --- a/vendor/google.golang.org/grpc/experimental/stats/metricregistry.go +++ b/vendor/google.golang.org/grpc/experimental/stats/metricregistry.go @@ -20,7 +20,6 @@ package stats import ( "maps" - "testing" "google.golang.org/grpc/grpclog" "google.golang.org/grpc/internal" @@ -250,9 +249,9 @@ func RegisterInt64Gauge(descriptor MetricDescriptor) *Int64GaugeHandle { } // snapshotMetricsRegistryForTesting snapshots the global data of the metrics -// registry. Registers a cleanup function on the provided testing.T that sets -// the metrics registry to its original state. Only called in testing functions. -func snapshotMetricsRegistryForTesting(t *testing.T) { +// registry. Returns a cleanup function that sets the metrics registry to its +// original state. 
+func snapshotMetricsRegistryForTesting() func() { oldDefaultMetrics := DefaultMetrics oldRegisteredMetrics := registeredMetrics oldMetricsRegistry := metricsRegistry @@ -262,9 +261,9 @@ func snapshotMetricsRegistryForTesting(t *testing.T) { maps.Copy(registeredMetrics, registeredMetrics) maps.Copy(metricsRegistry, metricsRegistry) - t.Cleanup(func() { + return func() { DefaultMetrics = oldDefaultMetrics registeredMetrics = oldRegisteredMetrics metricsRegistry = oldMetricsRegistry - }) + } } diff --git a/vendor/google.golang.org/grpc/internal/internal.go b/vendor/google.golang.org/grpc/internal/internal.go index 65f936a623a..73fa407b6c8 100644 --- a/vendor/google.golang.org/grpc/internal/internal.go +++ b/vendor/google.golang.org/grpc/internal/internal.go @@ -217,10 +217,9 @@ var ( SetConnectedAddress any // func(scs *SubConnState, addr resolver.Address) // SnapshotMetricRegistryForTesting snapshots the global data of the metric - // registry. Registers a cleanup function on the provided testing.T that - // sets the metric registry to its original state. Only called in testing - // functions. - SnapshotMetricRegistryForTesting any // func(t *testing.T) + // registry. Returns a cleanup function that sets the metric registry to its + // original state. Only called in testing functions. + SnapshotMetricRegistryForTesting func() func() // SetDefaultBufferPoolForTesting updates the default buffer pool, for // testing purposes. diff --git a/vendor/google.golang.org/grpc/mem/buffer_slice.go b/vendor/google.golang.org/grpc/mem/buffer_slice.go index d7775cea623..228e9c2f20f 100644 --- a/vendor/google.golang.org/grpc/mem/buffer_slice.go +++ b/vendor/google.golang.org/grpc/mem/buffer_slice.go @@ -19,7 +19,6 @@ package mem import ( - "compress/flate" "io" ) @@ -92,9 +91,11 @@ func (s BufferSlice) Materialize() []byte { } // MaterializeToBuffer functions like Materialize except that it writes the data -// to a single Buffer pulled from the given BufferPool. 
As a special case, if the -// input BufferSlice only actually has one Buffer, this function has nothing to -// do and simply returns said Buffer. +// to a single Buffer pulled from the given BufferPool. +// +// As a special case, if the input BufferSlice only actually has one Buffer, this +// function simply increases the refcount before returning said Buffer. Freeing this +// buffer won't release it until the BufferSlice is itself released. func (s BufferSlice) MaterializeToBuffer(pool BufferPool) Buffer { if len(s) == 1 { s[0].Ref() @@ -124,7 +125,8 @@ func (s BufferSlice) Reader() Reader { // Remaining(), which returns the number of unread bytes remaining in the slice. // Buffers will be freed as they are read. type Reader interface { - flate.Reader + io.Reader + io.ByteReader // Close frees the underlying BufferSlice and never returns an error. Subsequent // calls to Read will return (0, io.EOF). Close() error diff --git a/vendor/google.golang.org/grpc/server.go b/vendor/google.golang.org/grpc/server.go index 457d27338f7..d1e1415a40f 100644 --- a/vendor/google.golang.org/grpc/server.go +++ b/vendor/google.golang.org/grpc/server.go @@ -1359,6 +1359,7 @@ func (s *Server) processUnaryRPC(ctx context.Context, t transport.ServerTranspor } return err } + defer d.Free() if channelz.IsOn() { t.IncrMsgRecv() } diff --git a/vendor/google.golang.org/grpc/version.go b/vendor/google.golang.org/grpc/version.go index 1ffec6e2cee..7c70005d083 100644 --- a/vendor/google.golang.org/grpc/version.go +++ b/vendor/google.golang.org/grpc/version.go @@ -19,4 +19,4 @@ package grpc // Version is the current grpc version. 
-const Version = "1.66.0" +const Version = "1.66.2" diff --git a/vendor/modules.txt b/vendor/modules.txt index 0ffd4315e24..d82242ba610 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -834,7 +834,7 @@ google.golang.org/genproto/googleapis/api/httpbody ## explicit; go 1.20 google.golang.org/genproto/googleapis/rpc/errdetails google.golang.org/genproto/googleapis/rpc/status -# google.golang.org/grpc v1.66.0 +# google.golang.org/grpc v1.66.2 ## explicit; go 1.21 google.golang.org/grpc google.golang.org/grpc/attributes