diff --git a/.github/actions/setup-default-test-properties/test-properties.json b/.github/actions/setup-default-test-properties/test-properties.json index 89ef25b21160..a11af748efea 100644 --- a/.github/actions/setup-default-test-properties/test-properties.json +++ b/.github/actions/setup-default-test-properties/test-properties.json @@ -1,15 +1,15 @@ { "PythonTestProperties": { - "ALL_SUPPORTED_VERSIONS": ["3.8", "3.9", "3.10", "3.11"], - "LOWEST_SUPPORTED": ["3.8"], - "HIGHEST_SUPPORTED": ["3.11"], - "ESSENTIAL_VERSIONS": ["3.8", "3.11"], - "CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIONS": ["3.8", "3.11"], + "ALL_SUPPORTED_VERSIONS": ["3.9", "3.10", "3.11", "3.12"], + "LOWEST_SUPPORTED": ["3.9"], + "HIGHEST_SUPPORTED": ["3.12"], + "ESSENTIAL_VERSIONS": ["3.9", "3.12"], + "CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIONS": ["3.9", "3.12"], "CROSS_LANGUAGE_VALIDATES_RUNNER_DATAFLOW_USING_SQL_PYTHON_VERSIONS": ["3.11"], - "VALIDATES_CONTAINER_DATAFLOW_PYTHON_VERSIONS": ["3.8", "3.9", "3.10", "3.11" ], - "LOAD_TEST_PYTHON_VERSION": "3.8", - "CHICAGO_TAXI_EXAMPLE_FLINK_PYTHON_VERSION": "3.8", - "DEFAULT_INTERPRETER": "python3.8", + "VALIDATES_CONTAINER_DATAFLOW_PYTHON_VERSIONS": ["3.9", "3.10", "3.11", "3.12"], + "LOAD_TEST_PYTHON_VERSION": "3.9", + "CHICAGO_TAXI_EXAMPLE_FLINK_PYTHON_VERSION": "3.9", + "DEFAULT_INTERPRETER": "python3.9", "TOX_ENV": ["Cloud", "Cython"] }, "JavaTestProperties": { diff --git a/.github/actions/setup-environment-action/action.yml b/.github/actions/setup-environment-action/action.yml index d3d26c1ad3c0..2da2cf0becd5 100644 --- a/.github/actions/setup-environment-action/action.yml +++ b/.github/actions/setup-environment-action/action.yml @@ -42,7 +42,7 @@ runs: if: ${{ inputs.python-version != '' }} uses: actions/setup-python@v4 with: - python-version: ${{ inputs.python-version == 'default' && '3.8' || inputs.python-version }} + python-version: ${{ inputs.python-version == 'default' && '3.9' || inputs.python-version }} - name: Install Java if: ${{ inputs.java-version != '' }} uses: actions/setup-java@v3 diff --git a/.github/build.gradle b/.github/build.gradle index 8105d8130186..c87a98109aeb 100644 --- a/.github/build.gradle +++ b/.github/build.gradle @@ -44,13 +44,13 @@ task check { paths = workflow.getAt(true).pull_request_target.paths as List } catch (Exception e) { errors.add("Fail to get the trigger path for ${fname}. 
" + - "Make sure it has a pull_request_target trigger.") + "Make sure it has a pull_request_target trigger.") return } // precommit and postcommit should triggered by this specific file // this is to ensure not missing test during release branch verification - if (paths != null && !paths.contains('release/trigger_all_tests.json') && !fname.toLowerCase().contains('sickbay')) { + if (paths != null && !paths.contains('release/trigger_all_tests.json') && !fname.toLowerCase().contains('sickbay') && !workflow.name.toLowerCase().contains('disabled')) { errors.add("Error validating ${fname}: " + "Please add 'release/trigger_all_tests.json' to the trigger path") return diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 1efc8e9e4405..3f63c0c9975f 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 1 + "modification": 2 } diff --git a/.github/trigger_files/beam_PostCommit_Python.json b/.github/trigger_files/beam_PostCommit_Python.json index 2934a91b84b1..30ee463ad4e9 100644 --- a/.github/trigger_files/beam_PostCommit_Python.json +++ b/.github/trigger_files/beam_PostCommit_Python.json @@ -1,5 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "modification": 1 + "modification": 2 } diff --git a/.github/trigger_files/beam_PreCommit_Python_ML.json b/.github/trigger_files/beam_PreCommit_Python_ML.json new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml b/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml index f3db177001ec..1448783a24c4 100644 --- a/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml +++ b/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml @@ -73,8 +73,8 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 3.9 + 3.10 - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: diff --git a/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml b/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml index c96e84e4d7bf..685241afed94 100644 --- a/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml +++ b/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml @@ -72,7 +72,7 @@ jobs: - name: Setup Python environment uses: ./.github/actions/setup-environment-action with: - python-version: default + python-version: '3.10' - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: @@ -95,7 +95,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.10 \ -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_1 }} --job_name=benchmark-tests-pytorch-imagenet-python-101-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet101-${{env.NOW_UTC}}.txt' \ - name: run Pytorch Imagenet Classification with Resnet 152 @@ -106,7 +106,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + 
-PpythonVersion=3.10 \ -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_2 }} --job_name=benchmark-tests-pytorch-imagenet-python-152-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet152-${{env.NOW_UTC}}.txt' \ - name: run Pytorch Language Modeling using Hugging face bert-base-uncased model @@ -117,7 +117,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.10 \ -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_3 }} --job_name=benchmark-tests-pytorch-language-modeling-bert-base-uncased-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_bert_base_uncased-${{env.NOW_UTC}}.txt' \ - name: run Pytorch Langauge Modeling using Hugging Face bert-large-uncased model @@ -128,7 +128,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.10 \ -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_4 }} --job_name=benchmark-tests-pytorch-language-modeling-bert-large-uncased-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased-${{env.NOW_UTC}}.txt' \ - name: run Pytorch Imagenet Classification with Resnet 152 with Tesla T4 GPU @@ -139,6 +139,6 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.10 \ -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_5 }} --job_name=benchmark-tests-pytorch-imagenet-python-gpu-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet152_gpu-${{env.NOW_UTC}}.txt' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml index 87854bdb8f84..2bc4e50d4792 100644 --- a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml @@ -96,7 +96,7 @@ jobs: --info \ -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-cogbk-1-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK 2GB of 100B records with multiple keys uses: ./.github/actions/gradle-command-self-hosted-action @@ -105,7 +105,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-cogbk-2-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK 
reiterate 4 times 10kB values uses: ./.github/actions/gradle-command-self-hosted-action @@ -114,7 +114,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-cogbk-3-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK reiterate 4 times 2MB values uses: ./.github/actions/gradle-command-self-hosted-action @@ -123,5 +123,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-cogbk-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml index f9e62e9965a8..b8607fb71603 100644 --- a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml @@ -95,7 +95,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-cogbk-1-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK 2GB of 100B records with multiple keys uses: ./.github/actions/gradle-command-self-hosted-action @@ -104,7 +104,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-cogbk-2-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK reiterate 4 times 10kB values uses: ./.github/actions/gradle-command-self-hosted-action @@ -113,7 +113,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-cogbk-3-${{ steps.datetime.outputs.datetime }}' \ - name: run CoGBK reiterate 4 times 2MB values uses: ./.github/actions/gradle-command-self-hosted-action @@ -122,5 +122,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_4 }} --job_name=load-tests-python-dataflow-streaming-cogbk-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml index 76e2a417c25c..866a5871962d 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml @@ -92,7 +92,7 @@ jobs: arguments: | 
-PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-combine-1-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Python Load Test 2 (fanout 4) uses: ./.github/actions/gradle-command-self-hosted-action @@ -101,7 +101,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-combine-2-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Python Load Test 3 (fanout 8) uses: ./.github/actions/gradle-command-self-hosted-action @@ -110,5 +110,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-combine-3-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml index 306f71662eb4..e77db13ecb7f 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml @@ -92,7 +92,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-combine-1-${{env.NOW_UTC}}' \ - name: run 2GB Fanout 4 test uses: ./.github/actions/gradle-command-self-hosted-action @@ -101,7 +101,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-combine-4-${{env.NOW_UTC}}' \ - name: run 2GB Fanout 8 test uses: ./.github/actions/gradle-command-self-hosted-action @@ -110,5 +110,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-combine-5-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml index 230f6398e522..0f666a0b7db6 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml @@ -107,7 +107,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=PortableRunner \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_1 }} 
--job_name=load-tests-python-flink-batch-combine-1-${{env.NOW_UTC}}' \ @@ -121,7 +121,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=PortableRunner \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-combine-4-${{env.NOW_UTC}}' \ @@ -130,7 +130,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=PortableRunner \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_3 }} --job_name=load-tests-python-flink-batch-combine-5-${{env.NOW_UTC}}' \ diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml index 0ea12f49a5f1..6f491e6b9fa9 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml @@ -104,7 +104,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=PortableRunner \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Streaming_test_arguments_1 }} --job_name=load-tests-python-flink-streaming-combine-4-${{env.NOW_UTC}}' \ @@ -113,7 +113,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=PortableRunner \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Streaming_test_arguments_2 }} --job_name=load-tests-python-flink-streaming-combine-5-${{env.NOW_UTC}}' \ diff --git a/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml b/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml index da6b0beee92c..d69efb4636bb 100644 --- a/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml +++ b/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml @@ -87,5 +87,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.microbenchmarks_test \ -Prunner=DirectRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_FnApiRunner_Microbenchmark_test_arguments_1 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml index 6e1e6a09f200..d2924a081255 100644 --- a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml @@ -94,7 +94,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-gbk-1-${{env.NOW_UTC}}' \ - name: run 2GB of 100B records test uses: ./.github/actions/gradle-command-self-hosted-action @@ -103,7 +103,7 @@ jobs: arguments: | 
-PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-2-${{env.NOW_UTC}}' \ - name: run 2GB of 100kB records test uses: ./.github/actions/gradle-command-self-hosted-action @@ -112,7 +112,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-gbk-3-${{env.NOW_UTC}}' \ - name: run fanout 4 times with 2GB 10-byte records test uses: ./.github/actions/gradle-command-self-hosted-action @@ -121,7 +121,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-gbk-4-${{env.NOW_UTC}}' \ - name: run fanout 8 times with 2GB 10-byte records total test uses: ./.github/actions/gradle-command-self-hosted-action @@ -130,5 +130,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_5 }} --job_name=load-tests-python-dataflow-batch-gbk-5-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml index d8d3e35f17a8..70321f2414a0 100644 --- a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml @@ -90,7 +90,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-gbk-3-${{env.NOW_UTC}}' \ # // TODO(https://github.com/apache/beam/issues/20403). 
Skipping some cases because they are too slow: diff --git a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml index 7f8e2f197359..f99d2a3f7387 100644 --- a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml @@ -91,7 +91,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-gbk-6-${{env.NOW_UTC}}' \ - name: run reiterate 4 times 2MB values test uses: ./.github/actions/gradle-command-self-hosted-action @@ -100,5 +100,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-7-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml index 5b07d15337fa..d7e31f1edcac 100644 --- a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml @@ -91,7 +91,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-gbk-6-${{env.NOW_UTC}}' \ - name: run reiterate 4 times 2MB values test uses: ./.github/actions/gradle-command-self-hosted-action @@ -100,5 +100,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-gbk-7-${{env.NOW_UTC}}' \ diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml index 8dc8f031b7e6..b4f505648702 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml @@ -95,7 +95,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-pardo-1-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Batch Python Load Test 2 (200 iterations) uses: ./.github/actions/gradle-command-self-hosted-action @@ -104,7 +104,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_2 }} 
--job_name=load-tests-python-dataflow-batch-pardo-2-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Batch Python Load Test 3 (10 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -113,7 +113,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-pardo-3-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Batch Python Load Test 4 (100 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -122,5 +122,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml index f0d0778d98ca..fabd893afaaf 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml @@ -95,7 +95,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-pardo-1-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Streaming Python Load Test 2 (200 iterations) uses: ./.github/actions/gradle-command-self-hosted-action @@ -104,7 +104,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-pardo-2-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Streaming Python Load Test 3 (10 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -113,7 +113,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-pardo-3-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Dataflow Streaming Python Load Test 4 (100 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -122,5 +122,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_4 }} --job_name=load-tests-python-dataflow-streaming-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml index bae2f9f82ee1..b6c86e01c299 100644 --- 
a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml @@ -109,7 +109,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_1 }} --job_name=load-tests-python-flink-batch-pardo-1-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Batch Python Load Test 2 (200 iterations) uses: ./.github/actions/gradle-command-self-hosted-action @@ -118,7 +118,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-pardo-3-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Batch Python Load Test 3 (10 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -127,5 +127,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_3 }} --job_name=load-tests-python-flink-batch-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml index 4485b7187f80..a6443c0df10b 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml @@ -111,7 +111,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_1 }} --job_name=load-tests-python-flink-streaming-pardo-1-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Streaming Python Load Test 2 (200 iterations) uses: ./.github/actions/gradle-command-self-hosted-action @@ -120,7 +120,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_2 }} --job_name=load-tests-python-flink-streaming-pardo-2-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Streaming Python Load Test 3 (10 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -129,7 +129,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_3 }} --job_name=load-tests-python-flink-streaming-pardo-3-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Streaming Python Load Test 4 (100 counters) uses: ./.github/actions/gradle-command-self-hosted-action @@ -138,7 +138,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_4 }} 
--job_name=load-tests-python-flink-streaming-pardo-4-${{ steps.datetime.outputs.datetime }}' \ - name: run ParDo Flink Streaming Python Load Test 5 (5 iterations) uses: ./.github/actions/gradle-command-self-hosted-action @@ -147,7 +147,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ -Prunner=PortableRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_5 }} --job_name=load-tests-python-flink-streaming-pardo-6-${{ steps.datetime.outputs.datetime }}' \ - name: Teardown Flink if: always() diff --git a/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml index b93474b2a21d..7917af0ff5d2 100644 --- a/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml @@ -101,7 +101,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-sideinput-1-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 2 (1gb-1kb-10workers-1window-99key-percent-dict) uses: ./.github/actions/gradle-command-self-hosted-action @@ -110,7 +110,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-sideinput-2-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 3 (10gb-1kb-10workers-1window-first-iterable) uses: ./.github/actions/gradle-command-self-hosted-action @@ -119,7 +119,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-sideinput-3-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 4 (10gb-1kb-10workers-1window-iterable) uses: ./.github/actions/gradle-command-self-hosted-action @@ -128,7 +128,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-sideinput-4-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 5 (1gb-1kb-10workers-1window-first-list) uses: ./.github/actions/gradle-command-self-hosted-action @@ -137,7 +137,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_5 }} --job_name=load-tests-python-dataflow-batch-sideinput-5-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 6 (1gb-1kb-10workers-1window-list) uses: 
./.github/actions/gradle-command-self-hosted-action @@ -146,7 +146,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_6 }} --job_name=load-tests-python-dataflow-batch-sideinput-6-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 7 (1gb-1kb-10workers-1000window-1key-percent-dict) uses: ./.github/actions/gradle-command-self-hosted-action @@ -155,7 +155,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_7 }} --job_name=load-tests-python-dataflow-batch-sideinput-7-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 8 (1gb-1kb-10workers-1000window-99key-percent-dict) uses: ./.github/actions/gradle-command-self-hosted-action @@ -164,7 +164,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_8 }} --job_name=load-tests-python-dataflow-batch-sideinput-8-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 9 (10gb-1kb-10workers-1000window-first-iterable) uses: ./.github/actions/gradle-command-self-hosted-action @@ -173,7 +173,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_9 }} --job_name=load-tests-python-dataflow-batch-sideinput-9-${{ steps.datetime.outputs.datetime }}' \ - name: run SideInput Dataflow Batch Python Load Test 10 (10gb-1kb-10workers-1000window-iterable) uses: ./.github/actions/gradle-command-self-hosted-action @@ -182,5 +182,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_10 }} --job_name=load-tests-python-dataflow-batch-sideinput-10-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Smoke.yml b/.github/workflows/beam_LoadTests_Python_Smoke.yml index 39e8e4b56102..22dc63f19faa 100644 --- a/.github/workflows/beam_LoadTests_Python_Smoke.yml +++ b/.github/workflows/beam_LoadTests_Python_Smoke.yml @@ -90,7 +90,7 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DirectRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_LoadTests_Python_Smoke_test_arguments_1 }} --job_name=load-tests-python-direct-batch-gbk-smoke-${{ steps.datetime.outputs.datetime }}' \ - name: run GroupByKey Python load test Dataflow uses: ./.github/actions/gradle-command-self-hosted-action @@ -99,5 +99,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ 
env.beam_LoadTests_Python_Smoke_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-smoke-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml index 6f8e5fc9bc8d..3673ca170555 100644 --- a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml @@ -89,6 +89,6 @@ jobs: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | -PloadTest.mainClass=apache_beam.io.gcp.bigquery_read_perf_test \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -Prunner=DataflowRunner \ '-PloadTest.args=${{env.beam_PerformanceTests_BiqQueryIO_Read_Python_test_arguments_1}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml index 5d7b451071f1..f52eec3fa2c8 100644 --- a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml @@ -89,6 +89,6 @@ jobs: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | -PloadTest.mainClass=apache_beam.io.gcp.bigquery_write_perf_test \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -Prunner=DataflowRunner \ '-PloadTest.args=${{env.beam_PerformanceTests_BiqQueryIO_Write_Python_Batch_test_arguments_1}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml b/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml index 22a71967d3a4..e358f113a668 100644 --- a/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml +++ b/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml @@ -90,5 +90,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.io.gcp.pubsub_io_perf_test \ -Prunner=TestDataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_PerformanceTests_PubsubIOIT_Python_Streaming_test_arguments_1 }}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml b/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml index 2de75b2496a8..3448bbcf96c5 100644 --- a/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml +++ b/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml @@ -90,5 +90,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.io.gcp.experimental.spannerio_read_perf_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.args='${{env.beam_PerformanceTests_SpannerIO_Read_2GB_Python_test_arguments_1}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml b/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml index 00292aee45ac..ba9d05e46838 100644 --- a/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml +++ b/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml @@ -90,5 +90,5 @@ jobs: arguments: | -PloadTest.mainClass=apache_beam.io.gcp.experimental.spannerio_write_perf_test \ -Prunner=DataflowRunner \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ 
-PloadTest.args='${{env.beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch_test_arguments_1}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml b/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml index f9ca3d949ced..39bd0ab467d2 100644 --- a/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml +++ b/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml @@ -88,7 +88,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:load_tests:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ -PloadTest.mainClass=apache_beam.io.filebasedio_perf_test \ -Prunner=DataflowRunner \ '-PloadTest.args=${{env.beam_PerformanceTests_TextIOIT_Python_test_arguments_1}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml b/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml index 21dfa13b25dc..e9ef9cd1716a 100644 --- a/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml +++ b/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml @@ -64,7 +64,7 @@ jobs: job_name: ["beam_PerformanceTests_WordCountIT_PythonVersions"] job_phrase_1: [Run Python] job_phrase_2: [WordCountIT Performance Test] - python_version: ['3.8'] + python_version: ['3.9'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml index 8abc8a3199dd..00c62edc34ad 100644 --- a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml +++ b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml @@ -118,5 +118,5 @@ jobs: arguments: | -Prunner=DataflowRunner \ -PloadTest.mainClass=apache_beam.io.external.xlang_kafkaio_perf_test \ - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ '-PloadTest.args=${{ env.beam_PerformanceTests_xlang_KafkaIO_Python_test_arguments_1 }}' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_PortableJar_Flink.yml b/.github/workflows/beam_PostCommit_PortableJar_Flink.yml index 5347dc45642b..37bfe68d9b20 100644 --- a/.github/workflows/beam_PostCommit_PortableJar_Flink.yml +++ b/.github/workflows/beam_PostCommit_PortableJar_Flink.yml @@ -79,9 +79,9 @@ jobs: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :sdks:python:test-suites:portable:py38:testPipelineJarFlinkRunner + gradle-command: :sdks:python:test-suites:portable:py39:testPipelineJarFlinkRunner arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ - name: Archive Python Test Results uses: actions/upload-artifact@v4 if: failure() diff --git a/.github/workflows/beam_PostCommit_PortableJar_Spark.yml b/.github/workflows/beam_PostCommit_PortableJar_Spark.yml index 3778f017d1cc..ce7be60133d7 100644 --- a/.github/workflows/beam_PostCommit_PortableJar_Spark.yml +++ b/.github/workflows/beam_PostCommit_PortableJar_Spark.yml @@ -79,9 +79,9 @@ jobs: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :sdks:python:test-suites:portable:py38:testPipelineJarSparkRunner + gradle-command: :sdks:python:test-suites:portable:py39:testPipelineJarSparkRunner arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ - name: Archive Python Test Results uses: actions/upload-artifact@v4 if: failure() diff --git 
a/.github/workflows/beam_PostCommit_Python.yml b/.github/workflows/beam_PostCommit_Python.yml index 6705268143e9..4770515c75fb 100644 --- a/.github/workflows/beam_PostCommit_Python.yml +++ b/.github/workflows/beam_PostCommit_Python.yml @@ -60,7 +60,7 @@ jobs: matrix: job_name: [beam_PostCommit_Python] job_phrase: [Run Python PostCommit] - python_version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python_version: ['3.9', '3.10', '3.11', '3.12'] if: | github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PostCommit_Python_Arm.yml b/.github/workflows/beam_PostCommit_Python_Arm.yml index f9438c3c644d..48fb00b1bb9d 100644 --- a/.github/workflows/beam_PostCommit_Python_Arm.yml +++ b/.github/workflows/beam_PostCommit_Python_Arm.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: [beam_PostCommit_Python_Arm] job_phrase: [Run Python PostCommit Arm] - python_version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python_version: ['3.9', '3.10', '3.11', '3.12'] if: | github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PostCommit_Python_Dependency.yml b/.github/workflows/beam_PostCommit_Python_Dependency.yml index 04c22e4ab07d..6e7c4ddbd3eb 100644 --- a/.github/workflows/beam_PostCommit_Python_Dependency.yml +++ b/.github/workflows/beam_PostCommit_Python_Dependency.yml @@ -81,7 +81,7 @@ jobs: - name: Run postCommitPyDep uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :sdks:python:test-suites:tox:py38:postCommitPyDep + gradle-command: :sdks:python:test-suites:tox:py39:postCommitPyDep arguments: -PuseWheelDistribution - name: Archive Python Test Results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml index 7d3bb65a20e1..a6bb49f4e444 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_Examples_Direct"] job_phrase: ["Run Python Examples_Direct"] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml index f88d7e205cc6..bda807eb147b 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_Examples_Flink"] job_phrase: ["Run Python Examples_Flink"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml index 4300179421b5..d866d412507b 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_Examples_Spark"] job_phrase: ["Run Python Examples_Spark"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git 
a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml index 601cb99a44fb..3d47fb86889d 100644 --- a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml +++ b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml @@ -133,7 +133,7 @@ jobs: with: gradle-command: :sdks:python:apache_beam:testing:benchmarks:nexmark:run arguments: | - -PpythonVersion=3.8 \ + -PpythonVersion=3.9 \ "-Pnexmark.args=${{ env.GRADLE_PYTHON_COMMAND_ARGUMENTS }} \ --query=${{ matrix.query }} \ --input=gs://temp-storage-for-perf-tests/nexmark/eventFiles/beam_PostCommit_Python_Nexmark_Direct/query${{ matrix.query }}-\*" \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml index ec7a28d2db2c..bcd936324124 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml @@ -65,7 +65,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesContainer_Dataflow"] job_phrase: ["Run Python Dataflow ValidatesContainer"] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml index b90c150291dd..f2eba045722c 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC"] job_phrase: ["Run Python RC Dataflow ValidatesContainer"] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml index 8df5d00287bc..1876950c7a93 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesRunner_Dataflow"] job_phrase: ["Run Python Dataflow ValidatesRunner"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml index b301402f4de2..f837c7476e12 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesRunner_Flink"] job_phrase: ["Run Python Flink ValidatesRunner"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml index 6c89b110ec7a..91c249adf338 100644 --- 
a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesRunner_Samza"] job_phrase: ["Run Python Samza ValidatesRunner"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml index 66d1ac6756c4..7e87aaff22cc 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Python_ValidatesRunner_Spark"] job_phrase: ["Run Python Spark ValidatesRunner"] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml index eb204ce50349..b3f37c6b39f0 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml @@ -74,7 +74,7 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 3.12 - name: run PostCommit Python Xlang Gcp Dataflow script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml index 63417231cb96..137d7bc13d2f 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml @@ -74,7 +74,7 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 3.12 - name: run PostCommit Python Xlang Gcp Direct script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml index ac90c2cd66b8..8fc0db189078 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml @@ -74,7 +74,7 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 3.12 - name: run PostCommit Python Xlang IO Dataflow script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_Sickbay_Python.yml b/.github/workflows/beam_PostCommit_Sickbay_Python.yml index 91211df97f7f..6d253e03723d 100644 --- a/.github/workflows/beam_PostCommit_Sickbay_Python.yml +++ b/.github/workflows/beam_PostCommit_Sickbay_Python.yml @@ -59,7 +59,7 @@ jobs: job_name: [beam_PostCommit_Sickbay_Python] job_phrase_1: [Run Python] job_phrase_2: [PostCommit Sickbay] - python_version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python_version: ['3.9', '3.10', '3.11', '3.12'] if: | github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PostCommit_TransformService_Direct.yml b/.github/workflows/beam_PostCommit_TransformService_Direct.yml index 966938e43e92..cb339eb9fb40 100644 --- a/.github/workflows/beam_PostCommit_TransformService_Direct.yml +++ 
b/.github/workflows/beam_PostCommit_TransformService_Direct.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_TransformService_Direct"] job_phrase: ["Run TransformService_Direct PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -76,7 +76,7 @@ jobs: with: java-version: 11 python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run TransformService Direct script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_XVR_Direct.yml b/.github/workflows/beam_PostCommit_XVR_Direct.yml index ec66cab88e9b..023ae4f8cd31 100644 --- a/.github/workflows/beam_PostCommit_XVR_Direct.yml +++ b/.github/workflows/beam_PostCommit_XVR_Direct.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_Direct"] job_phrase: ["Run XVR_Direct PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -75,12 +75,12 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR Direct script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version != '3.8' }} + if: ${{ matrix.python_version != '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:python:test-suites:direct:xlang:validatesCrossLanguageRunner @@ -90,7 +90,7 @@ jobs: - name: run PostCommit XVR Direct script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version == '3.8' }} + if: ${{ matrix.python_version == '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:python:test-suites:direct:xlang:validatesCrossLanguageRunner diff --git a/.github/workflows/beam_PostCommit_XVR_Flink.yml b/.github/workflows/beam_PostCommit_XVR_Flink.yml index d88b502988ef..5cde38d24244 100644 --- a/.github/workflows/beam_PostCommit_XVR_Flink.yml +++ b/.github/workflows/beam_PostCommit_XVR_Flink.yml @@ -63,7 +63,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_Flink"] job_phrase: ["Run XVR_Flink PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -76,12 +76,12 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR Flink script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version != '3.8' }} + if: ${{ matrix.python_version != '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:flink:${{ env.FlinkVersion }}:job-server:validatesCrossLanguageRunner @@ -91,7 +91,7 @@ jobs: - name: run PostCommit XVR Flink script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version == '3.8' }} + if: ${{ matrix.python_version == '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:flink:${{ env.FlinkVersion }}:job-server:validatesCrossLanguageRunner diff --git a/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml index 09e398288ff4..228f10b90cd0 100644 --- a/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml @@ 
-13,13 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: PostCommit XVR GoUsingJava Dataflow +# TODO(https://github.com/apache/beam/issues/32492): re-enable the suite +# on cron and add release/trigger_all_tests.json to trigger path once fixed. + +name: PostCommit XVR GoUsingJava Dataflow (DISABLED) on: - schedule: - - cron: '45 5/6 * * *' + # schedule: + # - cron: '45 5/6 * * *' pull_request_target: - paths: ['release/trigger_all_tests.json', '.github/trigger_files/beam_PostCommit_XVR_GoUsingJava_Dataflow.json'] + paths: ['.github/trigger_files/beam_PostCommit_XVR_GoUsingJava_Dataflow.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event diff --git a/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml index 779d5881ca7a..66770c9a1683 100644 --- a/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_JavaUsingPython_Dataflow"] job_phrase: ["Run XVR_JavaUsingPython_Dataflow PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -75,7 +75,7 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR JavaUsingPython Dataflow script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml index 14404e8a9a41..f1269a0ddd09 100644 --- a/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_PythonUsingJava_Dataflow"] job_phrase: ["Run XVR_PythonUsingJava_Dataflow PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -75,7 +75,7 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR PythonUsingJava Dataflow script uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_PostCommit_XVR_Samza.yml b/.github/workflows/beam_PostCommit_XVR_Samza.yml index 2d854a3678e7..2d26c9131839 100644 --- a/.github/workflows/beam_PostCommit_XVR_Samza.yml +++ b/.github/workflows/beam_PostCommit_XVR_Samza.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_Samza"] job_phrase: ["Run XVR_Samza PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -77,12 +77,12 @@ jobs: with: java-version: 8 python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR Samza script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version != '3.8' }} + if: ${{ matrix.python_version != '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:samza:job-server:validatesCrossLanguageRunner @@ -92,7 +92,7 @@ jobs: - name: run PostCommit XVR Samza script 
env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version == '3.8' }} + if: ${{ matrix.python_version == '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:samza:job-server:validatesCrossLanguageRunner diff --git a/.github/workflows/beam_PostCommit_XVR_Spark3.yml b/.github/workflows/beam_PostCommit_XVR_Spark3.yml index 83554ecfa84c..c1880e01292b 100644 --- a/.github/workflows/beam_PostCommit_XVR_Spark3.yml +++ b/.github/workflows/beam_PostCommit_XVR_Spark3.yml @@ -62,7 +62,7 @@ jobs: matrix: job_name: ["beam_PostCommit_XVR_Spark3"] job_phrase: ["Run XVR_Spark3 PostCommit"] - python_version: ['3.8','3.12'] + python_version: ['3.9','3.12'] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -75,12 +75,12 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: | - 3.8 + 3.9 ${{ matrix.python_version }} - name: run PostCommit XVR Spark3 script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version != '3.8' }} + if: ${{ matrix.python_version != '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:spark:3:job-server:validatesCrossLanguageRunner @@ -90,7 +90,7 @@ jobs: - name: run PostCommit XVR Spark3 script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} - if: ${{ matrix.python_version == '3.8' }} + if: ${{ matrix.python_version == '3.9' }} uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :runners:spark:3:job-server:validatesCrossLanguageRunner diff --git a/.github/workflows/beam_PreCommit_Portable_Python.yml b/.github/workflows/beam_PreCommit_Portable_Python.yml index e1e1e6033087..037df9a17c45 100644 --- a/.github/workflows/beam_PreCommit_Portable_Python.yml +++ b/.github/workflows/beam_PreCommit_Portable_Python.yml @@ -86,7 +86,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Portable_Python'] job_phrase: ['Run Portable_Python PreCommit'] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || @@ -106,7 +106,7 @@ jobs: java-version: default python-version: | ${{ matrix.python_version }} - 3.8 + 3.9 - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_Python.yml b/.github/workflows/beam_PreCommit_Python.yml index 2fbab55819f3..fb1c6c80873a 100644 --- a/.github/workflows/beam_PreCommit_Python.yml +++ b/.github/workflows/beam_PreCommit_Python.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python'] job_phrase: ['Run Python PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_PythonDocker.yml b/.github/workflows/beam_PreCommit_PythonDocker.yml index 129429238b1f..63fc6d55e19a 100644 --- a/.github/workflows/beam_PreCommit_PythonDocker.yml +++ b/.github/workflows/beam_PreCommit_PythonDocker.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ["beam_PreCommit_PythonDocker"] job_phrase: ["Run PythonDocker PreCommit"] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_Coverage.yml b/.github/workflows/beam_PreCommit_Python_Coverage.yml index 
a22e3c338eed..0e295250817d 100644 --- a/.github/workflows/beam_PreCommit_Python_Coverage.yml +++ b/.github/workflows/beam_PreCommit_Python_Coverage.yml @@ -87,7 +87,7 @@ jobs: - name: Run preCommitPyCoverage uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :sdks:python:test-suites:tox:py38:preCommitPyCoverage + gradle-command: :sdks:python:test-suites:tox:py39:preCommitPyCoverage - uses: codecov/codecov-action@v3 with: flags: python diff --git a/.github/workflows/beam_PreCommit_Python_Dataframes.yml b/.github/workflows/beam_PreCommit_Python_Dataframes.yml index f498dadae92d..f045842e061d 100644 --- a/.github/workflows/beam_PreCommit_Python_Dataframes.yml +++ b/.github/workflows/beam_PreCommit_Python_Dataframes.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_Dataframes'] job_phrase: ['Run Python_Dataframes PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_Examples.yml b/.github/workflows/beam_PreCommit_Python_Examples.yml index 0fb404e7f55b..09d46217d6d6 100644 --- a/.github/workflows/beam_PreCommit_Python_Examples.yml +++ b/.github/workflows/beam_PreCommit_Python_Examples.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_Examples'] job_phrase: ['Run Python_Examples PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_Integration.yml b/.github/workflows/beam_PreCommit_Python_Integration.yml index a2d80806d2bf..20aade431f6d 100644 --- a/.github/workflows/beam_PreCommit_Python_Integration.yml +++ b/.github/workflows/beam_PreCommit_Python_Integration.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_Integration'] job_phrase: ['Run Python_Integration PreCommit'] - python_version: ['3.8', '3.12'] + python_version: ['3.9', '3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_ML.yml b/.github/workflows/beam_PreCommit_Python_ML.yml index c5e596e3b421..714eceef5f6b 100644 --- a/.github/workflows/beam_PreCommit_Python_ML.yml +++ b/.github/workflows/beam_PreCommit_Python_ML.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_ML'] job_phrase: ['Run Python_ML PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_Runners.yml b/.github/workflows/beam_PreCommit_Python_Runners.yml index 66037cfaffdb..5db6e94be781 100644 --- a/.github/workflows/beam_PreCommit_Python_Runners.yml +++ b/.github/workflows/beam_PreCommit_Python_Runners.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_Runners'] job_phrase: ['Run Python_Runners PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Python_Transforms.yml b/.github/workflows/beam_PreCommit_Python_Transforms.yml index caec491a7515..820ca3e26df6 100644 --- 
a/.github/workflows/beam_PreCommit_Python_Transforms.yml +++ b/.github/workflows/beam_PreCommit_Python_Transforms.yml @@ -64,7 +64,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Python_Transforms'] job_phrase: ['Run Python_Transforms PreCommit'] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || diff --git a/.github/workflows/beam_PreCommit_Xlang_Generated_Transforms.yml b/.github/workflows/beam_PreCommit_Xlang_Generated_Transforms.yml index c7ba234bef4b..f79712ac2d76 100644 --- a/.github/workflows/beam_PreCommit_Xlang_Generated_Transforms.yml +++ b/.github/workflows/beam_PreCommit_Xlang_Generated_Transforms.yml @@ -82,7 +82,7 @@ jobs: matrix: job_name: ['beam_PreCommit_Xlang_Generated_Transforms'] job_phrase: ['Run Xlang_Generated_Transforms PreCommit'] - python_version: ['3.8'] + python_version: ['3.9'] if: | github.event_name == 'push' || github.event_name == 'workflow_dispatch' || diff --git a/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml b/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml index 72de1f15229f..61ef31a00239 100644 --- a/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml +++ b/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml @@ -71,6 +71,7 @@ jobs: - "python:container:py310" - "python:container:py311" - "python:container:py312" + - "java:expansion-service:container" steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml index e95b2a43845e..c2b360a7ce71 100644 --- a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml +++ b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml @@ -54,7 +54,7 @@ jobs: matrix: job_name: [beam_Python_ValidatesContainer_Dataflow_ARM] job_phrase: [Run Python ValidatesContainer Dataflow ARM] - python_version: ['3.8','3.9','3.10','3.11','3.12'] + python_version: ['3.9','3.10','3.11','3.12'] if: | github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'apache/beam') || diff --git a/.github/workflows/build_release_candidate.yml b/.github/workflows/build_release_candidate.yml index f944ce90c9f1..ec65ae99072a 100644 --- a/.github/workflows/build_release_candidate.yml +++ b/.github/workflows/build_release_candidate.yml @@ -260,10 +260,10 @@ jobs: with: distribution: 'temurin' java-version: '11' - - name: Install Python 3.8 + - name: Install Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - run: echo $JAVA_HOME - run: echo "JAVA11_HOME=${JAVA_HOME}" >> "$GITHUB_OUTPUT" id: export-java11 @@ -310,10 +310,10 @@ jobs: path: beam-site token: ${{ github.event.inputs.REPO_TOKEN }} ref: release-docs - - name: Install Python 3.8 + - name: Install Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install node uses: actions/setup-node@v4 with: diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index f97f4de50aee..25030231a5e8 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -92,7 +92,7 @@ jobs: - name: Install python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Get tag id: get_tag run: | @@ -248,7 +248,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: 3.8 
+ python-version: 3.9 - uses: docker/setup-qemu-action@v1 if: ${{matrix.os_python.arch == 'aarch64'}} name: Set up QEMU diff --git a/.github/workflows/dask_runner_tests.yml b/.github/workflows/dask_runner_tests.yml index 5f39852c228c..f87c70d8b720 100644 --- a/.github/workflows/dask_runner_tests.yml +++ b/.github/workflows/dask_runner_tests.yml @@ -43,7 +43,7 @@ jobs: - name: Install python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Build source working-directory: ./sdks/python run: pip install -U build && python -m build --sdist @@ -64,7 +64,6 @@ jobs: matrix: os: [ubuntu-latest, macos-latest, windows-latest] params: [ - {"py_ver": "3.8", "tox_env": "py38"}, {"py_ver": "3.9", "tox_env": "py39"}, {"py_ver": "3.10", "tox_env": "py310" }, ] diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt index 12ffc1790e46..5d1a0be9950e 100644 --- a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt +++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt @@ -22,7 +22,7 @@ --parallelism=5 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --top_count=20 --streaming --use_stateful_load_generator diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt index c7d5552a03bd..650236a9c500 100644 --- a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt +++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt @@ -22,7 +22,7 @@ --parallelism=16 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --fanout=4 --top_count=20 --streaming diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt index bffdeab2cb11..4208571fef62 100644 --- a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt +++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt @@ -22,7 +22,7 @@ --parallelism=16 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --fanout=8 --top_count=20 --streaming diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt index 4cb5bfb0d988..f4f5e7de8369 100644 --- a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt @@ -24,5 +24,5 @@ --parallelism=5 
--job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt index bf9085141eab..df27dc7c4470 100644 --- a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt @@ -24,5 +24,5 @@ --parallelism=16 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt index a59f873eb775..6b87f61eed8a 100644 --- a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt @@ -24,5 +24,5 @@ --parallelism=16 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt index 0e5d00b96151..621777663be0 100644 --- a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt @@ -24,5 +24,5 @@ --parallelism=5 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt index 4d8bda8ac2f8..fe451559e625 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt @@ -25,5 +25,5 @@ --parallelism=5 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git 
a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt index e84cee2f50cf..dd5addb65d14 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt @@ -25,5 +25,5 @@ --parallelism=5 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt index 4d8bda8ac2f8..fe451559e625 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt @@ -25,5 +25,5 @@ --parallelism=5 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt index b17e2cecc2c8..308deb3ecf4d 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt @@ -26,6 +26,6 @@ --streaming --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --use_stateful_load_generator --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt index 957bc6c086d8..78ecc1fd98dd 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt @@ -26,6 +26,6 @@ --streaming --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --use_stateful_load_generator --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt index baa34ec455b5..04a1213d4039 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt @@ -27,6 +27,6 @@ --stateful --job_endpoint=localhost:8099 --environment_type=DOCKER 
---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --use_stateful_load_generator --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt index 44483a6e51cc..a2f7d7600da8 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt @@ -26,6 +26,6 @@ --streaming --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --use_stateful_load_generator --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt index 571b33fb7a49..f49be6c70582 100644 --- a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt @@ -30,6 +30,6 @@ --shutdown_sources_after_idle_ms=300000 --job_endpoint=localhost:8099 --environment_type=DOCKER ---environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest --use_stateful_load_generator --runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/playground_backend_precommit.yml b/.github/workflows/playground_backend_precommit.yml index 79517e705c27..9ba6cf20534f 100644 --- a/.github/workflows/playground_backend_precommit.yml +++ b/.github/workflows/playground_backend_precommit.yml @@ -33,7 +33,7 @@ jobs: runs-on: ubuntu-latest env: DATASTORE_EMULATOR_VERSION: '423.0.0' - PYTHON_VERSION: '3.8' + PYTHON_VERSION: '3.9' JAVA_VERSION: '11' steps: - name: Check out the repo diff --git a/.github/workflows/python_dependency_tests.yml b/.github/workflows/python_dependency_tests.yml index fed1056b90b2..2eaa9e4ce5aa 100644 --- a/.github/workflows/python_dependency_tests.yml +++ b/.github/workflows/python_dependency_tests.yml @@ -26,7 +26,6 @@ jobs: matrix: os: [ubuntu-latest] params: [ - {"py_ver": "3.8", "py_env": "py38"}, {"py_ver": "3.9", "py_env": "py39"}, {"py_ver": "3.10", "py_env": "py310" }, { "py_ver": "3.11", "py_env": "py311" }, diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 3ef9a5fe26b4..3000d1871be3 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -98,7 +98,6 @@ jobs: matrix: os: [macos-latest, windows-latest] params: [ - {"py_ver": "3.8", "tox_env": "py38"}, {"py_ver": "3.9", "tox_env": "py39"}, {"py_ver": "3.10", "tox_env": "py310" }, { "py_ver": "3.11", "tox_env": "py311" }, @@ -113,10 +112,14 @@ jobs: python-version: ${{ matrix.params.py_ver }} - name: Install tox run: pip install tox - - name: Run tests basic unix - if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'macos') + - name: Run tests basic linux + if: startsWith(matrix.os, 'ubuntu') working-directory: ./sdks/python run: tox -c tox.ini run -e ${{ 
matrix.params.tox_env }} + - name: Run tests basic macos + if: startsWith(matrix.os, 'macos') + working-directory: ./sdks/python + run: tox -c tox.ini run -e ${{ matrix.params.tox_env }}-macos - name: Run tests basic windows if: startsWith(matrix.os, 'windows') working-directory: ./sdks/python @@ -135,7 +138,7 @@ jobs: fail-fast: false matrix: os: [[self-hosted, ubuntu-20.04, main], macos-latest, windows-latest] - python: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Checkout code uses: actions/checkout@v4 @@ -162,7 +165,7 @@ jobs: fail-fast: false matrix: os: [[self-hosted, ubuntu-20.04, main], macos-latest, windows-latest] - python: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/run_perf_alert_tool.yml b/.github/workflows/run_perf_alert_tool.yml index 4bb5df41dcfb..a6aae616efec 100644 --- a/.github/workflows/run_perf_alert_tool.yml +++ b/.github/workflows/run_perf_alert_tool.yml @@ -39,7 +39,7 @@ jobs: - name: Install python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Install Apache Beam working-directory: ./sdks/python run: pip install -e .[gcp,test] diff --git a/.github/workflows/run_rc_validation.yml b/.github/workflows/run_rc_validation.yml index 15979a9e1acd..801a72d37130 100644 --- a/.github/workflows/run_rc_validation.yml +++ b/.github/workflows/run_rc_validation.yml @@ -106,7 +106,7 @@ jobs: if: ${{github.event.inputs.RUN_SQL_TAXI_WITH_DATAFLOW == 'true'}} strategy: matrix: - py_version: [3.8] + py_version: [3.9] steps: - name: Checkout code uses: actions/checkout@v4 @@ -171,7 +171,7 @@ jobs: if: ${{github.event.inputs.RUN_PYTHON_CROSS_VALIDATION == 'true'}} strategy: matrix: - py_version: [3.8] + py_version: [3.9] steps: - name: Checkout code uses: actions/checkout@v4 @@ -286,7 +286,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Setting python env uses: ./.github/actions/common-rc-validation @@ -351,7 +351,7 @@ jobs: if: ${{github.event.inputs.RUN_DIRECT_RUNNER_TESTS == 'true' }} strategy: matrix: - py_version: [3.8] + py_version: [3.9] needs: generate_shared_pubsub steps: - name: Checkout code @@ -399,7 +399,7 @@ jobs: if: ${{github.event.inputs.RUN_DATAFLOW_RUNNER_TESTS=='true'}} strategy: matrix: - py_version: [3.8] + py_version: [3.9] needs: [generate_shared_pubsub] steps: - name: Checkout code @@ -452,7 +452,7 @@ jobs: if: ${{github.event.inputs.RUN_DIRECT_RUNNER_TESTS == 'true' }} strategy: matrix: - py_version: [3.8] + py_version: [3.9] needs: [generate_shared_pubsub] steps: - name: Checkout code @@ -501,7 +501,7 @@ jobs: if: ${{github.event.inputs.RUN_DATAFLOW_RUNNER_TESTS=='true'}} strategy: matrix: - py_version: [3.8] + py_version: [3.9] needs: [generate_shared_pubsub] steps: - name: Checkout code diff --git a/.github/workflows/tour_of_beam_backend_integration.yml b/.github/workflows/tour_of_beam_backend_integration.yml index 0ba2711d0d8d..11cd2e2c878b 100644 --- a/.github/workflows/tour_of_beam_backend_integration.yml +++ b/.github/workflows/tour_of_beam_backend_integration.yml @@ -69,7 +69,7 @@ env: jobs: integration: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 defaults: run: working-directory: ./learning/tour-of-beam/backend @@ -88,7 +88,7 @@ jobs: # 1. Start emulators - name: Start emulators - run: docker-compose up -d + run: docker compose up -d # 2. 
start function-framework processes in BG - name: Compile CF @@ -118,7 +118,7 @@ jobs: - name: Stop emulators if: always() - run: docker-compose down + run: docker compose down # 5. Compare storage/datastore/index.yml VS generated - name: Check index.yaml diff --git a/.github/workflows/typescript_tests.yml b/.github/workflows/typescript_tests.yml index 1b45ea67b5c6..a4e4c2926f84 100644 --- a/.github/workflows/typescript_tests.yml +++ b/.github/workflows/typescript_tests.yml @@ -85,7 +85,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Setup Beam Python working-directory: ./sdks/python run: | @@ -140,7 +140,7 @@ jobs: - name: Install python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Setup Beam Python working-directory: ./sdks/python run: | diff --git a/.test-infra/jenkins/PythonTestProperties.groovy b/.test-infra/jenkins/PythonTestProperties.groovy index 98257a6e1c28..7e8e4ad3d8fd 100644 --- a/.test-infra/jenkins/PythonTestProperties.groovy +++ b/.test-infra/jenkins/PythonTestProperties.groovy @@ -20,10 +20,10 @@ class PythonTestProperties { // Indicates all supported Python versions. // This must be sorted in ascending order. final static List ALL_SUPPORTED_VERSIONS = [ - '3.8', '3.9', '3.10', - '3.11' + '3.11', + '3.12' ] final static List SUPPORTED_CONTAINER_TASKS = ALL_SUPPORTED_VERSIONS.collect { "py${it.replace('.', '')}" @@ -37,10 +37,10 @@ class PythonTestProperties { final static List CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIONS = ESSENTIAL_VERSIONS final static List CROSS_LANGUAGE_VALIDATES_RUNNER_DATAFLOW_USING_SQL_PYTHON_VERSIONS = [HIGHEST_SUPPORTED] final static List VALIDATES_CONTAINER_DATAFLOW_PYTHON_VERSIONS = ALL_SUPPORTED_VERSIONS - final static String LOAD_TEST_PYTHON_VERSION = '3.8' - final static String RUN_INFERENCE_TEST_PYTHON_VERSION = '3.8' - final static String CHICAGO_TAXI_EXAMPLE_FLINK_PYTHON_VERSION = '3.8' + final static String LOAD_TEST_PYTHON_VERSION = '3.9' + final static String RUN_INFERENCE_TEST_PYTHON_VERSION = '3.9' + final static String CHICAGO_TAXI_EXAMPLE_FLINK_PYTHON_VERSION = '3.9' // Use for various shell scripts triggered by Jenkins. // Gradle scripts should use project.ext.pythonVersion defined by PythonNature/BeamModulePlugin. - final static String DEFAULT_INTERPRETER = 'python3.8' + final static String DEFAULT_INTERPRETER = 'python3.9' } diff --git a/.test-infra/jenkins/build.gradle b/.test-infra/jenkins/build.gradle index 37c9c4d8d6ae..df43717e2fc3 100644 --- a/.test-infra/jenkins/build.gradle +++ b/.test-infra/jenkins/build.gradle @@ -22,9 +22,6 @@ plugins { applyGroovyNature() applyPythonNature() -// TODO(https://github.com/apache/beam/issues/20209): Don't hardcode this version, take the value provided by Python nature. -pythonVersion = '3.8' - task generateMetricsReport { dependsOn setupVirtualenv def metricsReportFilename = "beam-metrics_report.html" diff --git a/.test-infra/metrics/influxdb/Dockerfile b/.test-infra/metrics/influxdb/Dockerfile index 57a541fb9955..0ec7bd6f2677 100644 --- a/.test-infra/metrics/influxdb/Dockerfile +++ b/.test-infra/metrics/influxdb/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. 
################################################################################ -FROM python:3.8-slim +FROM python:3.9-slim RUN pip install --no-cache-dir gsutil diff --git a/.test-infra/metrics/influxdb/gsutil/Dockerfile b/.test-infra/metrics/influxdb/gsutil/Dockerfile index 09ccf150b938..ea6621e2cf9d 100644 --- a/.test-infra/metrics/influxdb/gsutil/Dockerfile +++ b/.test-infra/metrics/influxdb/gsutil/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ################################################################################ -FROM python:3.8-slim +FROM python:3.9-slim # google-compute-engine package allows to obtain credentials for service # account specified in .boto file. diff --git a/.test-infra/metrics/sync/github/Dockerfile b/.test-infra/metrics/sync/github/Dockerfile index e686a9b2f682..3116d0f211fa 100644 --- a/.test-infra/metrics/sync/github/Dockerfile +++ b/.test-infra/metrics/sync/github/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ################################################################################ -FROM python:3.8-slim +FROM python:3.9-slim WORKDIR /usr/src/app diff --git a/.test-infra/metrics/sync/jenkins/Dockerfile b/.test-infra/metrics/sync/jenkins/Dockerfile index 32247b438b59..62829ada38ee 100644 --- a/.test-infra/metrics/sync/jenkins/Dockerfile +++ b/.test-infra/metrics/sync/jenkins/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ################################################################################ -FROM python:3.8-slim +FROM python:3.9-slim WORKDIR /usr/src/app diff --git a/.test-infra/mock-apis/poetry.lock b/.test-infra/mock-apis/poetry.lock index 98985df7ea4a..322980cc2e75 100644 --- a/.test-infra/mock-apis/poetry.lock +++ b/.test-infra/mock-apis/poetry.lock @@ -171,7 +171,7 @@ name = "protobuf" version = "4.25.0" description = "" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ {file = "protobuf-4.25.0-cp310-abi3-win32.whl", hash = "sha256:5c1203ac9f50e4853b0a0bfffd32c67118ef552a33942982eeab543f5c634395"}, {file = "protobuf-4.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:c40ff8f00aa737938c5378d461637d15c442a12275a81019cc2fef06d81c9419"}, @@ -191,7 +191,7 @@ name = "setuptools" version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, @@ -206,7 +206,7 @@ name = "soupsieve" version = "2.5" description = "A modern CSS selector implementation for Beautiful Soup." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, @@ -214,5 +214,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.8" +python-versions = "^3.9" content-hash = "35ed5a98dd3f951bbfc44b949ad9148634159976cb54ac6f257d119c12d9d924" diff --git a/CHANGES.md b/CHANGES.md index 83a8299f9b0e..aeecddbf047f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -57,19 +57,15 @@ ## Highlights -* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)). -* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)). - -## I/Os - -* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Added support for using vLLM in the RunInference transform (Python) ([#32528](https://github.com/apache/beam/issues/32528)) ## New Features / Improvements * Dataflow worker can install packages from Google Artifact Registry Python repositories (Python) ([#32123](https://github.com/apache/beam/issues/32123)). * Added support for Zstd codec in SerializableAvroCodecFactory (Java) ([#32349](https://github.com/apache/beam/issues/32349)) +* Added support for using vLLM in the RunInference transform (Python) ([#32528](https://github.com/apache/beam/issues/32528)) * Added support for writing to Pubsub with ordering keys (Java) ([#21162](https://github.com/apache/beam/issues/21162)) -* X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Significantly improved performance of Kafka IO reads that enable [commitOffsetsInFinalize](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/kafka/KafkaIO.Read.html#commitOffsetsInFinalize--) by removing the data reshuffle from the SDF implementation. ([#31682](https://github.com/apache/beam/pull/31682)). ## Breaking Changes @@ -78,28 +74,32 @@ as strings rather than silently coerced (and possibly truncated) to numeric values. To retain the old behavior, pass `dtype=True` (or any other value accepted by `pandas.read_json`). -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). +* Users of the KafkaIO Read transform that enable [commitOffsetsInFinalize](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/kafka/KafkaIO.Read.html#commitOffsetsInFinalize--) might encounter pipeline graph compatibility issues when updating the pipeline. To mitigate, set the `updateCompatibilityVersion` option to the SDK version used for the original pipeline, for example `--updateCompatibilityVersion=2.58.1`. ## Deprecations -* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)). +* Python 3.8 is reaching EOL and support is being removed in Beam 2.61.0. The 2.60.0 release will warn users +when running on 3.8. ([#31192](https://github.com/apache/beam/issues/31192)) ## Bugfixes * (Java) Fixed custom delimiter issues in TextIO ([#32249](https://github.com/apache/beam/issues/32249), [#32251](https://github.com/apache/beam/issues/32251)). +* (Java, Python, Go) Fixed PeriodicSequence backlog bytes reporting, which was preventing Dataflow Runner autoscaling from functioning properly ([#32506](https://github.com/apache/beam/issues/32506)).
+* (Java) Fix improper decoding of rows with schemas containing nullable fields when encoded with a schema with equal encoding positions but modified field order. ([#32388](https://github.com/apache/beam/issues/32388)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). -## Known Issues - -* ([#X](https://github.com/apache/beam/issues/X)). - -# [2.59.0] - Cut, 2024-08-22 +# [2.59.0] - 2024-09-11 ## Highlights * Added support for setting a configureable timeout when loading a model and performing inference in the [RunInference](https://beam.apache.org/documentation/ml/inference-overview/) transform using [with_exception_handling](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference.with_exception_handling) ([#32137](https://github.com/apache/beam/issues/32137)) +* Initial experimental support for using Prism with the Java and Python SDKs + * Prism is presently targeting local testing usage or other small-scale execution. + * For Java, use 'PrismRunner' or 'TestPrismRunner' as an argument to the `--runner` flag. + * For Python, use 'PrismRunner' as an argument to the `--runner` flag. + * Go already uses Prism as the default local runner. ## I/Os @@ -116,7 +116,7 @@ * Updated Go protobuf package to new version (Go) ([#21515](https://github.com/apache/beam/issues/21515)). * Added support for setting a configureable timeout when loading a model and performing inference in the [RunInference](https://beam.apache.org/documentation/ml/inference-overview/) transform using [with_exception_handling](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference.with_exception_handling) ([#32137](https://github.com/apache/beam/issues/32137)) * Adds OrderedListState support for Java SDK via FnApi. -* Initial support for using Prism from the Python SDK. +* Initial support for using Prism from the Python and Java SDKs. ## Bugfixes @@ -125,6 +125,13 @@ * (Python) Upgraded google-cloud-storage to version 2.18.2 to fix a data corruption issue ([#32135](https://github.com/apache/beam/pull/32135)). * (Go) Fix corruption on State API writes. ([#32245](https://github.com/apache/beam/issues/32245)). +## Known Issues + +* Prism is under active development and does not yet support all pipelines. See [#29650](https://github.com/apache/beam/issues/29650) for progress. + * In the 2.59.0 release, Prism passes most runner validation tests, with the exception of pipelines using the following features: + OrderedListState, OnWindowExpiry (e.g. GroupIntoBatches), CustomWindows, MergingWindowFns, Trigger and WindowingStrategy associated features, Bundle Finalization, Looping Timers, and some Coder-related issues such as Python combiner packing, Java Schema transforms, and heterogeneous flatten coders. Processing Time timers do not yet have real-time support. + * If your pipeline is having difficulty with the Python or Java direct runners, but runs well on Prism, please let us know.
+ # [2.58.1] - 2024-08-15 ## New Features / Improvements diff --git a/build.gradle.kts b/build.gradle.kts index e6295384b753..38b58b6979ee 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -268,6 +268,7 @@ tasks.register("javaPreCommit") { dependsOn(":runners:jet:build") dependsOn(":runners:local-java:build") dependsOn(":runners:portability:java:build") + dependsOn(":runners:prism:java:build") dependsOn(":runners:samza:build") dependsOn(":runners:samza:job-server:build") dependsOn(":runners:spark:3:build") @@ -470,7 +471,6 @@ tasks.register("playgroundPreCommit") { tasks.register("pythonPreCommit") { dependsOn(":sdks:python:test-suites:tox:pycommon:preCommitPyCommon") - dependsOn(":sdks:python:test-suites:tox:py38:preCommitPy38") dependsOn(":sdks:python:test-suites:tox:py39:preCommitPy39") dependsOn(":sdks:python:test-suites:tox:py310:preCommitPy310") dependsOn(":sdks:python:test-suites:tox:py311:preCommitPy311") @@ -487,7 +487,6 @@ tasks.register("pythonDocsPreCommit") { } tasks.register("pythonDockerBuildPreCommit") { - dependsOn(":sdks:python:container:py38:docker") dependsOn(":sdks:python:container:py39:docker") dependsOn(":sdks:python:container:py310:docker") dependsOn(":sdks:python:container:py311:docker") @@ -543,15 +542,15 @@ tasks.register("python312PostCommit") { dependsOn(":sdks:python:test-suites:direct:py312:postCommitIT") dependsOn(":sdks:python:test-suites:direct:py312:hdfsIntegrationTest") dependsOn(":sdks:python:test-suites:portable:py312:postCommitPy312") + dependsOn(":sdks:python:test-suites:dataflow:py312:inferencePostCommitITPy312") } tasks.register("portablePythonPreCommit") { - dependsOn(":sdks:python:test-suites:portable:py38:preCommitPy38") + dependsOn(":sdks:python:test-suites:portable:py39:preCommitPy39") dependsOn(":sdks:python:test-suites:portable:py312:preCommitPy312") } tasks.register("pythonSparkPostCommit") { - dependsOn(":sdks:python:test-suites:portable:py38:sparkValidatesRunner") dependsOn(":sdks:python:test-suites:portable:py39:sparkValidatesRunner") dependsOn(":sdks:python:test-suites:portable:py312:sparkValidatesRunner") } @@ -576,15 +575,15 @@ tasks.register("javaExamplesDataflowPrecommit") { tasks.register("whitespacePreCommit") { // TODO(https://github.com/apache/beam/issues/20209): Find a better way to specify the tasks without hardcoding py version. - dependsOn(":sdks:python:test-suites:tox:py38:archiveFilesToLint") - dependsOn(":sdks:python:test-suites:tox:py38:unpackFilesToLint") - dependsOn(":sdks:python:test-suites:tox:py38:whitespacelint") + dependsOn(":sdks:python:test-suites:tox:py39:archiveFilesToLint") + dependsOn(":sdks:python:test-suites:tox:py39:unpackFilesToLint") + dependsOn(":sdks:python:test-suites:tox:py39:whitespacelint") } tasks.register("typescriptPreCommit") { // TODO(https://github.com/apache/beam/issues/20209): Find a better way to specify the tasks without hardcoding py version. 
- dependsOn(":sdks:python:test-suites:tox:py38:eslint") - dependsOn(":sdks:python:test-suites:tox:py38:jest") + dependsOn(":sdks:python:test-suites:tox:py39:eslint") + dependsOn(":sdks:python:test-suites:tox:py39:jest") } tasks.register("pushAllRunnersDockerImages") { diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 50f94da25905..a334cf3191f2 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -738,7 +738,7 @@ class BeamModulePlugin implements Plugin { google_api_common : "com.google.api:api-common", // google_cloud_platform_libraries_bom sets version google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20240815-2.0.0", // [bomupgrader] sets version google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20240310-2.0.0", // [bomupgrader] sets version - google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20240624-$google_clients_version", + google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20240817-$google_clients_version", google_api_services_healthcare : "com.google.apis:google-api-services-healthcare:v1-rev20240130-$google_clients_version", google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20220904-$google_clients_version", google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20240706-2.0.0", // [bomupgrader] sets version @@ -2824,7 +2824,8 @@ class BeamModulePlugin implements Plugin { // CrossLanguageValidatesRunnerTask is setup under python sdk but also runs tasks not involving // python versions. set 'skipNonPythonTask' property to avoid duplicated run of these tasks. if (!(project.hasProperty('skipNonPythonTask') && project.skipNonPythonTask == 'true')) { - mainTask.configure { dependsOn goTask } + System.err.println 'GoUsingJava tests have been disabled: https://github.com/apache/beam/issues/30517#issuecomment-2341881604.' + // mainTask.configure { dependsOn goTask } } cleanupTask.configure { mustRunAfter goTask } config.cleanupJobServer.configure { mustRunAfter goTask } @@ -2866,7 +2867,7 @@ class BeamModulePlugin implements Plugin { // Transform service delivers transforms that refer to SDK harness containers with following sufixes. def transformServiceJavaContainerSuffix = 'java11' - def transformServicePythonContainerSuffix = '38' + def transformServicePythonContainerSuffix = '39' def setupTask = project.tasks.register(config.name+"Setup", Exec) { // Containers for main SDKs when running tests. @@ -2957,7 +2958,7 @@ class BeamModulePlugin implements Plugin { // If none of them applied, version set here will be used as default value. // TODO(BEAM-12000): Move default value to Py3.9. project.ext.pythonVersion = project.hasProperty('pythonVersion') ? - project.pythonVersion : '3.8' + project.pythonVersion : '3.9' def setupVirtualenv = project.tasks.register('setupVirtualenv') { doLast { @@ -3148,6 +3149,7 @@ class BeamModulePlugin implements Plugin { ':sdks:python:container:py39:docker', ':sdks:python:container:py310:docker', ':sdks:python:container:py311:docker', + ':sdks:python:container:py312:docker', ] doLast { // TODO: Figure out GCS credentials and use real GCS input and output. 
diff --git a/contributor-docs/discussion-docs/2016.md b/contributor-docs/discussion-docs/2016.md new file mode 100644 index 000000000000..2421a86e2ee8 --- /dev/null +++ b/contributor-docs/discussion-docs/2016.md @@ -0,0 +1,50 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2016 +| No. | Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Frances Perry | [Apache Beam: Technical Vision](https://docs.google.com/document/d/1UyAeugHxZmVlQ5cEWo_eOPgXNQA1oD-rGooWOSwAqh8) | 2016-02-09 11:01:41 | +| 2 | Amit Sela | [Apache Beam (incubating) - Spark Runner Technical Vision](https://docs.google.com/document/d/1y4qlQinjjrusGWlgq-mYmbxRW2z7-_X5Xax-GG0YsC0) | 2016-02-10 13:46:37 | +| 3 | Seetharam Venkatesh | [Issue Navigator - ASF JIRA](http://s.apache.org/gsoc2016ideas) | 2016-02-15 13:38:18 | +| 4 | Kenneth Knowles | [Lateness (and Panes) in Apache Beam (incubating)](https://docs.google.com/document/d/12r7frmxNickxB5tbpuEh_n35_IJeVZn1peOrBrhhP6Y) | 2016-03-01 11:50:30 | +| 5 | Tyler Akidau | [Beam Model & Runner Compatibility Matrix - Google Sheets](https://docs.google.com/spreadsheets/d/1OM077lZBARrtUi6g0X0O0PHaIbFKCD6v0djRefQRE1I) | 2016-03-09 17:14:29 | +| 6 | Ben Chambers | [Static Display Data](https://docs.google.com/document/d/11enEB9JwVp6vO0uOYYTMYTGkr3TdNfELwWqoiUg5ZxM) | 2016-03-14 20:25:44 | +| 7 | Kenneth Knowles | [Apache Beam (incubating) Pipeline Runner API 1-Pager](https://docs.google.com/document/d/1zEJN06YaKEIMhSckVjVgLQIh-jxqkgDRSnos1AOVSQ4) | 2016-03-15 11:28:45 | +| 8 | Frances Perry | [[Draft] Apache Beam Contribution Guide](https://docs.google.com/document/d/1syFyfqIsGOYDE_Hn3ZkRd8a6ylcc64Kud9YtrGHgU0E) | 2016-03-17 17:19:08 | +| 9 | Thomas Groh | [Writing More Expressive Beam Tests](https://docs.google.com/document/d/1fZUUbG2LxBtqCVabQshldXIhkMcXepsbv2vuuny8Ix4) | 2016-03-21 20:38:53 | +| 10 | Kenneth Knowles | [Apache Beam (Incubating) Pipeline Runner API Proposal](https://docs.google.com/document/d/1bao-5B6uBuf-kwH1meenAuXXS0c9cBQ1B2J59I3FiyI) | 2016-03-23 17:17:08 | +| 11 | Kam Kasravi | [ML models and pipelines](https://docs.google.com/document/d/17cRZk_yqHm3C0fljivjN66MbLkeKS1yjo4PBECHb-xA) | 2016-05-13 12:54:13 | +| 12 | Kenneth Knowles | [Side Input Architecture for Apache Beam (incubating) 1-Pager](https://s.apache.org/beam-side-inputs-1-pager) | 2016-05-13 13:50:35 | +| 13 | Kenneth Knowles | [Triggers in Apache Beam (incubating)](https://s.apache.org/beam-triggers) | 2016-05-30 15:18:34 | +| 14 | Thomas Groh | [Capability Matrix Testing](https://docs.google.com/document/d/1fICxq32t9yWn9qXhmT07xpclHeHX2VlUyVtpi2WzzGM) | 2016-06-10 19:34:34 | +| 15 | N/A| [Website Layout](https://docs.google.com/document/d/1-0jMv7NnYp0Ttt4voulUMwVe_qjBYeNMLm2LusYF3gQ) | 2016-06-17 00:53:45 | +| 16 | Thomas Groh | [DoFn Instance Setup and Teardown](https://docs.google.com/document/d/1LLQqggSePURt3XavKBGV7SZJYQ4NW8yCu63lBchzMRk) | 2016-06-28 12:06:48 | +| 17 | Jesse Anderson | [Apache Beam Interviews](https://docs.google.com/document/d/1IQt6FfQI7W4d2QxZm6WwGnZFdA8JmaseKZrMGPu8zgY) | 2016-07-07 20:18:35 | +| 18 | Kenneth Knowles | [State and Timers for DoFn in Apache Beam (incubating)](https://s.apache.org/beam-state) | 2016-07-26 23:20:33 | +| 19 | Kenneth Knowles | [Presenting: A New DoFn - Google Slides](https://s.apache.org/presenting-a-new-dofn) | 2016-07-26 23:29:59 | +| 20 | Sam McVeety | [Beam Proposal for Dynamic PipelineOptions](https://docs.google.com/document/d/1I-iIgWDYasb7ZmXbGBHdok_IK1r1YAJ90JG5Fz0_28o) | 2016-07-29 15:14:32 | +| 21 | Eugene Kirpichov 
| [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn) | 2016-08-04 13:45:49 | +| 22 | Eugene Kirpichov | [Apache URL Shortener](https://s.apache.org/) | 2016-08-18 20:29:32 | +| 23 | Ben Chambers and Kenneth Knowles | [A New DoFn](https://s.apache.org/a-new-dofn) | 2016-08-19 14:52:34 | +| 24 | Eugene Kirpichov, Daniel Mills, Robert Bradshaw | [Splittable DoFn proposal](https://docs.google.com/document/d/1AQmx-T9XjSi1PNoEp5_L-lT0j7BkgTbmQnc6uFEMI4c) | 2016-08-29 11:40:41 | +| 25 | Kenneth Knowles | [WindowMappingFn / Side Input GC 1-Pager](https://s.apache.org/beam-windowmappingfn-1-pager) | 2016-09-15 14:51:34 | +| 26 | Ben Chambers | [User Defined Metrics API](http://s.apache.org/beam-metrics-api) | 2016-10-05 14:51:18 | +| 27 | Amit Sela | [Reading from an unbounded source with Spark](https://docs.google.com/document/d/12BzHbETDt7ICIF7vc8zzCeLllmIpvvaVDIdBlcIwE1M) | 2016-10-09 03:10:29 | +| 28 | Robert Bradshaw | [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn?) | 2016-10-10 13:22:56 | +| 29 | Thomas Groh | [Availability of PipelineOptions](https://docs.google.com/document/d/1Wr05cYdqnCfrLLqSk--XmGMGgDwwNwWZaFbxLKvPqEQ) | 2016-10-24 19:36:10 | +| 30 | Kenneth Knowles | [Apache Beam (Incubating) Pipeline Runner API Proposal](https://s.apache.org/beam-runner-api) | 2016-11-07 23:11:15 | +| 31 | Pei He | [Apache Beam (incubating) Proposal: Part 1: IOChannelFactory Redesign](https://docs.google.com/document/d/11TdPyZ9_zmjokhNWM3Id-XJsVG3qel2lhdKTknmZ_7M) | 2016-11-16 19:09:51 | +| 32 | Etienne Chauchot | [Testing IO Transforms in Apache Beam](https://docs.google.com/document/d/153J9jPQhMCNi_eBzJfhAg-NprQ7vbf1jNVRgdqeEE8I) | 2016-12-01 11:19:47 | +| 33 | Davor Bonaci | [Apache Beam (incubating) Proposal: Part 2: Configurable BeamFileSystem](https://docs.google.com/document/d/1-7vo9nLRsEEzDGnb562PuL4q9mUiq_ZVpCAiyyJw8p8) | 2016-12-30 23:53:13 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2017.md b/contributor-docs/discussion-docs/2017.md new file mode 100644 index 000000000000..6fce298391ca --- /dev/null +++ b/contributor-docs/discussion-docs/2017.md @@ -0,0 +1,90 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2017 +| No. 
| Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Wesley Tanaka | [A New DoFn](https://s.apache.org/a-new-dofn) | Unknown | +| 2 | Stephen Sisk | [Apache Beam (incubating) Proposal: Part 2: Configurable BeamFileSystem](https://docs.google.com/document/d/1-7vo9nLRsEEzDGnb562PuL4q9mUiq_ZVpCAiyyJw8p8) | 2017-01-05 16:31:24 | +| 3 | Vladisav Jelisavcic | [ML models and pipelines](https://docs.google.com/document/d/17cRZk_yqHm3C0fljivjN66MbLkeKS1yjo4PBECHb-xA) | 2017-01-06 10:07:12 | +| 4 | Eugene Kirpichov | [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn) | 2017-01-10 19:58:25 | +| 5 | Kenneth Knowles | [WindowMappingFn / Side Input GC 1-Pager](https://s.apache.org/beam-windowmappingfn-1-pager) | 2017-01-11 16:21:55 | +| 6 | Thomas Groh | [Composite PInputs, POutputs, and the Runner API](https://docs.google.com/document/d/1_CHLnj1RFAGKy_MfR54XmixakYNmCnhGZLWmuDSMJ10) | 2017-01-17 13:46:34 | +| 7 | Kenneth Knowles | [Lateness (and Panes) in Apache Beam (incubating)](https://s.apache.org/beam-lateness) | 2017-01-17 14:04:28 | +| 8 | Kenneth Knowles | [Composite PInputs, POutputs, and the Runner API](https://s.apache.org/beam-runner-composites) | 2017-01-17 14:13:07 | +| 9 | Lukasz Cwik | [Apache Beam Fn Api Overview](https://s.apache.org/beam-fn-api) | 2017-01-19 18:56:19 | +| 10 | Etienne Chauchot | [Testing IO Transforms in Apache Beam](https://docs.google.com/document/d/153J9jPQhMCNi_eBzJfhAg-NprQ7vbf1jNVRgdqeEE8I) | 2017-01-25 03:54:17 | +| 11 | Matthew Jadczak | [State and Timers for DoFn in Apache Beam (incubating)](https://s.apache.org/beam-state) | 2017-01-25 16:09:28 | +| 12 | Lukasz Cwik | [Apache Beam (Incubating) Pipeline Runner API Proposal](http://s.apache.org/beam-runner-api) | 2017-01-25 17:06:07 | +| 13 | Stephen Sisk | [Authoring Apache Beam IO Transforms](https://docs.google.com/document/d/1nGGP2sLb5fLamB_dnkHVHC8BVjDD_SE46mQPIPkK5cQ) | 2017-01-27 20:29:58 | +| 14 | Kenneth Knowles | [Issue Navigator - ASF JIRA](http://s.apache.org/gsoc2017ideas) | 2017-02-03 16:47:45 | +| 15 | Eugene Kirpichov | [Splittable DoFn proposal](http://s.apache.org/splittable-do-fn) | 2017-02-07 18:12:21 | +| 16 | Jason Kuster | [Enabling PerfKit Benchmarker for Apache Beam](https://docs.google.com/document/d/1PsjGPSN6FuorEEPrKEP3u3m16tyOzph5FnL2DhaRDz0) | 2017-02-17 19:03:51 | +| 17 | Mingmin Xu | [Apache Beam: Add a Beam SQL DSL](https://docs.google.com/document/d/1Uc5xYTpO9qsLXtT38OfuoqSLimH_0a1Bz5BsCROMzCU) | 2017-02-27 16:33:32 | +| 18 | Sourabh Bajaj | [Apache Beam (incubating) Proposal: Part 1: IOChannelFactory Redesign](https://docs.google.com/document/d/11TdPyZ9_zmjokhNWM3Id-XJsVG3qel2lhdKTknmZ_7M) | 2017-03-01 18:08:48 | +| 19 | Chamikara Jayalath | [Splittable DoFn for Python SDK](https://docs.google.com/document/d/1h_zprJrOilivK2xfvl4L42vaX4DMYGfH1YDmi-s_ozM) | 2017-03-03 20:16:34 | +| 20 | Davor Bonaci | [Beam report to ASF Board, 2017-03](https://docs.google.com/document/d/1eYBBIafwnbNUZj6Iqk0_kDhnqJ8PYkzVjipNm1v1RJs) | 2017-03-07 21:14:19 | +| 21 | Sourabh Bajaj | [Beam File System Python SDK](https://docs.google.com/document/d/10qD0RXmdI0240wPShaGDRm9Zt9a_ess-ABlvYx2LZFA) | 2017-03-17 17:14:43 | +| 22 | Thomas Groh | [Side-Channel Inputs in the Java SDK](https://docs.google.com/document/d/1e_-MenoW2cQ-6-EGVVqfOR-B9FovVXqXyUm4-ZwlgKA) | 2017-03-24 14:48:54 | +| 23 | Thomas Groh | [finalizeCheckpoint spec cleanup](https://s.apache.org/FIWQ) | 2017-03-29 16:28:45 | +| 24 | Eugene Kirpichov | [Proposed Splittable DoFn API 
changes](https://docs.google.com/document/d/1BGc8pM1GOvZhwR9SARSVte-20XEoBUxrGJ5gTWXdv3c) | 2017-04-05 19:55:39 | +| 25 | Tyler Akidau | [The Beam Model : Streams & Tables](http://s.apache.org/beam-streams-tables) | 2017-04-20 20:57:42 | +| 26 | Madhusudan Borkar | [Beam Hive Connector Proposal](https://docs.google.com/document/d/1JOzihFiXkQjtv6rur8-vCixSK-nHhIoIij9MwJZ_Dp0) | 2017-04-24 20:23:32 | +| 27 | Eugene Kirpichov | [Running Splittable DoFn via Source API](https://s.apache.org/sdf-via-source) | 2017-05-01 01:22:49 | +| 28 | Davor Bonaci | [Apache Beam Community Hackathon](https://docs.google.com/document/d/1UKC2R_9FkSdMVTz2nt2sIW18KoLbIu6w0aj9bwSSPiw) | 2017-05-04 15:07:59 | +| 29 | Madhusudan Borkar | [Apache Beam Hive Connector Proposal](https://docs.google.com/document/d/1aeQRLXjVr38Z03_zWkHO9YQhtnj0jHoCfhsSNm-wxtA) | 2017-05-10 18:05:59 | +| 30 | Mingmin Xu | [Apache Beam: design of DSL SQL interface](https://docs.google.com/document/d/1uWXL_yF3UUO5GfCxbL6kWsmC8xCWfICU3RwiQKsk7Mk) | 2017-05-13 01:49:44 | +| 31 | Jesse Anderson | [Beam 2.0 Q and A](https://docs.google.com/document/d/1vyel3XRfdeGyqLvXiy1C3mrw9QBbveoKjjVLuRxMw4k) | 2017-05-17 14:26:01 | +| 32 | Reuven Lax | [Dynamic (data-dependent) FileBasedSink](https://docs.google.com/document/d/1Bd9mJO1YC8vOoFObJFupVURBMCl7jWt6hOgw6ClwxE4) | 2017-05-19 01:31:28 | +| 33 | Manu Zhang | [Apache Beam Fn API: Processing a Bundle](https://s.apache.org/beam-fn-api-processing-a-bundle) | 2017-05-21 20:54:21 | +| 34 | Kenneth Knowles | [Runner Authoring Guide](https://s.apache.org/beam-runner-guide) | 2017-05-22 01:12:09 | +| 35 | Lukasz Cwik | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing) | 2017-05-26 16:49:01 | +| 36 | Reuven Lax | [Beam Proposal: Pipeline Drain](https://docs.google.com/document/d/1NExwHlj-2q2WUGhSO4jTu8XGhDPmm3cllSN8IMmWci8) | 2017-06-06 13:43:03 | +| 37 | Frances Perry | [Apache Beam: Technical Vision](https://docs.google.com/document/d/1UyAeugHxZmVlQ5cEWo_eOPgXNQA1oD-rGooWOSwAqh8) | 2017-06-06 15:06:16 | +| 38 | James | [Apache Beam SQL: DDL](https://docs.google.com/document/d/162_cuYlZ5pC_8PzGWX844tlLOsSvQmoSGjJAgol4ipE) | 2017-06-08 06:11:40 | +| 39 | Davor Bonaci | [Beam report to ASF Board, 2017-06](https://docs.google.com/document/d/1tgJ_2WEInGa7Wg2RXSWZI7ot3bZ37ZuApalVAJOQscI) | 2017-06-13 10:47:13 | +| 40 | Tyler Akidau | [Beam DSL_SQL merge to master burndown](https://s.apache.org/beam-dsl-sql-burndown) | 2017-06-13 11:30:41 | +| 41 | Ben Chambers | [User Defined Metrics API](https://s.apache.org/beam-metrics-api) | 2017-06-23 04:30:17 | +| 42 | Kenneth Knowles | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-2.1.0-burndown) | 2017-06-23 10:15:07 | +| 43 | Eugene Kirpichov | [Proposal and plan: new TextIO features based on SDF](https://s.apache.org/textio-sdf) | 2017-06-23 19:32:17 | +| 44 | Pei HE | [Fine-grained Resource Configuration in Beam](https://docs.google.com/document/d/1N0y64dbzmukLLEy6M9CygdI_H88pIS3NtcOAkL5-oVw) | 2017-06-28 23:24:08 | +| 45 | Eugene Kirpichov | [Proposal and plan: new TextIO features based on SDF](http://s.apache.org/textio-sdf) | 2017-06-29 19:43:09 | +| 46 | Pei HE | [Beam MapReduce Runner One-Pager](https://docs.google.com/document/d/10jJ8pBTZ10rNr_IO5YnggmZZG1MU-F47sWg8N6xkBM0) | 2017-07-07 09:11:14 | +| 47 | Thomas Groh | [Cross-language Beam Pipelines](https://s.apache.org/beam-mixed-language-pipelines) | 2017-07-10 12:58:37 | +| 48 | Kenneth Knowles | [Issue Navigator - ASF 
JIRA](https://s.apache.org/beam-2.1.0-burndown?) | 2017-07-10 16:33:23 | +| 49 | Eugene Kirpichov | [Proposal: Watch: a transform for watching growth of sets](http://s.apache.org/beam-watch-transform) | 2017-07-12 00:08:43 | +| 50 | Vikas RK | [Apache Beam Fn API: Progress Reporting](https://s.apache.org/beam-fn-api-progress-reporting) | 2017-07-13 14:34:51 | +| 51 | Kenneth Knowles | [Pull requests · apache/beam · GitHub](https://s.apache.org/beam-2.1.0-cherry-picks) | 2017-07-14 16:01:13 | +| 52 | Kenneth Knowles | [Apache Beam Release Acceptance Criteria - Google Sheets](https://s.apache.org/beam-release-validation) | 2017-07-24 01:57:20 | +| 53 | Tyler Akidau | [Robust Streaming SQL in Apache Apex, Beam, Calcite, & Flink](http://s.apache.org/streaming-sql-spec) | 2017-07-24 19:34:49 | +| 54 | Arnaud Fournier | [Sketches_Extension](https://docs.google.com/document/d/1Xy6g5RPBYX_HadpIr_2WrUeusiwL0Jo2ACI5PEOP1kc) | 2017-08-03 08:35:32 | +| 55 | Tyler Akidau | [Beam DSL_SQL branch API review](https://s.apache.org/beam-sql-dsl-api-review) | 2017-08-03 19:21:02 | +| 56 | Henning Rohde | [Apache Beam Fn API: SDK Harness container contract](https://s.apache.org/beam-fn-api-container-contract) | 2017-08-11 14:45:42 | +| 57 | Asha Rostamianfar | [Beam I/O VCF source: Design Doc](https://docs.google.com/document/d/1jsdxOPALYYlhnww2NLURS8NKXaFyRSJrcGbEDpY9Lkw) | 2017-08-16 13:18:54 | +| 58 | Robert Bradshaw | [Graphical View of Progress](https://docs.google.com/document/d/1Dx18qBTvFWNqwLeecemOpKfleKzFyeV3Qwh71SHATvY) | 2017-08-22 14:49:37 | +| 59 | Kenneth Knowles | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-2.2.0-burndown) | 2017-08-30 23:57:12 | +| 60 | Kenneth Knowles | [A New DoFn](https://docs.google.com/document/d/1ClmQ6LqdnfseRzeSw3SL68DAO1f8jsWBL2FfzWErlbw) | 2017-09-06 03:22:01 | +| 61 | Eugene Kirpichov | [FileIO.write](http://s.apache.org/fileio-write) | 2017-09-06 21:44:02 | +| 62 | Davor Bonaci | [Beam report to ASF Board, 2017-09](https://docs.google.com/document/d/1uX8k99k2OXD6tizsJQ9KZtfxbO_I-8rC_oOJgkcJ_O8) | 2017-09-13 13:45:24 | +| 63 | Robert Bradshaw | [Simplifying Beam Pipelines](https://s.apache.org/no-beam-pipeline) | 2017-09-18 20:49:16 | +| 64 | Griselda Cuevas | [[Public] Monthly Beam Newsletter](https://docs.google.com/document/d/1C4L8b1It9Ju1JgJaSvSPlAYMlG0Q4v4hbjebkmNsnQ8) | 2017-09-21 21:31:55 | +| 65 | Eugene Kirpichov | [Context access from user code closures](http://s.apache.org/context-fn) | 2017-09-29 16:53:24 | +| 66 | pfgerver@gmail.com | [User Defined Metrics API](https://docs.google.com/document/d/1voyUIQ2DrWkoY-BsJwM8YvF4gGKB76CDG8BYL8XBc7A) | 2017-11-09 18:42:27 | +| 67 | Kenneth Knowles | [Apache Beam Release 2.2.0 Acceptance Criteria - Google Sheets](https://s.apache.org/beam-2.2.0-release-validation) | 2017-11-17 14:30:05 | +| 68 | Holden Karau | [Apache Beam Fn API: SDK Harness container contract](https://docs.google.com/document/d/1n6s3BOxOPct3uF4UgbbI9O9rpdiKWFH9R6mtVmR7xp0) | 2017-11-18 09:33:27 | +| 69 | Udi Meiri | [HDFS Support for Python SDK](https://docs.google.com/document/d/1-uzKf4VPlGrkBMXM00sxxf3K01Ss3ZzXeju0w5L0LY0) | 2017-11-20 20:26:09 | +| 70 | Reuven Lax | [Schema-Aware PCollections](https://docs.google.com/document/d/1tnG2DPHZYbsomvihIpXruUmQ12pHGK0QIvXS1FOTgRc) | 2017-11-29 21:38:45 | +| 71 | Kenneth Knowles | [Triggering is for Sinks](https://s.apache.org/beam-sink-triggers) | 2017-11-30 15:06:41 | +| 72 | Henning Rohde | [RFC: Apache Beam Go SDK design](https://s.apache.org/beam-go-sdk-design-rfc) | 2017-11-30 18:40:01 
| +| 73 | Etienne Chauchot | [Metrics extraction independant from runners / execution engines](https://s.apache.org/runner_independent_metrics_extraction) | 2017-12-11 11:33:31 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2018.md b/contributor-docs/discussion-docs/2018.md new file mode 100644 index 000000000000..2c126310aca3 --- /dev/null +++ b/contributor-docs/discussion-docs/2018.md @@ -0,0 +1,116 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2018 +| No. | Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Davor Bonaci | [Issue Navigator - ASF JIRA](http://s.apache.org/gsoc2018ideas) | 2018-01-22 02:37:24 | +| 2 | Lukasz Cwik | [Apache Beam (Incubating) Pipeline Runner API Proposal](https://s.apache.org/beam-runner-api) | 2018-01-26 13:09:11 | +| 3 | Reuven Lax | [Schema-Aware PCollections](https://docs.google.com/document/d/1tnG2DPHZYbsomvihIpXruUmQ12pHGK0QIvXS1FOTgRc) | 2018-01-28 20:08:35 | +| 4 | Etienne Chauchot | [Metrics extraction independant from runners / execution engines](https://s.apache.org/runner_independent_metrics_extraction) | 2018-01-31 08:01:01 | +| 5 | Kenneth Knowles | [Apache Beam Release 2.3.0 Acceptance Criteria - Google Sheets](https://s.apache.org/beam-2.3.0-release-validation) | 2018-01-31 09:07:13 | +| 6 | Raghu Angadi | [Event Time and Watermarks in KafkaIO](https://docs.google.com/document/d/1DyWcLJpALRoUfvYUbiPCDVikYb_Xz2X7Co2aDUVVd4I) | 2018-02-01 18:21:42 | +| 7 | Ben Sidhom | [Portable Beam on Flink](https://s.apache.org/portable-beam-on-flink) | 2018-02-08 19:31:18 | +| 8 | Romain Manni-Bucau | [Apache URL Shortener](https://s.apache.org/) | 2018-02-09 00:47:19 | +| 9 | Matthias Baetens | [Apache Beam YouTube channel: guidelines](https://docs.google.com/document/d/10RNWSU7wGyIOkOvlRbw47wsgkxX2-7G1OKmSlINREus) | 2018-02-23 14:12:58 | +| 10 | Eugene Kirpichov | [Breaking the fusion barrier: Deep splitting of Beam instruction graphs](https://s.apache.org/beam-breaking-fusion) | 2018-02-23 17:20:59 | +| 11 | Chamikara Jayalath | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-2.4.0-burndown) | 2018-02-27 15:44:47 | +| 12 | Etienne Chauchot | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2018-03-09 04:02:30 | +| 13 | Daniel Oliveira | [Structure and Lifting of Combines](https://docs.google.com/document/d/1-3mEs3Y7bIkJ0hmQ6SiHpVIFu5vbY6Zcpw-7tOMVg4U) | 2018-03-09 13:19:52 | +| 14 | Kenneth Knowles | [User Defined Metrics API](https://s.apache.org/beam-metrics-api) | 2018-03-09 18:44:42 | +| 15 | Alex Amato | [Apache Beam Fn API : Defining and adding SDK Metrics](https://docs.google.com/document/d/1MtBZYV7NAcfbwyy9Op8STeFNBxtljxgy69FkHMvhTMA) | 2018-03-13 23:47:03 | +| 16 | Griselda Cuevas | [[Notes] Apache Beam Community Summit - March 2018](https://docs.google.com/document/d/1B4EU8jjZy9TnlRiZWW9hSCqegbmOh8boIgjkAyOfnCk) | 2018-03-14 01:56:40 | +| 17 | Lukasz Cwik | [Slack for ASF projects - Apache Infrastructure Website](https://s.apache.org/slack-invite) | 2018-03-14 14:12:42 | +| 18 | Henning Rohde | [RFC: Apache Beam Go SDK design](https://s.apache.org/beam-go-sdk-design-rfc) | 2018-03-16 13:55:07 | +| 19 | Pablo Estrada | [Apache Beam Summit: Notes on Contributor Experience](https://docs.google.com/document/d/1WaK39qrrG_P50FOMHifJhrdHZYmjOOf8MgoObwCZI50) | 2018-03-20 18:40:42 | +| 20 | Robbe Sneyders | [Apache Beam: Python 3 
support](https://docs.google.com/document/d/1xDG0MWVlDKDPu_IW9gtMvxi2S9I0GB0VDTkPhjXT0nE) | 2018-03-23 12:27:43 | +| 21 | Lukasz Cwik | [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn) | 2018-03-26 15:41:45 | +| 22 | Alex Amato | [Apache Beam Fn API : Defining and adding SDK Metrics](https://s.apache.org/beam-fn-api-metrics) | 2018-04-10 12:53:39 | +| 23 | Kenneth Knowles | [Cross-language Beam Pipelines](https://s.apache.org/beam-mixed-language-pipelines) | 2018-04-13 10:58:01 | +| 24 | Robert Burke | [Apache Beam Go SDK - Vanity Import Path](https://s.apache.org/go-beam-vanity-import) | 2018-04-16 19:56:45 | +| 25 | Scott Wegner | [Log in - ASF JIRA](https://s.apache.org/beam-gradle-migration) | 2018-04-17 18:15:29 | +| 26 | Chamikara Jayalath | [Apache Kafka for Beam Python SDK](https://docs.google.com/document/d/1ogRS-e-HYYTHsXi_l2zDUUOnvfzEbub3BFkPrYIOawU) | 2018-04-28 01:06:26 | +| 27 | Henning Rohde | [Apache Beam Fn API: SDK Harness container contract](https://s.apache.org/beam-fn-api-container-contract) | 2018-05-04 19:31:50 | +| 28 | Henning Rohde | [Apache Beam Go SDK integration tests](https://docs.google.com/document/d/1jy6EE7D4RjgfNV0FhD3rMsT1YKhnUfcHRZMAlC6ygXw) | 2018-05-08 21:13:42 | +| 29 | Kenneth Knowles | [Triggers in Apache Beam (incubating)](https://s.apache.org/beam-triggers) | 2018-05-10 10:57:34 | +| 30 | Henning Rohde | [Apache Beam Portability Support Matrix - Google Sheets](https://docs.google.com/spreadsheets/d/1KDa_FGn1ShjomGd-UUDOhuh2q73de2tPz6BqHpzqvNI) | 2018-05-11 13:38:03 | +| 31 | Yifan Zou | [Reproducible Environment for Jenkins Tests By Using Containers](https://docs.google.com/document/d/1U7FeVMiHiBP-pFm4ULotqG1QqZY0fi7g9ZwTmeIgvvM) | 2018-05-11 18:25:44 | +| 32 | Lukasz Cwik | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing) | 2018-05-14 12:44:02 | +| 33 | Ankur Goenka | [Launching a Portable Pipeline](https://docs.google.com/document/d/1xOaEEJrMmiSHprd-WiYABegfT129qqF-idUBINjxz8s) | 2018-05-14 15:33:31 | +| 34 | Anton Kedin | [Eventual PAssert](https://docs.google.com/document/d/1X_3KH_6QyfOSnh5kNK-fHlkEDrwPVpA2RnRggMMxhUk) | 2018-05-14 16:56:57 | +| 35 | Lukasz Cwik | [Apache Beam Fn API: Processing a Bundle](https://s.apache.org/beam-fn-api-processing-a-bundle) | 2018-05-14 19:54:19 | +| 36 | Etienne Chauchot | [User Defined Metrics API](https://docs.google.com/document/d/1voyUIQ2DrWkoY-BsJwM8YvF4gGKB76CDG8BYL8XBc7A) | 2018-05-15 03:56:05 | +| 37 | Lukasz Cwik | [State and Timers for DoFn in Apache Beam (incubating)](https://s.apache.org/beam-state) | 2018-05-15 11:38:56 | +| 38 | Udi Meiri | [Beam Fast Precommits](https://docs.google.com/document/d/1udtvggmS2LTMmdwjEtZCcUQy6aQAiYTI3OrTP8CLfJM) | 2018-05-17 21:53:52 | +| 39 | Charles Chen | [Redirecting](https://s.apache.org/beam-python-user-state-and-timers) | 2018-05-21 20:15:40 | +| 40 | Eugene Kirpichov | [Splittable DoFn proposal](http://s.apache.org/splittable-do-fn) | 2018-05-22 15:34:26 | +| 41 | Charles Chen | [Beam Python User State and Timer APIs](https://docs.google.com/document/d/1GadEkAmtbJQjmqiqfSzGw3b66TKerm8tyn6TK4blAys) | 2018-05-23 14:49:34 | +| 42 | Daniel Oliveira | [Structure and Lifting of Combines](https://s.apache.org/beam-runner-api-combine-model) | 2018-05-23 16:57:04 | +| 43 | Ankur Goenka | [Portable Artifact Staging](https://docs.google.com/document/d/12zNk3O2nhTB8Zmxw5U78qXrvlk5r42X8tqF248IDlpI) | 2018-05-23 21:54:37 | +| 44 | Kenneth Knowles | [A New 
DoFn](https://s.apache.org/a-new-dofn) | 2018-05-24 09:22:42 | +| 45 | Yifan Zou | [Automation For Beam SDK Dependency Check](https://docs.google.com/document/d/1rqr_8a9NYZCgeiXpTIwWLCL7X8amPAVfRXsO72BpBwA) | 2018-05-24 10:08:17 | +| 46 | Lukasz Cwik | [Apache Beam Fn Api Overview](https://s.apache.org/beam-fn-api) | 2018-05-25 13:23:55 | +| 47 | Chamikara Jayalath | [Managing Beam Dependencies](https://docs.google.com/document/d/15m1MziZ5TNd9rh_XN0YYBJfYkt0Oj-Ou9g0KFDPL2aA) | 2018-05-31 22:11:31 | +| 48 | Griselda Cuevas | [June Beam Newsletter](https://docs.google.com/document/d/1BwRhOu-uDd3SLB_Om_Beke5RoGKos4hj7Ljh7zM2YIo) | 2018-06-01 15:19:58 | +| 49 | Lukasz Cwik | [Apache Beam Portability: Modeling, scheduling and executing timers](https://s.apache.org/beam-portability-timers) | 2018-06-04 18:00:34 | +| 50 | Austin Bennett | [Walkthrough with a first contributor to Beam](https://docs.google.com/document/d/1hq-s3L676LkMTftvhv0eCkdwrRnZmCRiLdaQBWLHWWA) | 2018-06-05 16:39:49 | +| 51 | Andrew Pilloud | [Beam SQL Pipeline Options](https://docs.google.com/document/d/1UTsSBuruJRfGnVOS9eXbQI6NauCD4WnSAPgA_Y0zjdk) | 2018-06-06 12:24:36 | +| 52 | Alan Myrvold | [Apache Beam Contribution Guide Improvements](https://docs.google.com/document/d/1zukoPXPgUq3Vli_rOJ0ykzK6NbR6g-FrgSHZjLd23bo) | 2018-06-06 14:54:52 | +| 53 | Griselda Cuevas | [Apache Beam Roadmap - Google Sheets](https://docs.google.com/spreadsheets/d/1W6xvPmGyG8Nd9R7wkwgwRJvZdyLoBg6F3NCrPafbKmk) | 2018-06-06 20:04:54 | +| 54 | Kenneth Knowles | [Beam SQL: Integrating runners & IO](https://s.apache.org/beam-sql-packaging) | 2018-06-11 13:44:27 | +| 55 | Sindy Li | [Interactive Beam Pipelines](https://docs.google.com/document/d/10bTc97GN5Wk-nhwncqNq9_XkJFVVy0WLT4gPFqP6Kmw) | 2018-06-13 14:48:35 | +| 56 | Robin Qiu | [@RequiresStableInput Design Doc](https://docs.google.com/document/d/117yRKbbcEdm3eIKB_26BHOJGmHSZl1YNoF0RqWGtqAM) | 2018-06-14 16:58:45 | +| 57 | Rui Wang | [Unbounded with limit](https://docs.google.com/document/d/13zeTewHH9nfwhSlcE4x77WQwr1U2Z4sTiNRjOXUj2aw) | 2018-06-18 21:20:24 | +| 58 | Boyuan Zhang | [Process of Building Python Wheels](https://docs.google.com/document/d/1HHnUkiFmwAVQwMJ-BjAX3LOmkC44Gb3jvGIJvSMC84s) | 2018-06-19 19:03:55 | +| 59 | N/A | [Apache Beam version 2.5.0](https://docs.google.com/document/d/1BeqHuH1U8iOFJWTfFPW_4O2HtLRZm9rlnEUyMB6Eq7M) | 2018-06-23 00:42:06 | +| 60 | Kenneth Knowles | [Google Calendar - Easier Time Management, Appointments & Scheduling](https://s.apache.org/beam-release-calendar) | 2018-06-25 17:47:30 | +| 61 | Thomas Weise | [Apache Beam Portability Prototype](https://s.apache.org/beam-portability-team-doc) | 2018-06-27 08:35:21 | +| 62 | Yifan Zou | [Beam SDK Dependency Ownership 2018-06-26 - Google Sheets](https://docs.google.com/spreadsheets/d/12NN3vPqFTBQtXBc0fg4sFIb9c_mgst0IDePB_0Ui8kE) | 2018-06-27 11:52:00 | +| 63 | Yifan Zou | [Tracking Beam Dependency Upgrades](https://docs.google.com/document/d/1XXTMnofizSQZSorZy4NFlxKfx5f17PGRmws-DngoTms) | 2018-07-10 18:17:30 | +| 64 | Alan Myrvold | [Apache Beam Contributor Metrics: Collection, Display, Actions](https://s.apache.org/beam-contributor-metrics) | 2018-07-11 13:17:49 | +| 65 | Boyuan Zhang | [Proposal of Building Python Wheels](https://docs.google.com/document/d/1MRVFs48e6g7wORshr2UpuOVD_yTSJTbmR65_j8XbGek) | 2018-07-16 17:47:36 | +| 66 | Mikhail Gryzykhin | [Proposal of starting contributors wiki](https://docs.google.com/document/d/1qLojdA6GheKf0PVl1A1uip3D2JTk9jQroUKuxriBcfY) | 2018-07-19 17:21:33 | +| 67 | David 
Cavazos | [Community Repository Proposal](https://docs.google.com/document/d/1vhcKJlP0qH1C7NZPDjohT2PUbOD-k71avv1CjEYapdw) | 2018-08-01 18:12:15 | +| 68 | Boyuan Zhang | [Apache Beam Release Acceptance Criteria - Google Sheets](https://s.apache.org/beam-release-validation) | 2018-08-01 19:03:16 | +| 69 | Rose Nguyen | [August Beam Newsletter](https://docs.google.com/document/d/124klHcJcIi_gD6rvMXwbbToINl1KTdXPmYrgGB998FQ) | 2018-08-02 19:18:01 | +| 70 | Pablo Estrada | [Apache Beam version 2.6.0](https://docs.google.com/document/d/1Jwz5AxInSm9C6z0TZqer6JYE2gpbJ_dZn88X37VvQEY) | 2018-08-08 19:20:39 | +| 71 | Rose Nguyen | [Interactive Beam Pipelines](http://s.apache.org/interactive-beam) | 2018-08-10 15:37:11 | +| 72 | Yifan Zou | [Reproducible Environment for Jenkins Tests By Using Docker](https://docs.google.com/document/d/1y0YuQj_oZXC0uM5-gniG7r9-5gv2uiDhzbtgYYJW48c) | 2018-08-27 13:53:14 | +| 73 | Ankur Goenka | [SDK Harness Concurrency Requirements](https://docs.google.com/document/d/1oAXVPbJ0dzj2_8LXEWFAgqCP5Tpld3q5B3QU254PQ6A) | 2018-08-29 20:53:36 | +| 74 | Lukasz Cwik | [Bundles w/ SplittableDoFns: Signals & Splitting](https://s.apache.org/beam-bundles-backlog-splitting) | 2018-08-30 21:13:31 | +| 75 | Robbe Sneyders | [Apache Beam: Python 3 support](https://s.apache.org/beam-python-3) | 2018-09-05 12:11:52 | +| 76 | Lukasz Cwik | [Apache Beam (Incubating) Pipeline Runner API Proposal](https://docs.google.com/document/d/1bao-5B6uBuf-kwH1meenAuXXS0c9cBQ1B2J59I3FiyI) | 2018-09-07 13:15:48 | +| 77 | Maximilian Michels | [September Beam Newsletter](https://docs.google.com/document/d/1PE97Cf3yoNcx_A9zPzROT_kPtujRZLJoZyqIQfzqUvY) | 2018-09-10 10:20:01 | +| 78 | Rose Nguyen | [Redirecting](http://s.apache.org/beam-python-user-state-and-timers) | 2018-09-10 22:56:03 | +| 79 | David Moravek | [Streaming Hadoop OutputFormat](https://s.apache.org/beam-streaming-hofio) | 2018-09-13 10:48:37 | +| 80 | Udi Meiri | [Simplifying Beam Pipelines](https://s.apache.org/no-beam-pipeline) | 2018-09-24 12:37:24 | +| 81 | Eugene Kirpichov | [(Apachecon 2018) Robust, performant and modular APIs for data ingestion with Apache Beam - Google Slides](https://s.apache.org/beam-modular-io-talk) | 2018-09-26 17:16:27 | +| 82 | Charles Chen | [Apache Beam version 2.7.0](https://docs.google.com/document/d/1jIk0pc8CxTMmtz5b7UL0gSPxmjKnyerVFS6FcpP2Ym8) | 2018-09-30 19:59:19 | +| 83 | N/A | [Splittable DoFn Public Meeting 9/14/18](https://docs.google.com/document/d/1kjJLGIiNAGvDiUCMEtQbw8tyOXESvwGeGZLL-0M06fQ) | 2018-10-03 01:07:14 | +| 84 | Etienne Chauchot | [Etienne Chauchot Universal metrics with Beam - Google Slides](https://s.apache.org/universal-metrics) | 2018-10-03 11:08:07 | +| 85 | Thomas Weise | [Apache Beam Portability Support Matrix - Google Sheets](https://s.apache.org/apache-beam-portability-support-table) | 2018-10-08 12:43:08 | +| 86 | Plajt, Vaclav | [Java 8 Euphoria DSL](https://s.apache.org/beam-euphoria) | 2018-10-10 04:26:18 | +| 87 | Udi Meiri | [Beam Python SDK: Datastore Client Upgrade](https://docs.google.com/document/d/1sL9p7NE5Z0p-5SB5uwpxWrddj_UCESKSrsvDTWNKqb4) | 2018-10-16 21:59:13 | +| 88 | Lukasz Cwik | [Apache Beam Portability API: How to Finalize Bundles](https://s.apache.org/beam-finalizing-bundles) | 2018-10-24 17:37:58 | +| 89 | Scott Wegner | [Grafana](https://s.apache.org/beam-community-metrics) | 2018-10-26 01:36:26 | +| 90 | Lukasz Cwik | [Apache Beam Portability API: How to Checkpoint and Split 
Bundles](https://docs.google.com/document/d/1cKOB9ToasfYs1kLWQgffzvIbJx2Smy4svlodPRhFrk4) | 2018-10-26 18:07:27 | +| 91 | Alex Amato | [MonitoringInfo spec proposal.](https://docs.google.com/document/d/1SB59MMVZXO0Aa6w0gf4m0qM4oYt4SiofDq3QxnpQaK4) | 2018-10-29 20:19:36 | +| 92 | N/A | [Snapshotting and Updating Beam Pipelines](https://docs.google.com/document/d/1UWhnYPgui0gUYOsuGcCjLuoOUlGA4QaY91n8p3wz9MY) | 2018-11-05 10:24:14 | +| 93 | Lukasz Cwik | [Apache Beam Portability API: How to Checkpoint and Split Bundles](https://s.apache.org/beam-checkpoint-and-split-bundles) | 2018-11-06 19:45:31 | +| 94 | Ruoyun Huang | [[BEAM-5448] Java Reference Runner in Python](https://docs.google.com/document/d/1S86saZqiDaE_M5wxO0zOQ_rwC6QHv7sp1BmGTm0dLNE) | 2018-11-08 20:04:52 | +| 95 | Jeff Klukas | [AutoValue Coding and Row Support](https://docs.google.com/document/d/1ucoik4WzUDfilqIz3I1AuMHc1J8DE6iv7gaUCDI42BI) | 2018-11-09 16:50:19 | +| 96 | Chamikara Jayalath | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-2.9.0-burndown) | 2018-11-15 16:59:15 | +| 97 | Kenneth Knowles | [User Defined Metrics API](http://s.apache.org/beam-metrics-api) | 2018-11-20 22:19:47 | +| 98 | Lukasz Cwik | [Apache Beam Fn API: Fn State API and Bundle Processing](https://docs.google.com/document/d/1BOozW0bzBuz4oHJEuZNDOHdzaV5Y56ix58Ozrqm2jFg) | 2018-11-28 15:01:06 | +| 99 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - December '18](https://docs.google.com/document/d/1HenFg37xyNuFC7A4zkqmBPY9_Gqdi6LgPtF6wfoEix8) | 2018-11-30 01:14:11 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2019.md b/contributor-docs/discussion-docs/2019.md new file mode 100644 index 000000000000..b48fc8b87d8c --- /dev/null +++ b/contributor-docs/discussion-docs/2019.md @@ -0,0 +1,103 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2019 +| No. 
| Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Robert Burke | [[Go SDK] User Defined Coders](https://docs.google.com/document/d/1kQwx4Ah6PzG8z2ZMuNsNEXkGsLXm6gADOZaIO7reUOg) | 2019-01-03 14:45:11 | +| 2 | Mikhail Gryzykhin | [Grafana](https://s.apache.org/beam-community-metrics) | 2019-01-03 20:38:59 | +| 3 | Manu Zhang | [Apache Beam: Python 3 support](https://s.apache.org/beam-python-3) | 2019-01-05 00:03:01 | +| 4 | Thomas Weise | [Copy of [Proposal] Apache Beam Summit Europe 2018](https://docs.google.com/document/d/1h0y85vxt0AGYdz6SZCbV2jzUGs46_M-keUZTMsm2R0I) | 2019-01-06 17:54:55 | +| 5 | Matthias Baetens | [[Public] Beam Summit London 2018 - Google Sheets](https://docs.google.com/spreadsheets/d/1pNoeLD0JBImc1-gja209mSuTVLNna_b1gmQ1r-_Z5t0) | 2019-01-07 02:25:46 | +| 6 | Maximilian Michels | [Grafana](http://s.apache.org/beam-community-metrics) | 2019-01-07 10:21:22 | +| 7 | Kenneth Knowles | [Apache Beam Portability API: How to Checkpoint and Split Bundles](https://s.apache.org/beam-checkpoint-and-split-bundles) | 2019-01-14 22:26:04 | +| 8 | Heejong Lee | [BigQuery Streaming Insert Benchmark](https://docs.google.com/document/d/1EhRNWLevm86GD_QtvlrTauHITVMwQBzuemyp-w4Z_ck) | 2019-01-16 21:53:03 | +| 9 | Alex Amato | [Refactoring Java State Sampler](https://docs.google.com/document/d/1OlAJf4T_CTL9WRH8lP8uQOfLjWYfm8IpRXSe38g34k4) | 2019-01-17 13:48:54 | +| 10 | Alexey Romanenko | [Apache Beam Fn Api Overview](https://s.apache.org/beam-fn-api) | 2019-01-18 10:49:30 | +| 11 | junwan01@gmail.com | [[BEAM-5448] Java Reference Runner in Python](https://docs.google.com/document/d/1S86saZqiDaE_M5wxO0zOQ_rwC6QHv7sp1BmGTm0dLNE) | 2019-01-21 14:32:05 | +| 12 | Robert Bradshaw | [Cross-language Beam Pipelines](https://s.apache.org/beam-mixed-language-pipelines) | 2019-01-22 07:39:05 | +| 13 | Chamikara Jayalath | [Cross-language transforms primer](https://docs.google.com/document/d/1H3yCyVFI9xYs1jsiF1GfrDtARgWGnLDEMwG5aQIx2AU) | 2019-01-22 16:44:33 | +| 14 | Etienne Chauchot | [Runner supported features plugin](https://docs.google.com/document/d/1eXt54ht0h7-pPbP-MJR0N5nzmxRRlAwbFod-LXI1x0A) | 2019-01-23 04:00:40 | +| 15 | Maximilian Michels | [Quick presentation of Nexmark Code](https://docs.google.com/document/d/1VgnGiVu8vSfm7Et-xAtQYv0PlEpqeyfmhpQUNPmWRJs) | 2019-01-28 12:13:16 | +| 16 | Kenneth Knowles | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2019-01-28 12:24:12 | +| 17 | Matthias Baetens | [Apache Beam Portability Support Matrix - Google Sheets](https://s.apache.org/apache-beam-portability-support-table) | 2019-01-31 06:06:06 | +| 18 | Alex Amato | [Apache Beam Fn API: Get Metrics API: Metric Extraction via proto RPC API.](https://s.apache.org/get-metrics-api) | 2019-01-31 14:45:55 | +| 19 | Maximilian Michels | [Cross-Language Pipelines & Legacy IO](https://s.apache.org/beam-cross-language-io) | 2019-02-07 13:59:12 | +| 20 | Austin Bennett | [A Brief Intro to Apache Beam (Feb 2019) - Google Slides](http://s.apache.org/beam-intro-feb-2019) | 2019-02-11 16:51:27 | +| 21 | Daniel Oliveira | [Error - ASF JIRA](https://s.apache.org/beam-test-failure) | 2019-02-11 18:21:19 | +| 22 | Alex Amato | [Apache Beam Fn API: Get Metrics API: Metric Extraction via proto RPC API.](https://docs.google.com/document/d/1p7mRCUEigkrWickqCLCHBshrqQ97YIv1E5cZxJTKx3I) | 2019-03-12 20:22:35 | +| 23 | Udi Meiri | [Beam Python SDK: Datastore Client 
Upgrade](https://docs.google.com/document/d/1sL9p7NE5Z0p-5SB5uwpxWrddj_UCESKSrsvDTWNKqb4) | 2019-03-19 13:24:53 | +| 24 | Valentyn Tymofieiev | [BQ IO bytes support](https://docs.google.com/document/d/19zvDycWzF82MmtCmxrhqqyXKaRq8slRIjdxE6E8MObA) | 2019-03-25 22:19:11 | +| 25 | Tanay Tummalapalli | [GSoC proposal for Apache Beam - T. Tanay](https://docs.google.com/document/d/15Peyd3Z_wu5rvGWw8lMLpZuTyyreM_JOAEFFWvF97YY) | 2019-03-28 14:58:08 | +| 26 | Lukasz Cwik | [Implement AWS S3 and Azure Blob filesystems for Python SDK Apache - Beam](https://docs.google.com/document/d/1i_PoIrbmhNgwKCS1TYWC28A9RsyZQFsQCJic3aCXO-8) | 2019-04-03 11:45:41 | +| 27 | Yifan Zou | [Apache Beam Jenkins Agents Management](https://docs.google.com/document/d/1c38IPrF94PZC-ItGZgmAgAKrgmC1MGA6N6nkK0cL6L4) | 2019-04-03 21:34:57 | +| 28 | Kenneth Knowles | [Side Input Architecture for Apache Beam (incubating) 1-Pager](https://s.apache.org/beam-side-inputs-1-pager) | 2019-04-11 13:09:25 | +| 29 | Kyle Weaver | [Beam Portable Spark Runner](https://docs.google.com/document/d/1j8GERTiHUuc6CzzCXZHc38rBn41uWfATBh2-5JN8hro) | 2019-04-12 14:33:41 | +| 30 | Kenneth Knowles | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-starter-tasks) | 2019-04-17 17:58:24 | +| 31 | Robert Burke | [[Go SDK] Versioning](https://docs.google.com/document/d/1ZjP30zNLWTu_WzkWbgY8F_ZXlA_OWAobAD9PuohJxPg) | 2019-04-18 00:56:34 | +| 32 | Maximilian Michels | [Beam Application Season of Docs - 2019](https://docs.google.com/document/d/1FNf-BjB4Q7PDdqygPboLr7CyIeo6JAkrt0RBgs2I4dE) | 2019-04-23 08:36:01 | +| 33 | Maximilian Michels | [[DISCUSS] FLIP-38 Support python language in flink Table API](https://docs.google.com/document/d/1ybYt-0xWRMa1Yf5VsuqGRtOfJBz4p74ZmDxZYg3j_h8) | 2019-04-24 09:20:38 | +| 34 | Rui Wang | [Complex Equi-join Condition Support in BeamSQL](https://docs.google.com/document/d/1vDiE4HR5ZdbZypIf1vzyFy9yKmAMWBu6BzBKfP7JhPc) | 2019-04-26 13:12:18 | +| 35 | Etienne Chauchot | [Structured streaming Spark Runner](https://s.apache.org/spark-structured-streaming-runner) | 2019-04-30 13:11:32 | +| 36 | Maximilian Michels | [Artifact Staging in Cross-Language Pipelines](https://docs.google.com/document/d/1XaiNekAY2sptuQRIXpjGAyaYdSc-wlJ-VKjl04c8N48) | 2019-05-07 13:21:30 | +| 37 | Udi Meiri | [Beam Type Hints for Python 3](https://docs.google.com/document/d/15bsOL3YcUWuIjnxqhi9nanhj2eh9S6-QlLYuL7ufcXY) | 2019-05-08 12:28:42 | +| 38 | Lukasz Cwik | [Beam Proposal: Pipeline Drain](https://docs.google.com/document/d/1NExwHlj-2q2WUGhSO4jTu8XGhDPmm3cllSN8IMmWci8) | 2019-05-08 15:58:04 | +| 39 | Kenneth Knowles | [Portable Beam Schemas](https://s.apache.org/beam-schemas) | 2019-05-09 04:05:18 | +| 40 | Jan Lukavsky | [@RequiresTimeSortedInput design doc](https://docs.google.com/document/d/1ObLVUFsf1NcG8ZuIZE4aVy2RYKx2FfyMhkZYWPnI9-c) | 2019-05-23 10:10:37 | +| 41 | Brian Hulette | [Portable Beam Schemas](https://docs.google.com/document/d/1uu9pJktzT_O3DxGd1-Q2op4nRk4HekIZbzi-0oTAips) | 2019-05-24 14:41:42 | +| 42 | Alireza Samadian | [Cost-based Optimization in Beam SQL](https://docs.google.com/document/d/1vi1PBBu5IqSy-qZl1Gk-49CcANOpbNs1UAud6LnOaiY) | 2019-05-31 18:54:48 | +| 43 | Lukasz Cwik | [Apache Beam Board Report Drafting Doc - June 2019](https://docs.google.com/document/d/1GY16lzVKL-mPh4M560AtqPAB1kXEptkhcBymvFr-4z8) | 2019-06-13 10:47:01 | +| 44 | Claire McGinty | [Apache Beam Design Doc - Sort Merge Bucket Source+Sink](https://docs.google.com/document/d/1AQlonN8t4YJrARcWzepyP7mWHTxHAd6WIECwk1s3LQQ) | 2019-06-17 17:12:15 | +| 45 
| Lukasz Cwik | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2019-06-25 13:24:03 | +| 46 | Cyrus Maden | [Beam pipelines diagrams proposal](https://docs.google.com/document/d/1khf9Bx4XJWsKUD6J1eDcYo_8dL9LBoHDtJpyDjDzOMM) | 2019-06-25 16:16:56 | +| 47 | Lukasz Cwik | [Apache Beam Portability: Modeling, scheduling and executing timers](https://docs.google.com/document/d/1GRL88rKLHbMR0zJnBHYwM4xtj66VYlB112EWVUFcGB0) | 2019-06-27 18:38:57 | +| 48 | Rui Wang | [Retractions in Beam](https://docs.google.com/document/d/14WRfxwk_iLUHGPty3C6ZenddPsp_d6jhmx0vuafXqmE) | 2019-07-10 13:58:36 | +| 49 | Kenneth Knowles | [Streaming Beam SQL Extensions](https://s.apache.org/streaming-beam-sql) | 2019-07-18 13:51:09 | +| 50 | Rui Wang | [Lateness (and Panes) in Apache Beam (incubating)](https://docs.google.com/document/d/12r7frmxNickxB5tbpuEh_n35_IJeVZn1peOrBrhhP6Y) | 2019-07-18 14:16:59 | +| 51 | Rakesh Kumar | [Apache Beam Fn API: Fn State API and Bundle Processing](https://docs.google.com/document/d/1BOozW0bzBuz4oHJEuZNDOHdzaV5Y56ix58Ozrqm2jFg) | 2019-07-24 00:20:38 | +| 52 | Thomas Weise | [[Public] Beam Flink K8s](https://docs.google.com/document/d/1z3LNrRtr8kkiFHonZ5JJM_L4NWNBBNcqRc_yAf6G0VI) | 2019-07-25 00:00:52 | +| 53 | Eugene Kirpichov | [Running Splittable DoFn via Source API](http://s.apache.org/sdf-via-source) | 2019-07-25 17:09:02 | +| 54 | Anton Kedin | [Apache Beam Release Acceptance Criteria - Google Sheets](https://s.apache.org/beam-release-validation#gid=1082148452) | 2019-07-25 17:39:20 | +| 55 | Rui Wang | [ZetaSQL Integration In BeamSQL](https://docs.google.com/document/d/14Yi4oEMzqS3n9-LfSNi6Q6kQpEP3gWTHzX0HxqUksdc) | 2019-08-04 14:06:53 | +| 56 | Kyle Weaver | [Beam Portable Job Jars](https://docs.google.com/document/d/1kj_9JWxGWOmSGeZ5hbLVDXSTv-zBrx4kQRqOq85RYD4) | 2019-08-06 20:03:02 | +| 57 | Daniel Oliveira | [Splittable DoFns for Go SDK](https://docs.google.com/document/d/14IwJYEUpar5FmiPNBFvERADiShZjsrsMpgtlntPVCX0) | 2019-08-07 14:57:39 | +| 58 | Tanay Tummalapalli | [[BEAM-7742] BQ file loads hardening](https://s.apache.org/beam-bqfl-hardening) | 2019-08-07 15:19:57 | +| 59 | Chad Dombrova | [[Flink Design] Fine Grained Resource Management](https://docs.google.com/document/d/1h68XOG-EyOFfcomd2N7usHK1X429pJSMiwZwAXCwx1k) | 2019-08-13 13:58:39 | +| 60 | Ning Kang | [Interactive Beam Pipeline Execution Design Overview](https://docs.google.com/document/d/1DYWrT6GL_qDCXhRMoxpjinlVAfHeVilK5Mtf8gO6zxQ) | 2019-08-13 20:00:01 | +| 61 | Lukasz Cwik | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing?) 
| 2019-08-14 11:13:38 | +| 62 | Maximilian Michels | [Caching in the Fn API](https://docs.google.com/document/d/1ltVqIW0XxUXI6grp17TgeyIybk3-nDF8a0-Nqw-s9mY) | 2019-08-14 12:50:01 | +| 63 | Robert Bradshaw | [Simplifying Beam Pipelines](https://s.apache.org/no-beam-pipeline) | 2019-08-23 14:36:31 | +| 64 | Sam Bourne | [Apache Beam Containers](https://docs.google.com/document/d/1IKE_aEkrAzkzUE4pD_r_zVuL5amHGetJ1efnbTfmunM) | 2019-08-28 19:39:02 | +| 65 | Lukasz Cwik | [Apache Beam (Incubating) Pipeline Runner API Proposal](http://s.apache.org/beam-runner-api) | 2019-08-30 11:36:29 | +| 66 | Pablo Estrada | [Performance of Apache Beam doc pages](https://s.apache.org/beam-ga-report) | 2019-09-06 13:47:48 | +| 67 | Alexey Strokach | [File and Stream Caching in Apache Beam](https://docs.google.com/document/d/1sISjl4Q60mR1V22R1UZd417wVEn_EmZT-SalTHXG4H0) | 2019-09-06 15:30:23 | +| 68 | Lukasz Cwik | [FileIO.write](http://s.apache.org/fileio-write) | 2019-09-11 13:45:01 | +| 69 | Kenneth Knowles | [Apache Beam Board Report Sept 2019](https://docs.google.com/document/d/1NIoeLvJPCirsFRy0F_OBYLr3zdcTRPHr-gH3a5cLJBE) | 2019-09-11 16:19:41 | +| 70 | Bharath Kumara Subramanian | [Async ParDo API for Apache Beam](https://docs.google.com/document/d/1t--UYXgaij0ULEoXUnhG3r8OZPBljN9r_WWlwQJBDrI) | 2019-09-13 12:31:27 | +| 71 | Etienne Chauchot | [Watermarks and multiple aggregates in Spark strucutred streaming](https://docs.google.com/document/d/1IAH9UQJPUiUCLd7H6dazRK2k1szDX38SnM6GVNZYvUo) | 2019-09-18 09:18:25 | +| 72 | Cam Mach | [Beam KinesisIO V2 Migration](https://docs.google.com/document/d/1XeIVbiDHBReZY8rEI2OWA3cTEQuaR7RPdwGAup6S1DM) | 2019-09-30 08:02:08 | +| 73 | Kirill Kozlov | [Beam SQL Filter/Project push-down](https://docs.google.com/document/d/1-ysD7U7qF3MAmSfkbXZO_5PLJBevAL9bktlLCerd_jE) | 2019-09-30 13:55:34 | +| 74 | Jan Lukavsky | [KeyedPCollection design doc](https://docs.google.com/document/d/1fdAz-IvFQ3xxYOGskByo_9lI82iQlAtPvZHZb-h3z20) | 2019-10-15 11:16:03 | +| 75 | Pablo Estrada | [Streaming Fn API Runner](http://s.apache.org/streaming-fn-runner-py) | 2019-10-15 17:10:24 | +| 76 | Lukasz Cwik | [Apache Beam Fn API: Processing a Bundle](https://s.apache.org/beam-fn-api-processing-a-bundle) | 2019-10-17 10:06:30 | +| 77 | Chad Dombrova | [Kubernetes native integration](https://docs.google.com/document/d/1-jNzqGF6NfZuwVaFICoFQ5HFFXzF5NVIagUZByFMfBY) | 2019-10-19 14:54:43 | +| 78 | jincheng sun | [Apache Beam Fn API: DoFn Teardown](https://docs.google.com/document/d/1sCgy9VQPf9zVXKRquK8P6N4x7aB62GEO8ozkujRSHZg) | 2019-10-21 07:26:53 | +| 79 | Luke Cwik | [Apache Beam Fn API: SDK Harness container contract](https://docs.google.com/document/d/1n6s3BOxOPct3uF4UgbbI9O9rpdiKWFH9R6mtVmR7xp0) | 2019-10-21 14:05:26 | +| 80 | Ryan Skraba | [Slack](https://s.apache.org/beam-slack-channel) | 2019-10-23 12:32:03 | +| 81 | Robert Burke | [Splittable DoFns for Go SDK](https://s.apache.org/beam-go-sdf) | 2019-10-28 12:14:47 | +| 82 | Yichi Zhang | [Expose SDKHarness status to runner](https://docs.google.com/document/d/1W77buQtdSEIPUKd9zemAM38fb-x3CvOoaTF4P2mSxmI) | 2019-10-29 15:14:04 | +| 83 | Pablo Estrada | [Performance of Apache Beam doc pages](http://s.apache.org/beam-ga-report) | 2019-11-06 20:41:01 | +| 84 | Heejong Lee | [Runner Validation Test Plan for Cross-language transforms](https://docs.google.com/document/d/1xQp0ElIV84b8OCVz8CD2hvbiWdR8w4BvWxPTZJZA6NA) | 2019-11-08 20:27:41 | +| 85 | Reza Rokni | [HLL in Beam](https://s.apache.org/hll-in-beam) | 2019-11-12 21:35:21 | +| 86 | Kenneth 
Knowles | [DRAFT - Apache Beam Board Report - December '19](https://docs.google.com/document/d/1AJT5j-qRLJPeN5x6nbHD5KqadXLM0zT0Ugmiy_vQ7C8) | 2019-12-12 00:07:40 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2020.md b/contributor-docs/discussion-docs/2020.md new file mode 100644 index 000000000000..47df10398628 --- /dev/null +++ b/contributor-docs/discussion-docs/2020.md @@ -0,0 +1,80 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2020 +| No. | Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Kirill Kozlov | [Beam SQL Filter/Project push-down](https://docs.google.com/document/d/1-ysD7U7qF3MAmSfkbXZO_5PLJBevAL9bktlLCerd_jE) | 2020-01-03 13:31:29 | +| 2 | Kirill Kozlov | [DataStoreIO SQL Connector](https://docs.google.com/document/d/1FxuEGewJ3GPDl0IKglfOYf1edwa2m_wryFZYRMpRNbA) | 2020-01-08 14:28:14 | +| 3 | jincheng sun | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2020-01-08 20:37:29 | +| 4 | Daniel Oliveira | [Splittable DoFns for Go SDK](https://s.apache.org/beam-go-sdf) | 2020-01-10 19:54:55 | +| 5 | Brian Hulette | [Leveraging SQL Table Providers for Row-based Cross-Language IOs](https://s.apache.org/xlang-table-provider) | 2020-01-13 17:25:10 | +| 6 | Udi Meiri | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2020-01-13 19:08:49 | +| 7 | Kyle Weaver | [Beam SQL Query Parameters](https://docs.google.com/document/d/1kPG3S6qAIPlhTYPYK_CYwkzBk-i7U6vON5_95tRGk6c) | 2020-01-21 17:19:57 | +| 8 | Alex Van Boxel | [Beam Schema Options](https://docs.google.com/document/d/1yCCRU5pViVQIO8-YAb66VRh3I-kl0F7bMez616tgM8Q) | 2020-01-26 11:27:05 | +| 9 | Daniel Oliveira | [Go SDF API v2 Changes](https://docs.google.com/document/d/1UeG5uNO00xCByGEZzDXk0m0LghX6HBWlMfRbMv_Xiyc) | 2020-01-27 14:59:01 | +| 10 | Alex Van Boxel | [Beam Schema Aware stabilisation](https://docs.google.com/document/d/1WseNjxFXYrpjWjbIxfWUpwHmjTL7WSfj9n2Nh3H_qqA) | 2020-02-05 07:48:30 | +| 11 | Hannah Jiang | [Add licenses and notices for third party libraries to Beam SDK docker images](https://s.apache.org/eauq6) | 2020-02-05 19:43:51 | +| 12 | Alex Van Boxel | [Leveraging SQL Table Providers for Row-based Cross-Language IOs](https://s.apache.org/xlang-table-provider/) | 2020-02-07 14:17:07 | +| 13 | N/A | [Apache Beam Capabilities and Requirements Listings](https://docs.google.com/document/d/1CyVElQDYHBRfXu6k1VSXv3Yok_4r8c4V0bkh2nFAWYc) | 2020-02-14 17:03:48 | +| 14 | Reza Rokni | [HLL in Beam](https://s.apache.org/hll-in-beam) | 2020-02-18 19:31:34 | +| 15 | Kenneth Knowles | [[ANN] Seeking volunteer mentors from all Apache projects to help mentor under-represented contributors - Equity, Diversity & Inclusion - Apache Software Foundation](https://s.apache.org/OutreachyMay2020) | 2020-02-24 16:53:42 | +| 16 | Luke Cwik | [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn) | 2020-02-27 17:43:12 | +| 17 | Aizhamal Nurmamat kyzy | [[Public] Beam Knowledge Architecture Revamp - Proposal](https://docs.google.com/document/d/1HlRHfmc9MvKkFEf2gfIL3RYVxTmBXQL4sXjAGGOjeiI) | 2020-03-10 17:00:09 | +| 18 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - March 2020](https://docs.google.com/document/d/1iBSTPHjTLA4rPdwjnoNAMb0IOMhruZHIx1QlT3H-GaY) | 2020-03-11 17:52:26 | +| 19 | Badrul Chowdhury | [Apache Beam](https://docs.google.com/document/d/173e_gnDclwavqobiNjwxRlo9D1xjaZat98g6Yax0kGQ) | 2020-03-18 
15:56:24 | +| 20 | John Mora | [BEAM-9198 BeamSQL aggregation analytics functionality](https://docs.google.com/document/d/1nUbV45iL_avgAewYYTkyHHJWY8ZaVcFuky-dQ-pcE0M) | 2020-03-23 19:53:44 | +| 21 | Jan Lukavsky | [Data collection for “Smart quarantine”](https://docs.google.com/document/d/1HPRV1SriRd2v95r2I_MYcRkJgiLZs27wPuowigsmO70) | 2020-03-24 08:43:57 | +| 22 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - June 2020](https://s.apache.org/beam-report-2020-06) | 2020-03-25 13:40:49 | +| 23 | Julian Bruno | [Firefly design donation as Beam Mascot](https://docs.google.com/document/d/1zK8Cm8lwZ3ALVFpD1aY7TLCVNwlyTS3PXxTV2qQCAbk) | 2020-04-02 13:18:47 | +| 24 | Maximilian Michels | [[Proposal] Digital Beam Summit 2020](https://docs.google.com/document/d/1OddPOvP36mTTWEXV0DWtyS3MgfXyWOS3YXiZGeLWfSI) | 2020-04-15 10:59:18 | +| 25 | Udi Meiri | [Log In - Apache Software Foundation](https://s.apache.org/asfyaml-notify) | 2020-04-27 18:50:55 | +| 26 | Aizhamal Nurmamat kyzy | [[Community] Beam website redesign](https://docs.google.com/document/d/1btXMkQGqYaU9pUjYh0iCNrbXf4pAHe3tBFsLQ_cLYHg) | 2020-04-28 19:21:07 | +| 27 | Kenneth Knowles | [Error - ASF JIRA](https://s.apache.org/beam-test-failure) | 2020-05-01 17:34:22 | +| 28 | Alex Amato | [Apache Beam Fn API : Defining and adding SDK Metrics](https://docs.google.com/document/d/1MtBZYV7NAcfbwyy9Op8STeFNBxtljxgy69FkHMvhTMA) | 2020-05-04 17:44:08 | +| 29 | Alex Amato | [Apache Beam Fn API: Histogram Style Metrics](https://docs.google.com/document/d/1kiNG2BAR-51pRdBCK4-XFmc0WuIkSuBzeb__Zv8owbU) | 2020-05-04 18:33:08 | +| 30 | Alex Amato | [Apache Beam Fn API: Histogram Style Metrics](https://s.apache.org/beam-histogram-metrics) | 2020-05-04 18:58:40 | +| 31 | Alex Amato | [Apache Beam FN API: GCP IO Debuggability Metrics](https://s.apache.org/beam-gcp-debuggability) | 2020-05-05 21:23:30 | +| 32 | Luke Cwik | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-starter-tasks) | 2020-05-08 11:43:01 | +| 33 | Luke Cwik | [Structure and Lifting of Combines](https://s.apache.org/beam-runner-api-combine-model) | 2020-05-12 11:04:57 | +| 34 | Aizhamal Nurmamat kyzy | [Slack for ASF projects - Apache Infrastructure Website](https://s.apache.org/slack-invite) | 2020-05-12 23:20:48 | +| 35 | Robert Burke | [[Go SDK] Beam Schemas](https://s.apache.org/beam-go-schemas) | 2020-05-14 12:39:46 | +| 36 | Luke Cwik | [Apache Beam Portability API: How to Checkpoint and Split Bundles](https://docs.google.com/document/d/1cKOB9ToasfYs1kLWQgffzvIbJx2Smy4svlodPRhFrk4) | 2020-05-15 10:59:12 | +| 37 | Brittany Hermann | [Beam Summit Status Report](https://docs.google.com/document/d/11PXOBUbeldgPqz6OlTswCal6SxyX76Bb_ZVKBdwsd7o) | 2020-05-21 10:43:21 | +| 38 | Kenneth Knowles and Mark Shields | [Lateness (and Panes) in Apache Beam (incubating)](https://s.apache.org/beam-lateness) | 2020-05-28 11:02:51 | +| 39 | Jan Lukavsky | [@RequiresTimeSortedInput design doc](https://docs.google.com/document/d/1ObLVUFsf1NcG8ZuIZE4aVy2RYKx2FfyMhkZYWPnI9-c) | 2020-06-11 12:10:29 | +| 40 | Brian Hulette | [Abstractions for Schema-Aware IOs](https://s.apache.org/beam-schema-io) | 2020-06-12 16:32:16 | +| 41 | Qihang Zeng | [Beam Match_Recognition Design Documentation](https://s.apache.org/beam-sql-pattern-recognization) | 2020-06-17 08:39:24 | +| 42 | Luke Cwik | [Apache Beam Fn API: How to send and receive data](https://docs.google.com/document/d/1IGduUqmhWDi_69l9nG8kw73HZ5WI5wOps9Tshl5wpQA) | 2020-06-19 14:22:28 | +| 43 | John Mora | [BEAM-9198 BeamSQL aggregation analytics 
functionality - Design Doc](https://docs.google.com/document/d/1tJapdA7ZNwkU0NaK7p-em0XnpHqNE1pKIXw9hVJkIUg) | 2020-06-23 00:24:51 | +| 44 | Siyuan Chen | [GroupIntoBatches with Runner Determined Sharding](https://s.apache.org/sharded-group-into-batches) | 2020-06-26 15:21:54 | +| 45 | Ning Kang | [Apache Beam JupyterLab Side Panel](https://docs.google.com/document/d/1aKK8TzSrl8WiG0K4v9xZEfLMCinuGqRlMOyb7xOhgy4) | 2020-07-09 19:35:58 | +| 46 | Etta Rapp | [Azure Filesystem for Beam Java SDK](http://s.apache.org/beam-azfs-java) | 2020-07-16 15:29:17 | +| 47 | Abhishek Yadav | [Contextual TextIO](https://s.apache.org/beam-contextual-io) | 2020-07-18 01:48:54 | +| 48 | Scott Lukas | [Abstractions for Schema-Aware IOs](https://docs.google.com/document/d/1ic3P8EVGHIydHQ-VMDKbN9kEdwm7sBXMo80VrhwksvI) | 2020-07-20 15:31:46 | +| 49 | Jiadai Xia | [[BEAM-4379] Making ParquetIO Splittable](https://docs.google.com/document/d/1xqN7qsV3vsrqd6i18isAA1HTWmTMXK-u4sle-brXxuw) | 2020-07-30 19:21:31 | +| 50 | Gris Cuevas | [[Public] State of Apache Beam (Roadmap)](https://docs.google.com/document/d/1qvjnpSj_5MXBmhPI70BJYdrUPVXZR9nAFGf6zqNigek) | 2020-08-17 17:54:13 | +| 51 | Reuven Lax | [Beam Proposal: Pipeline Drain](https://docs.google.com/document/d/1NExwHlj-2q2WUGhSO4jTu8XGhDPmm3cllSN8IMmWci8) | 2020-08-21 03:32:14 | +| 52 | Gris Cuevas | [[Public PRD] Beam Website 2.0](https://s.apache.org/beam-site-revamp) | 2020-09-09 14:14:18 | +| 53 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2020](https://docs.google.com/document/d/1Z2pKu6NYdpYAka9IfTkEiAMOVubH1jjkkRQMaVJbs5Y) | 2020-09-09 14:30:47 | +| 54 | Luke Cwik | [Apache Beam Fn Api Overview](https://s.apache.org/beam-fn-api) | 2020-09-25 17:42:26 | +| 55 | Boyuan Zhang | [Self-checkpoint Support on Portable Flink](https://docs.google.com/document/d/1372B7HYxtcUYjZOnOM7OBTfSJ4CyFg_gaPD_NUxWClo) | 2020-10-05 14:11:47 | +| 56 | Luke Cwik | [SplittableDoFn 2020 Blog](https://docs.google.com/document/d/1kpn0RxqZaoacUPVSMYhhnfmlo8fGT-p50fEblaFr2HE) | 2020-10-06 16:37:00 | +| 57 | Brittany Hermann | [10-30-20 Beam Community Update](https://docs.google.com/document/d/1_t6xKoOQVwgn2edmRVh1ViudmbnNM3BwZyguKAwwjfA) | 2020-10-26 14:06:12 | +| 58 | Sam Rohde | [ToString Transform Design](https://docs.google.com/document/d/1v7iWj0LIum04mYwRM_Cvze915tATwmEzLrqj_uVBkCE) | 2020-10-27 15:04:18 | +| 59 | Kyle Weaver | [User Defined Metrics API](http://s.apache.org/beam-metrics-api) | 2020-11-19 15:58:26 | +| 60 | Chad Dombrova | [Beam Task Workers](https://docs.google.com/document/d/1GrAvDWwnR1QAmFX7lnNA7I_mQBC2G1V2jE2CZOc6rlw) | 2020-12-02 13:30:44 | +| 61 | Griselda Cuevas | [[Public PRD] Beam Website 2.0](https://docs.google.com/document/d/1rssE8B6fvwLfyODS6KKp049KryhKn8E-HlHkzM8TWkE) | 2020-12-04 19:21:48 | +| 62 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - December 2020](https://s.apache.org/beam-draft-report-2020-12) | 2020-12-07 12:38:42 | +| 63 | Brittany Hermann | [Beam Community Update](https://docs.google.com/document/d/1sA06m_spqgHwI3yGiQ_Z06LXfYqRcgIO1_N8y_a2O6o) | 2020-12-09 14:56:15 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2021.md b/contributor-docs/discussion-docs/2021.md new file mode 100644 index 000000000000..001577cc2309 --- /dev/null +++ b/contributor-docs/discussion-docs/2021.md @@ -0,0 +1,69 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2021 +| No. 
| Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Pablo Estrada | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2021-01-06 02:17:56 | +| 2 | Boyuan Zhang | [KafkaIO Dynamic Read](https://docs.google.com/document/d/1FU3GxVRetHPLVizP3Mdv6mP5tpjZ3fd99qNjUI5DT5k) | 2021-01-07 23:23:17 | +| 3 | Mirac Vuslat Basaran | [Apache Beam Resource Annotations](https://docs.google.com/document/d/1phExeGD1gdDI9M8LK4ZG57UGa7dswpB8Aj6jxWj4uQk) | 2021-01-13 11:26:02 | +| 4 | Chamikara Jayalath | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-2.28.0-burn-down) | 2021-01-25 20:20:28 | +| 5 | Boyuan Zhang | [Portable OrderedListState](https://docs.google.com/document/d/1U77sAvE6Iy9XsVruRYHxPdFji7nqS6HPi1XU8fhyrxs) | 2021-02-03 15:33:44 | +| 6 | Griselda Cuevas | [[Public PRD] Beam Website 2.0](https://docs.google.com/document/d/1rssE8B6fvwLfyODS6KKp049KryhKn8E-HlHkzM8TWkE) | 2021-02-06 00:12:51 | +| 7 | Brittany Hermann | [[Proposal] Digital Beam Summit 2021](https://docs.google.com/document/d/1nqE2DMfw2qsI57qIZzjF76L0W_NacxQFXIoZtDOApVY) | 2021-02-11 16:39:47 | +| 8 | Kenneth Knowles | [Triggering is for Sinks](https://s.apache.org/beam-sink-triggers) | 2021-02-22 14:51:27 | +| 9 | Brittany Hermann | [Beam Community Update](https://docs.google.com/document/d/1sA06m_spqgHwI3yGiQ_Z06LXfYqRcgIO1_N8y_a2O6o) | 2021-03-05 15:09:54 | +| 10 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - March 2021](https://s.apache.org/beam-draft-report-2021-03) | 2021-03-09 11:50:15 | +| 11 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - June 2021](https://s.apache.org/beam-draft-report-2021-06) | 2021-03-11 16:51:17 | +| 12 | Tomo Suzuki | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2021-03-15 10:17:37 | +| 13 | Robert Burke | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing) | 2021-03-30 20:21:44 | +| 14 | Kenneth Knowles | [Cassandra 4.x upgrade](https://docs.google.com/document/d/1qsNksUJ_a6PL623iBZ-3QQDkFae81IKbqVQl1chsOfU) | 2021-04-15 13:28:38 | +| 15 | Brian Hulette | [A Pandas-compatible dataframe API for Beam](https://s.apache.org/beam-dataframes) | 2021-04-15 17:40:44 | +| 16 | Rui Wang | [BEAM-9198 BeamSQL aggregation analytics functionality - Design Doc](https://docs.google.com/document/d/1tJapdA7ZNwkU0NaK7p-em0XnpHqNE1pKIXw9hVJkIUg) | 2021-05-12 14:27:21 | +| 17 | Luke Cwik | [Parallel querying in JdbcIO](https://docs.google.com/document/d/1wBzVhQEhTK23ALzTSZ_CVouEOXTm3w2-LjmO3ieUvFc) | 2021-06-08 16:21:51 | +| 18 | Alexey Romanenko | [Issue Navigator - ASF JIRA](https://s.apache.org/beam-starter-tasks) | 2021-06-14 07:00:11 | +| 19 | Robert Bradshaw | [RFC: Apache Beam Go SDK design](https://s.apache.org/beam-go-sdk-design-rfc) | 2021-06-15 14:10:46 | +| 20 | Kiley Sok | [Portable MultimapUserState](https://docs.google.com/document/d/1tpvLAsl7cb4WTgIKRtMF-UVg_TW_WNc73uCn9rmDAoQ) | 2021-06-28 16:38:14 | +| 21 | Kyle Weaver | [Projection Pushdown in SchemaIO](https://docs.google.com/document/d/1M71LrbjfPZwk5C0IpI13Su63UWSBFKYjSrz9SsyBM_Y) | 2021-07-12 19:19:44 | +| 22 | N/A | [Portable Runner Tests Results](https://docs.google.com/document/d/1Pqd0-vuYHSjLr6yQvfGwiK3NcYypT-WrHfjCbP-Xob4) | 2021-07-14 16:27:03 | +| 23 | Luke Cwik | [JmsIO auto scaling feature](https://docs.google.com/document/d/1LMPpMpn9DByQcyCdRrTMBa-8ltByO4aqNYY4D9h3m18) | 2021-07-19 11:19:15 | 
+| 24 | Chamikara Jayalath | [Splittable DoFn proposal](https://s.apache.org/splittable-do-fn) | 2021-07-25 14:31:57 | +| 25 | Pablo Estrada | [Apache Beam Fn Api Overview](https://s.apache.org/beam-fn-api) | 2021-07-26 07:29:09 | +| 26 | Chamikara Jayalath | [A simpler way to define and use Java cross-language transforms](https://docs.google.com/document/d/1ECXSWicE31K-vSxdb4qL6UcmovOAWvE-ZHFT3NTM654) | 2021-07-26 22:09:44 | +| 27 | Kyle Weaver | [Projection Pushdown in Beam Java](https://docs.google.com/document/d/1eHSO3aIsAUmiVtfDL-pEFenNBRKt26KkF4dQloMhpBQ) | 2021-08-03 17:30:46 | +| 28 | Zachary Houfek | [SBE Beam Extension](https://docs.google.com/document/d/1YiZR__1EsV9tDFsNUhvbpv2YjeP5cMl1c3Wj7Nb1fWs) | 2021-08-26 11:00:11 | +| 29 | Tianyang Hu | [[Go SDK] SqlTransform API](https://s.apache.org/beam-go-sql-api) | 2021-08-30 14:06:09 | +| 30 | Lukasz Cwik | [Apache Beam Fn API: How to send and receive data](https://docs.google.com/document/d/1IGduUqmhWDi_69l9nG8kw73HZ5WI5wOps9Tshl5wpQA) | 2021-08-31 03:51:23 | +| 31 | Vachan Shetty | [Supporting BigQuery Storage Read API in the Python SDK](https://docs.google.com/document/d/1wIVh7GnWFnf6x9hj972GLxZoLgFhgZQHHGpALyhJLL4) | 2021-09-09 15:54:03 | +| 32 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2021](https://s.apache.org/beam-draft-report-2021-09) | 2021-09-10 15:17:42 | +| 33 | Chamikara Jayalath | [URN convention for cross-language transforms](https://docs.google.com/document/d/1JOHPBNv6x6ziMdwr_96EPSP-Bx7C4IrLU1j267MSCts) | 2021-09-14 18:30:39 | +| 34 | Kenneth Knowles | [User Defined Metrics API](http://s.apache.org/beam-metrics-api) | 2021-09-17 11:39:31 | +| 35 | Pablo Estrada | [Defining and documenting key-ordering semantics in Beam runners](https://docs.google.com/document/d/1_7WRJznXlOtWuVaHl_dpy8OZcx_M8BUmeWVA4G0-wEc) | 2021-09-23 19:30:12 | +| 36 | Valentyn Tymofieiev | [Issues · apache/beam · GitHub](http://s.apache.org/beam-flakes) | 2021-09-27 22:44:16 | +| 37 | Zachary Houfek | [SBE Schema in Beam](https://docs.google.com/document/d/1q2KyBxPQ6V6Gfm50Z74YWofATgVauYu066F0t1JPxRU) | 2021-09-28 12:56:56 | +| 38 | Jan Lukavsky | [@RequiresTimeSortedInput design doc](https://docs.google.com/document/d/1ObLVUFsf1NcG8ZuIZE4aVy2RYKx2FfyMhkZYWPnI9-c) | 2021-09-29 04:35:49 | +| 39 | Heejong Lee | [Artifact Registration for Java Expansion Service](https://docs.google.com/document/d/1Wi8IMHFT54T274ybkbek79s_RGKa2X2zY-LVOR_RMNM) | 2021-10-06 18:40:19 | +| 40 | Ryan Thompson | [Upgrading Pickle Library](https://s.apache.org/beam-picklers) | 2021-10-12 17:58:56 | +| 41 | Ning Kang | [Apache Beam JupyterLab Side Panel](https://docs.google.com/document/d/1aKK8TzSrl8WiG0K4v9xZEfLMCinuGqRlMOyb7xOhgy4) | 2021-10-14 14:38:45 | +| 42 | Daniel Oliveira | [[Cross-Language] Java BigQuery IO Externalization](https://s.apache.org/beam-bigquery-externalization) | 2021-10-18 19:25:22 | +| 43 | Pablo Estrada | [Improvements to JdbcIO’s readWithPartitions](https://docs.google.com/document/d/1Lre4n31eYAZdlz9ZDkomucOxRgyAehcg_ZT3HErQ5p0) | 2021-11-01 13:39:57 | +| 44 | Brian Hulette | [Apache Beam Vendored Dependencies Release Guide](https://docs.google.com/document/d/1ztEoyGkqq9ie5riQxRtMuBu3vb6BUO91mSMn1PU0pDA) | 2021-11-01 18:30:30 | +| 45 | Yichi Zhang | [[Proposal] Optional elements embedding in Fn API process bundle request and response](https://docs.google.com/document/d/14p8Y_n4IY5n9L_I9l5x9lVGgml4ZzdCw645HldndCrw) | 2021-11-03 14:00:10 | +| 46 | Jack McCluskey | [Beam Go MultiMap Side Input 
Doc](https://docs.google.com/document/d/1GyWdszvtJihqEfyu0yhDbYaLnbqDUwaixcbV5sjbWwE) | 2021-11-08 16:58:59 | +| 47 | Valentyn Tymofieiev | [Automated generation of requirement files for Python Docker images](https://s.apache.org/beam-python-image-requirements) | 2021-11-09 16:18:24 | +| 48 | Lara Schmidt | [Schema-Aware Transforms](https://s.apache.org/beam-schema-transform) | 2021-11-09 17:01:11 | +| 49 | Valentyn Tymofieiev | [Python Tips - Apache Beam - Apache Software Foundation](https://s.apache.org/beam-python-dev-wiki) | 2021-11-16 15:01:34 | +| 50 | Brian Hulette | [Batched DoFns](https://s.apache.org/batched-dofns) | 2021-12-15 12:59:22 | +| 51 | Chad Dombrova | [Beam Task Workers](https://docs.google.com/document/d/1GrAvDWwnR1QAmFX7lnNA7I_mQBC2G1V2jE2CZOc6rlw) | 2021-12-17 11:57:03 | +| 52 | Stuart Perks | [Kafka Streams Runner](https://docs.google.com/document/d/1mNqERvvV8oGI_O4tGewH2Kgkq6PQGv3ylmxnaTRBqH8) | 2021-12-27 08:28:31 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2022.md b/contributor-docs/discussion-docs/2022.md new file mode 100644 index 000000000000..87f7947df01a --- /dev/null +++ b/contributor-docs/discussion-docs/2022.md @@ -0,0 +1,89 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2022 +| No. | Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Emily Ye | [Gradle 7 migration (provided -> compileOnly) - Google Sheets](https://docs.google.com/spreadsheets/d/1UpeQtx1PoAgeSmpKxZC9lv3B9G1c7cryW3iICfRtG1o) | 2022-01-11 21:38:34 | +| 2 | Brian Hulette | [Beam SQL Filter/Project push-down](https://docs.google.com/document/d/1-ysD7U7qF3MAmSfkbXZO_5PLJBevAL9bktlLCerd_jE) | 2022-01-13 21:01:36 | +| 3 | Robert Bradshaw | [Schema-Aware PCollections](https://docs.google.com/document/d/1tnG2DPHZYbsomvihIpXruUmQ12pHGK0QIvXS1FOTgRc) | 2022-01-14 14:35:56 | +| 4 | Luke Cwik | [Portable Artifact Staging](https://docs.google.com/document/d/12zNk3O2nhTB8Zmxw5U78qXrvlk5r42X8tqF248IDlpI) | 2022-01-17 13:52:33 | +| 5 | Emily Ye | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2022-01-18 12:08:42 | +| 6 | Kenneth Knowles | [Apache CDAP Connector Design Doc](https://docs.google.com/document/d/1T-bhd0Qk7DBePIfgHEPagYiA1oLP4z5kYEd0S1SOGxQ) | 2022-01-18 16:49:00 | +| 7 | Anand Inguva | [BEAM-4032](https://docs.google.com/document/d/1htXDZwg7WcCB-ZyNuy91qWuccaQ6cjX4lY4l4Cqb8P0) | 2022-01-18 20:33:41 | +| 8 | Brian Hulette | [Beam Task Workers](https://docs.google.com/document/d/1GrAvDWwnR1QAmFX7lnNA7I_mQBC2G1V2jE2CZOc6rlw) | 2022-01-19 14:10:54 | +| 9 | Pranav Bhandari | [FileIO behavior when destination file exists](https://docs.google.com/document/d/10o3iHzecWLcg3PCURa8ARvcKjL_kxO5M7daFCBXhEdo) | 2022-01-21 13:22:03 | +| 10 | Pablo Estrada | [[Beam I/O Standards] API Syntax and Semantics](https://s.apache.org/beam-io-api-standard) | 2022-01-24 11:00:00 | +| 11 | Emily Ye | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2022-01-26 21:07:14 | +| 12 | Danny McCormick | [Unify PipelineOptions Behavior in the Go Sdk](https://docs.google.com/document/d/1AyO5SDEd_DzyOyrz_TkyDLUv19eLsKI97vKM8I7fQ9o) | 2022-02-03 09:59:49 | +| 13 | Jack McCluskey | [Beam Go Native Iterable Side Inputs](https://docs.google.com/document/d/1V7GKjQz_156ZPE0gN86dhbpagsEj6uYf1wEErGxsDOQ) | 2022-02-03 15:41:23 | +| 14 | Robert Bradshaw | [Simplifying Beam Pipelines](https://s.apache.org/no-beam-pipeline) | 
2022-02-04 12:00:01 | +| 15 | Ritesh Ghorse | [Go SDK Debug Capture / Worker Status](https://docs.google.com/document/d/1dMTD5_sKdzLcnoe0ZsQU5Wf9q11uliyYgFnnOZQDzuI) | 2022-02-04 12:32:54 | +| 16 | Pablo Estrada | [A generic Beam IO Sink](https://docs.google.com/document/d/1UIWv6wnD86GYAkeqbVWCG3mx4dTZ9WstUUThPWQmcFM) | 2022-02-04 15:11:07 | +| 17 | John Casey | [Automatically Generate Boilerplate for Cross-Language Transforms](https://docs.google.com/document/d/1SAe9tG3JeKbyWVlJf-oGROHsu_fHk02X_I1j6kfvClM) | 2022-02-07 15:43:00 | +| 18 | Danny McCormick | [Automate Reviewer Assignment](https://docs.google.com/document/d/1FhRPRD6VXkYlLAPhNfZB7y2Yese2FCWBzjx67d3TjBo) | 2022-02-09 17:15:59 | +| 19 | Luke Cwik | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing) | 2022-02-11 13:52:21 | +| 20 | Andy Ye | [RunInference: ML Inference in Beam](https://docs.google.com/document/d/1bVMU7Uo9Nzuu6aXR702j74nhQK4j6J1lkRVVBRySI0g) | 2022-02-11 18:31:31 | +| 21 | Heejong Lee | [Python External Transform Registry](https://s.apache.org/python-external-transform-registry) | 2022-02-11 18:43:57 | +| 22 | N/A | [Migrate Jira to GitHub Issues?](https://docs.google.com/document/d/1_n7gboVbSKPs-CVcHzADgg8qpNL9igiHqUPCmiOslf0) | 2022-02-15 07:54:36 | +| 23 | Daniel Oliveira | [[Investigation] Go Expansion Service Auto-Startup for Dev Environments](https://docs.google.com/document/d/16Yj3oZYAkw7Xc5xDQ88bdTt3BE94vY6833cAtrjbnRg) | 2022-02-18 17:29:00 | +| 24 | Danny McCormick | [Bundle Finalization in the Go Sdk](https://docs.google.com/document/d/1dLylt36oFhsWfyBaqPayYXqYHCICNrSZ6jmr51eqZ4k) | 2022-02-24 10:42:08 | +| 25 | Lara Schmidt | [Schema-Aware Transforms](https://s.apache.org/beam-schema-transform) | 2022-03-01 17:10:27 | +| 26 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - March 2022](https://s.apache.org/beam-draft-report-2022-03) | 2022-03-07 08:36:28 | +| 27 | Danny McCormick | [Why Go Generics Can't Completely Replace the Code Generator (Yet!)](https://docs.google.com/document/d/1imYbBeu2FNJkwPNm6E9GEJkjpHnHscvFoKAE6AISvFA) | 2022-03-10 12:44:07 | +| 28 | Jack McCluskey | [Self-Checkpointing DoFns in the Beam Go SDK](https://docs.google.com/document/d/1_JbzjY9JR07ZK5v7PcZevUfzHPsqwzfV7W6AouNpMPk) | 2022-03-16 13:47:12 | +| 29 | Jack McCluskey | [Fuzz Testing the Beam Go SDK](https://docs.google.com/document/d/1VHe5xFHjsmgOj2upH2k4zT90S5SCPtGYSJNcvPj5fig) | 2022-03-17 16:06:51 | +| 30 | Liam Miller-Cushon | [Apache Beam Vendored Dependencies Release Guide](http://s.apache.org/beam-release-vendored-artifacts) | 2022-03-25 14:03:22 | +| 31 | Danny McCormick | [Watermark Estimation in the Go Sdk](https://docs.google.com/document/d/1DqCYJ-J1YGNelCRIcN5v6BQsZxJB2l5uWBmN4ti--Ew) | 2022-03-30 13:22:43 | +| 32 | Reza Rokni | [[Beam I/O Standards] Documentation](https://s.apache.org/beam-io-api-standard-documentation) | 2022-03-31 18:47:24 | +| 33 | Chamikara Jayalath | [Updated ExternalPythonTransform API](https://docs.google.com/document/d/164rn4otqtQA-8QoLlaoGunUDjLSBBvS2DuR0bn344MQ) | 2022-04-04 16:20:06 | +| 34 | Andy Ye | [RunInference: ML Inference in Beam](https://s.apache.org/inference-sklearn-pytorch) | 2022-04-07 14:59:47 | +| 35 | Robert Burke | [Design Documents - Apache Beam - Apache Software Foundation](https://s.apache.org/beam-design-docs) | 2022-04-08 14:00:03 | +| 36 | Brian Hulette | [Should batch-producing DoFns return Iterables?](https://docs.google.com/document/d/1Xt7T0e_TEJrCCx9DnUBY60zrysRHDw6mPBAcc-e9UHo) | 
2022-04-13 17:47:32 | +| 37 | Ritesh Ghorse | [Truncate SDF in Go SDK](https://docs.google.com/document/d/1C4hoE28Ye0fO6lbpZSb84BbJLNI2N4cYCH1FOhAyFEk) | 2022-04-15 12:46:43 | +| 38 | Andy Ye | [Order-sensitive DataFrame Operations in Beam](https://s.apache.org/order-sensitive-dataframe-operations) | 2022-04-21 14:31:46 | +| 39 | Andy Ye | [Interactive DataFrame Operations in Beam](https://s.apache.org/interactive-dataframe-operations) | 2022-04-21 18:44:34 | +| 40 | John Casey | [Upgrading GCS Bucket Storage to support writing to locked buckets](https://docs.google.com/document/d/1Z4TfvEp0AEtZY2xsAt_Tu1W4ziyhbp8rsdaXNHEfVkY) | 2022-04-22 14:14:33 | +| 41 | Yi Hu | [Make GCS read and write buffers customizable](https://docs.google.com/document/d/1-GCE8xQ73eGr_LZ6fUwIScpx9mL8PLFA8azr3r-UrJ8) | 2022-04-28 11:51:14 | +| 42 | John Casey | [Re-Implementing KafkaIO.withDynamicRead with Watch transform](https://docs.google.com/document/d/1Io49s5LBs29HJyppKG3AlR-gHz5m5PC6CqO0CCoSqLs) | 2022-04-29 13:08:14 | +| 43 | Yi Hu | [Proposal: Slowly Changing Dimensions Support in Beam](https://docs.google.com/document/d/1LDY_CtsOJ8Y_zNv1QtkP6AGFrtzkj1q5EW_gSChOIvg) | 2022-05-18 15:32:24 | +| 44 | Danny McCormick | [Analysis of Broken Jenkins Issue](https://docs.google.com/document/d/10qyUsvB_uVy5jftfTiwohlvN8Qwix5AuadssyoC4JsE) | 2022-06-12 20:25:15 | +| 45 | Andrew Pilloud | [[Public] Beam Java DoFn Performance](https://docs.google.com/document/d/12XkHLcE0HpOS0fs0FekDzh68fMPCEZ5uGCh00kPZf0I) | 2022-06-13 15:22:44 | +| 46 | Mara Ruvalcaba | [Doctor pipeline topics - Google Sheets](https://docs.google.com/spreadsheets/d/1wz-RU092-qLNRcMjYUoLM9ZO2hClWQi9dfRB3ZoLw3c) | 2022-06-15 22:12:20 | +| 47 | Brian Hulette | [Batched DoFns](https://s.apache.org/batched-dofns) | 2022-06-17 17:19:26 | +| 48 | Alex Merose | [Apache Beam on Dask](https://docs.google.com/document/d/1Awj_eNmH-WRSte3bKcCcUlQDiZ5mMKmCO_xV-mHWAak) | 2022-06-21 13:45:31 | +| 49 | Ahmed Abualsaud | [Adding support for Locked Buckets](https://docs.google.com/document/d/11kXzI90KmAyknszSFmtfPcL_GWaVzpt8MojQfifZOoM) | 2022-06-28 11:26:33 | +| 50 | Danny McCormick | [Analysis of Beam's ghprb issues](https://docs.google.com/document/d/15CILeNjNxCnbigSvxNq4eXPj6x6sn5DGdbTdWu55kCI) | 2022-06-30 11:39:52 | +| 51 | Yi Hu | [Merge Workflow Recommendations](https://docs.google.com/document/d/10FlXOo_hL2QYTPhwS8uHSyJbQCzwC3K3C12tFccANA8) | 2022-07-07 12:08:11 | +| 52 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - July 2022](https://s.apache.org/beam-draft-report-2022-07) | 2022-07-08 16:02:32 | +| 53 | Moritz Mack | [Beam JMH performance metrics](https://s.apache.org/nvi9g) | 2022-07-12 10:54:01 | +| 54 | Robert Burke | [State and Timers in the Go SDK](https://docs.google.com/document/d/1rcKa1Z6orDDFr1l8t6NA1eLl6zanQbYAEiAqk39NQUU) | 2022-08-04 18:57:29 | +| 55 | Chamikara Jayalath | [Easy Multi-language via a SchemaTransform-aware Expansion Service](https://s.apache.org/easy-multi-language) | 2022-08-04 21:51:21 | +| 56 | Andy Ye | [Controlling Batching in RunInference](https://docs.google.com/document/d/1l40rOTOEqrQAkto3r_AYq8S_L06dDgoZu-4RLKAE6bo) | 2022-08-12 15:36:18 | +| 57 | Anand Inguva | [RunInference API Testing[public]](https://docs.google.com/document/d/1xmh9D_904H-6X19Mi0-tDACwCCMvP4_MFA9QT0TOym8) | 2022-08-16 09:35:50 | +| 58 | Adalbert Makarovych | [SingleStoreDB Apache Beam IO Connector Design Doc](https://docs.google.com/document/d/1WU-hkoZ93SaGXyOz_UtX0jXzIRl194hCId_IdmEV9jw) | 2022-08-25 10:17:34 | +| 59 | Anand Inguva | 
[RunInference namespace.](https://docs.google.com/document/d/1qf5APv1kw5nbVZIMgqEp1N4XYcb0cJP0KZc9PrUYCO4) | 2022-09-12 13:31:28 | +| 60 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2022](https://s.apache.org/beam-draft-report-2022-09) | 2022-09-13 19:22:01 | +| 61 | Jack McCluskey | [Custom Inference Fn One Pager](https://docs.google.com/document/d/1YYGsF20kminz7j9ifFdCD5WQwVl8aTeCo0cgPjbdFNU) | 2022-09-16 14:45:32 | +| 62 | Pranav Bhandari | [Performance and Cost Benchmarking.docx](https://docs.google.com/document/d/14GatBilwuR4jJGb-ZNpYeuB-KkVmDvEm) | 2022-09-26 12:07:22 | +| 63 | Ning Kang | [Apache Beam JupyterLab Side Panel](https://docs.google.com/document/d/1aKK8TzSrl8WiG0K4v9xZEfLMCinuGqRlMOyb7xOhgy4) | 2022-10-04 18:09:58 | +| 64 | Anand Inguva | [Regression Alerts for Python Performance tests](https://docs.google.com/document/d/1jhcmP405s5IRSB2T5ZlCAuBy8duhQjO0kgRLF0SfSl8) | 2022-11-02 12:32:27 | +| 65 | Ahmed Abualsaud | [Making existing Java transforms discoverable to other SDKs as SchemaTransforms](https://docs.google.com/document/d/1qW9O3VxdGxUM887TdwhD1iH9AdNbpu0_wXbCGvFP0OM) | 2022-11-03 14:25:06 | +| 66 | Austin Bennett | [Jenkins --> GitHub Actions?](https://s.apache.org/beam-jenkins-to-gha) | 2022-11-07 12:08:52 | +| 67 | Chamikara Jayalath | [Schema-Aware Transforms](https://docs.google.com/document/d/1B-pxOjIA8Znl99nDRFEQMfr7VG91MZGfki2BPanjjZA) | 2022-11-15 17:28:27 | +| 68 | Anand Inguva | [Model updates using SideInput](https://docs.google.com/document/d/12j4bDwsIBhMN_8DNT2KGXPol7YS_G-DZFy6fjRsUGOQ) | 2022-11-21 16:25:35 | +| 69 | Danny McCormick | [What labels do we care about?](https://docs.google.com/document/d/1FpaFr_Sdg217ogd5oMDRX4uLIMSatKLF_if9CzLg9tM) | 2022-12-06 12:11:21 | +| 70 | Damon Douglas | [FileWriteSchemaTransformProvider](https://docs.google.com/document/d/1IOZrQ4qQrUS2WwQhadN35vX4AzhG4dyXMk1J-R1qJ9c) | 2022-12-09 18:54:25 | +| 71 | Herman Mak | [[PUBLIC] Beam I/O Standards](https://docs.google.com/document/d/1BCTpSZDUjK90hYZjcn8aAnPd9vuRfj8YU1j3mpSgRwI) | 2022-12-11 09:16:41 | +| 72 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - December 2022](https://s.apache.org/beam-draft-report-2022-12) | 2022-12-14 15:40:26 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2023.md b/contributor-docs/discussion-docs/2023.md new file mode 100644 index 000000000000..a1e04e671585 --- /dev/null +++ b/contributor-docs/discussion-docs/2023.md @@ -0,0 +1,62 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2023 +| No. 
| Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Jack McCluskey | [TensorFlow Hub Model Loading in Apache Beam](https://docs.google.com/document/d/11WpWQ_x3dpgRTvkzKS47lCXA_-PGzUBAIKNJQfLpJTg) | 2023-01-03 10:27:26 | +| 2 | John Casey | [How to Write a Beam IO](https://docs.google.com/document/d/1-WxZTNu9RrLhh5O7Dl5PbnKqz3e5gm1x3gDBBhszVF8) | 2023-01-09 11:57:44 | +| 3 | Kenneth Knowles | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2023-01-09 16:25:33 | +| 4 | Herman Mak | [[Beam I/O Standards] Documentation](https://s.apache.org/beam-io-api-standard-documentation) | 2023-01-11 07:55:33 | +| 5 | Ritesh Ghorse | [TF Model Handler for RunInference](https://docs.google.com/document/d/1c2rWX7fA7UAl2qabzEXg5r7_zwosHfGo6mg1gz8VdSQ) | 2023-01-26 10:42:33 | +| 6 | Becket Qin | [Migrate Flink runner to run batch jobs with DataStream API](https://docs.google.com/document/d/1cjUJHOS1eEkH76hMNeBuc-kPhbIIc9w2gvjm8miIFS8) | 2023-02-01 04:10:32 | +| 7 | Yi Hu | [template-it-to-beam](https://docs.google.com/document/d/11RBh9_Escr8jq93tev2ADF7Wdw4St89PL4ZNrmHyGNI) | 2023-02-01 14:00:48 | +| 8 | Chamikara Jayalath | [A User-deployable Beam Transform Service](https://s.apache.org/beam-transform-service) | 2023-02-06 16:32:56 | +| 9 | Alan Zhang | [Apache Beam Fn API: Fn State API and Bundle Processing](https://s.apache.org/beam-fn-state-api-and-bundle-processing) | 2023-02-23 02:46:32 | +| 10 | Udi Meiri | [Beam Structured Logging](https://s.apache.org/beam-structured-logging) | 2023-03-08 16:59:10 | +| 11 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - March 2023](https://s.apache.org/beam-draft-report-2023-03) | 2023-03-20 17:49:27 | +| 12 | Siddharth Aryan | [Sentimental Analysis Notebook](https://docs.google.com/document/d/1U6zcXAWsDCrWlbf14f5VlLqPZFucwXR48tD7mrERW-g) | 2023-03-23 12:25:28 | +| 13 | Kenneth Knowles | [Triggered Side Inputs: clarifying the spec and move to portability](https://s.apache.org/beam-triggered-side-inputs) | 2023-03-23 15:19:34 | +| 14 | Yi Hu | [Apache Beam Vendored Dependencies Release Guide](https://docs.google.com/document/d/1ztEoyGkqq9ie5riQxRtMuBu3vb6BUO91mSMn1PU0pDA) | 2023-03-29 15:52:02 | +| 15 | Yi Hu | [External: Apache Beam PM: failed installations due to dependency resolution.](https://s.apache.org/beam-python-dependencies-pm) | 2023-04-12 13:19:11 | +| 16 | Kenneth Knowles | [Guard against "Trigger Finishing", a data loss risk](https://s.apache.org/finishing-triggers-drop-data) | 2023-04-14 16:14:48 | +| 17 | Danny McCormick | [Pre/Post Processing and DLQ in RunInference](https://docs.google.com/document/d/1hr1SaWraneB9dYSFyGA99JT44oKgGNhT70wz99lmdEU) | 2023-04-18 13:49:14 | +| 18 | Anand Inguva | [Beam MLTransform](https://docs.google.com/document/d/1rQkSm_8tseLqDQaLohtlCGqt5pvMaP0XIpPi5UD0LCQ) | 2023-05-09 11:26:22 | +| 19 | Jack McCluskey | [Apache Beam Repository Deletion Postmortem](https://s.apache.org/beam-repo-deletion-postmortem) | 2023-05-16 15:39:41 | +| 20 | Jack McCluskey | [Client-Side Throttling](https://docs.google.com/document/d/1ePorJGZnLbNCmLD9mR7iFYOdPsyDA1rDnTpYnbdrzSU) | 2023-05-30 14:42:03 | +| 21 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - June 2023](https://s.apache.org/beam-draft-report-2023-06) | 2023-06-09 17:28:01 | +| 22 | Jarek Potiuk | [ApacheCon 2022 - Data Engineering - YouTube](https://s.apache.org/data-engineering-videos-2022) | 2023-06-16 10:14:54 | +| 23 | Steven van Rossum | [A New 
DoFn](https://s.apache.org/a-new-dofn) | 2023-06-20 21:43:12 | +| 24 | Ritesh Ghorse | [Hugging Face Model Handler](https://docs.google.com/document/d/107T71QHMxvu67PyG-mkHwRhm7W2-07jihnqUMz-yuEM) | 2023-06-22 17:38:13 | +| 25 | Robert Bradshaw | [Apache Beam Yaml Improvements](https://s.apache.org/beam-yaml-pipelines-improvements) | 2023-06-22 18:12:26 | +| 26 | Celeste Zeng | [Add ARM Support to All Beam Container Images](https://docs.google.com/document/d/1ikbEJNsFH1D9HqiMqiVyyMlNpDgSqxXK22nUoetzW6I) | 2023-07-18 17:50:01 | +| 27 | Danny McCormick | [Per Key Inference](https://docs.google.com/document/d/1kj3FyWRbJu1KhViX07Z0Gk0MU0842jhYRhI-DMhhcv4) | 2023-07-20 10:47:33 | +| 28 | Yasha Ravindra | [Throttle Time Counters - One Pager](https://docs.google.com/document/d/1hUufb3L5jURGeFLaQKeQbPlYK-B2wbaLHtqNVETSOOk) | 2023-07-27 16:50:44 | +| 29 | Yi Hu | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2023-08-03 10:44:27 | +| 30 | Danny McCormick | [Contribute to Apache Beam](https://s.apache.org/beam-owners) | 2023-08-08 10:55:01 | +| 31 | Jack McCluskey | [Buffered Logging for Apache Beam Bootloaders](https://s.apache.org/beam-buffered-logging) | 2023-08-16 12:56:54 | +| 32 | Damon Douglas | [Can Beam slow down a PCollection?](https://docs.google.com/document/d/1DFHazCZp7xjgYRsuhD8VBgeWXweU2AQFgzGjTSIw9X4) | 2023-08-23 19:40:59 | +| 33 | Kenneth Knowles | [[PUBLIC] PTransform Design Doc Template](https://s.apache.org/ptransform-design-doc) | 2023-08-24 10:25:18 | +| 34 | Anand Inguva | [Proposal for supporting pyproject](https://docs.google.com/document/d/17-y48WW25-VGBWZNyTdoN0WUN03k9ZhJjLp9wtyG1Wc) | 2023-08-28 10:14:17 | +| 35 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2023](https://s.apache.org/beam-draft-report-2023-09) | 2023-09-12 11:11:49 | +| 36 | Robert Bradshaw | [Beam Yaml Contribution Guide](https://s.apache.org/beam-yaml-contribute) | 2023-09-25 15:12:12 | +| 37 | Kenneth Knowles | [[PUBLIC] Redistribute PTransform Design Doc](https://s.apache.org/beam-reshuffle) | 2023-09-26 11:43:53 | +| 38 | Anand Inguva | [Improving Side Input Performance Through Caching in Apache Beam](https://docs.google.com/document/d/1gllYsIFqKt4TWAxQmXU_-sw7SLnur2Q69d__N0XBMdE) | 2023-10-16 12:26:01 | +| 39 | Joey Tran | [[PUBLIC] PTransform Design Doc Template](https://docs.google.com/document/d/1NpCipgvT6lMgf1nuuPPwZoKp5KsteplFancGqOgy8OY) | 2023-10-19 10:05:52 | +| 40 | Anand Inguva | [Embeddings in MLTransform](https://docs.google.com/document/d/1En4bfbTu4rvu7LWJIKV3G33jO-xJfTdbaSFSURmQw_s) | 2023-10-30 10:00:00 | +| 41 | John Casey | [[BEAM] Adding Dead Letter Queue Functionality to IO Transforms](https://docs.google.com/document/d/1NGeCk6tOqF-TiGEAV7ixd_vhIiWz9sHPlCa1P_77Ajs) | 2023-11-08 15:43:39 | +| 42 | Jack McCluskey | [Beam Python Type Hinting](https://s.apache.org/beam-python-type-hinting-overview) | 2023-11-13 12:02:03 | +| 43 | Ritesh Ghorse | [[PUBLIC] Enrichment PTransform Design Doc](https://s.apache.org/enrichment-transform) | 2023-11-13 14:58:56 | +| 44 | Ahmed Abualsaud | [Generating External Transform Wrappers](https://s.apache.org/autogen-wrappers) | 2023-12-04 12:05:44 | +| 45 | Jack McCluskey | [RunInference Batching Across Bundles](https://docs.google.com/document/d/1Rin_5Vm3qT1Mkb5PcHgTDrjXc3j0Admzi3fEGEHB2-4) | 2023-12-07 15:28:07 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/2024.md b/contributor-docs/discussion-docs/2024.md new file mode 100644 index 
000000000000..baea7c9fc462 --- /dev/null +++ b/contributor-docs/discussion-docs/2024.md @@ -0,0 +1,45 @@ + + +# List Of Documents Submitted To dev@beam.apache.org In 2024 | No. | Author | Subject | Date (UTC) | +|---|---|---|---| +| 1 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - January 2024](https://s.apache.org/beam-draft-report-2024-01) | 2024-01-05 10:16:15 | +| 2 | Yi Hu | [Apache Beam Vendored Dependencies Release Guide](https://docs.google.com/document/d/1ztEoyGkqq9ie5riQxRtMuBu3vb6BUO91mSMn1PU0pDA) | 2024-01-11 11:06:01 | +| 3 | Yi Hu | [Apache Beam Vendored Dependencies Release Guide](https://s.apache.org/beam-release-vendored-artifacts) | 2024-01-19 16:24:01 | +| 4 | Ahmet Altay | [GSoC 2024 Ideas list - Community Development - Apache Software Foundation](https://s.apache.org/gsoc2024ideas) | 2024-01-26 15:12:15 | +| 5 | Kenneth Knowles | [[PUBLIC] Redistribute Allowing Duplicates (PTransform Design Proposal)](https://s.apache.org/beam-reshuffle-allowing-duplicates) | 2024-01-30 16:33:56 | +| 6 | Robert Burke | [Apache Beam Release Acceptance Criteria - Google Sheets](https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw) | 2024-02-02 16:10:03 | +| 7 | Kenneth Knowles | [[PUBLIC] Redistribute PTransform Design Doc](https://s.apache.org/beam-redistribute) | 2024-02-08 13:31:12 | +| 8 | Damon Douglas | [Throttle PTransform](https://s.apache.org/beam-throttle-transform) | 2024-02-20 09:00:00 | +| 9 | N/A | [Design Documents - Apache Beam - Apache Software Foundation](https://s.apache.org/beam-design-docs) | 2024-02-21 03:47:17 | +| 10 | Reeba Qureshi | [Yaml_Features_GSoC_Proposal](https://docs.google.com/document/d/1vXj1qhy0Asiosn3gFDgYVKYQs3Lsyj972klSv5_hfG8) | 2024-03-05 11:38:21 | +| 11 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - March 2024](https://s.apache.org/beam-draft-report-2024-03) | 2024-03-05 12:15:08 | +| 12 | Jeff Kinard | [Callable kwargs for Beam Yaml](https://docs.google.com/document/d/1Zvrl-rxzAWKejHi5F1eaXoLNDzc_Q_-kiE7Grvh2qBA) | 2024-03-12 13:56:06 | +| 13 | Ahmed Abualsaud | [Portable Dynamic Destinations](https://s.apache.org/portable-dynamic-destinations) | 2024-03-27 09:51:14 | +| 14 | Valentyn Tymofieiev | [Beam Structured Logging](https://s.apache.org/beam-structured-logging) | 2024-04-11 13:49:43 | +| 15 | Maciej Szwaja | [Apache Beam Java Record Schema Inference Design Doc](https://docs.google.com/document/d/1zSQ9cnqtVM8ttJEuHBDE6hw4qjUuJy1dpZWB6IBTuOs) | 2024-04-15 11:02:19 | +| 16 | Danny McCormick | [Load N Model Copies in RunInference](https://docs.google.com/document/d/1FmKrBHkb8YTYz_Dcec7JlTqXwy382ar8Gxicr_s13c0) | 2024-04-19 10:28:18 | +| 17 | Kenneth Knowles | [Triggering is for Sinks](https://s.apache.org/beam-sink-triggers) | 2024-04-22 14:40:38 | +| 18 | Danny McCormick | [GSoC Proposal : Implement RAG Pipelines using Beam](https://docs.google.com/document/d/1M_8fvqKVBi68hQo_x1AMQ8iEkzeXTcSl0CwTH00cr80) | 2024-05-01 16:12:23 | +| 19 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - June 2024](https://s.apache.org/beam-draft-report-2024-06) | 2024-05-23 14:57:16 | +| 20 | Jack McCluskey | [Embeddings in MLTransform](https://docs.google.com/document/d/1En4bfbTu4rvu7LWJIKV3G33jO-xJfTdbaSFSURmQw_s) | 2024-05-29 10:26:47 | +| 21 | Bartosz Zabłocki | [[External] Solace IO - Read Connector](https://docs.google.com/document/d/1Gvq67VrcHCnlO8f_NzMM1Y4c7wCNSdvo6qqLWg8upfw) | 2024-05-29 12:00:23 | +| 22 | Danny McCormick | [RunInference
Timeouts](https://docs.google.com/document/d/19ves6iv-m_6DFmePJZqYpLm-bCooPu6wQ-Ti6kAl2Jo) | 2024-08-07 07:11:38 | +| 23 | Jack McCluskey | [BatchElements in Beam Python](https://docs.google.com/document/d/1fOjIjIUH5dxllOGp5Z4ZmpM7BJhAJc2-hNjTnyChvgc) | 2024-08-15 14:56:26 | +| 24 | XQ Hu | [[Public] Beam 3.0: a discussion doc](https://docs.google.com/document/d/13r4NvuvFdysqjCTzMHLuUUXjKTIEY3d7oDNIHT6guww) | 2024-08-19 17:17:26 | +| 25 | Danny McCormick | [Beam Patch Release Process](https://docs.google.com/document/d/1o4UK444hCm1t5KZ9ufEu33e_o400ONAehXUR9A34qc8) | 2024-08-23 04:51:48 | +| 26 | Jack McCluskey | [Beam Python Type Hinting](https://s.apache.org/beam-python-type-hinting-overview) | 2024-08-26 14:16:42 | +| 27 | Ahmed Abualsaud | [Python Multi-language with SchemaTransforms](https://docs.google.com/document/d/1_embA3pGwoYG7sbHaYzAkg3hNxjTughhFCY8ThcoK_Q) | 2024-08-26 19:53:10 | +| 28 | Kenneth Knowles | [DRAFT - Apache Beam Board Report - September 2024](https://s.apache.org/beam-draft-report-2024-09) | 2024-09-11 15:01:55 | \ No newline at end of file diff --git a/contributor-docs/discussion-docs/generate_doc_md.py b/contributor-docs/discussion-docs/generate_doc_md.py new file mode 100644 index 000000000000..4f36802ae0d9 --- /dev/null +++ b/contributor-docs/discussion-docs/generate_doc_md.py @@ -0,0 +1,321 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""This script creates a Markdown file with a list of discussion documents +from dev@beam.apache.org. + +Usage: + +1. Download email archives: The script requires local copies of +the dev@beam.apache.org mbox files for the desired year. +You can download these manually or modify the script to +automate the download process. + +2. Run the script: + ```bash + python generate_doc_md.py YEAR + ``` +3. Output: The script will create a Markdown file named YEAR.md containing +a table of discussion documents with their authors, +subjects, and submission dates. + +Note: +The script currently extracts links to Google Docs and +Apache short links (s.apache.org). Ensure you have the necessary libraries +installed (e.g., requests, bs4, mailbox).
+ +""" + +import os +import re +import requests +import mailbox +import datetime +import sys + +from bs4 import BeautifulSoup +from dataclasses import dataclass + +LIST_NAME = "dev" +DOMAIN = "beam.apache.org" +OUTPUT_DIR = "generated" + + +def download_mbox(list_name, domain, year, month): + """Downloads an mbox file from the Apache mailing list archive.""" + + # Construct the URL + url = f"https://lists.apache.org/api/mbox.lua?list={list_name}&domain={domain}&d={year}-{month:02d}" + + try: + response = requests.get(url, stream=True) + response.raise_for_status() # Raise an exception for bad status codes + + # Create the directory for the archive if it doesn't exist + os.makedirs(OUTPUT_DIR, exist_ok=True) + + # Generate the output filename + output_filename = f"{OUTPUT_DIR}/{list_name}@{domain}_{year}-{month:02d}.mbox" + + with open(output_filename, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + print(f"Downloaded {output_filename}") + + except requests.exceptions.RequestException as e: + print(f"Error downloading archive: {e}") + + +def download_mbox_for_one_year(year): + """Downloads mbox files for each month in a given year.""" + for month in range(1, 13): + download_mbox(LIST_NAME, DOMAIN, year, month) + + +def get_google_doc_title(link): + """Fetches the title of a Google Doc from its link.""" + try: + response = requests.get(link) + response.raise_for_status() # Raise an exception for bad status codes + + soup = BeautifulSoup(response.content, "html.parser") + title = soup.title.string.strip() + return title + except requests.exceptions.RequestException as e: + print(f"Error fetching URL: {e} {link}") + return None + except Exception as e: + print(f"Error extracting title: {e} {link}") + return None + + +def extract_name_re(email_string): + """Extracts the name from an email string using regular expressions.""" + email_string = email_string.replace('"', "") + match = re.match(r"^(.+?) via .+ <.+@.+>$", email_string) + if match: + return match.group(1) + else: + match = re.match(r"^(.+?) <.+@.+>$", email_string) + if match: + return match.group(1) + return email_string + + +def convert_to_timestamp(date_string): + """Converts a date string to a timestamp object.""" + + try: + date_format = "%a, %d %b %Y %H:%M:%S %z" + datetime_obj = datetime.datetime.strptime(date_string, date_format) + return datetime_obj.timestamp() + except: + return None + + +@dataclass +class EmailMessage: + """A data class representing an email message.""" + + sender: str + doc_title: str + doc_url: str + body: str + timestamp: datetime.datetime = None + + +def extract_google_doc_sheet_link(text): + """Extracts Google Docs or Sheets link from text.""" + pattern = r"https?:\/\/docs\.google\.com\/(document|spreadsheets)\/d\/([a-zA-Z0-9-_]+)\/.*" + match = re.search(pattern, text) + if match: + return match.group(0) + else: + return None + + +def extract_s_link(text): + """Extracts Apache short link from text.""" + pattern = r"https?://s\.apache\.org/.*" + match = re.search(pattern, text) + if match: + return match.group(0) + else: + return None + +def extract_google_doc_id(url): + """ + Extracts the unique ID of a Google Doc or Google Sheet from a given URL. + + Args: + url: The URL of the Google Doc or Google Sheet. + + Returns: + The unique ID of the Google Doc or Google Sheet, or None if the ID could not be extracted. 
+ """ + pattern = r"/(document|spreadsheets)/d/([a-zA-Z0-9-_]+)" + match = re.search(pattern, url) + if match: + return match.group(2) + else: + return None + +def standardize_url_link(url): + g_url = extract_google_doc_id(url) + if g_url: + if "spreadsheets" in url: + return f"https://docs.google.com/spreadsheets/d/{g_url}" + else: + return f"https://docs.google.com/document/d/{g_url}" + else: + return url + + +def add_message(messages: list[EmailMessage], new_message: EmailMessage): + """Adds a new message to the list, ensuring unique subjects and keeping the oldest message.""" + + url = new_message.doc_url + for i, message in enumerate(messages): + if message.doc_url == url: + if new_message.timestamp < message.timestamp: + messages[i] = new_message + return + messages.append(new_message) + + +def remove_invalid_characters(string_url): + """Removes invalid characters from a string.""" + + while string_url.endswith(".") or string_url.endswith( + ",") or string_url.endswith("*") or string_url.endswith( + "(") or string_url.endswith(")"): + string_url = string_url[:-1] + + return string_url + + +def find_google_docs_links(mbox_file, doc_messages, doc_urls): + """Filters email messages from an mbox file that contain Google Docs links.""" + + if not os.path.isfile(mbox_file): + print(f"Cannot find the file {mbox_file}") + + mbox = mailbox.mbox(mbox_file) + + for message in mbox: + c = message.get_payload() + # for multipart messages, only use the first part + while isinstance(c, list): + c = c[0].get_payload() + + # assume the message only contain one doc url + doc_url = None + gdoc_url = extract_google_doc_sheet_link(c) + if gdoc_url: + doc_url = gdoc_url.split()[0].split(">")[0] + else: + s_url = extract_s_link(c) + if s_url: + doc_url = s_url.split()[0].split(">")[0] + if doc_url and not doc_url in doc_urls: + doc_url = remove_invalid_characters(doc_url) + doc_url = standardize_url_link(doc_url) + doc_urls.append(doc_url) + title = get_google_doc_title(doc_url) + try: + sender = extract_name_re(str(message["From"])) + except: + print("Something is wrong: ", message["From"]) + sender = None + if not sender: + print("test-------") + print(message["From"]) + doc_time = convert_to_timestamp(message["Date"]) + if title: + title = title.replace("- Google Docs", "").strip() + new_msg = EmailMessage( + doc_title=title, + doc_url=doc_url, + body=c, + sender=sender, + timestamp=doc_time, + ) + add_message(doc_messages, new_msg) + + return doc_messages + + +def sort_emails_by_timestamp(emails: list[EmailMessage]) -> list[EmailMessage]: + """Sorts a list of EmailMessage objects by timestamp from oldest to newest.""" + + return sorted(emails, key=lambda email: email.timestamp or 0) + + +def extract_docs_for_one_year(year): + """Extracts Google Docs links from emails in a given year.""" + + doc_messages = [] + doc_urls = [] + for month in range(1, 13): + # Generate the output filename + output_filename = f"{OUTPUT_DIR}/{LIST_NAME}@{DOMAIN}_{year}-{month:02d}.mbox" + find_google_docs_links(output_filename, doc_messages, doc_urls) + return sort_emails_by_timestamp(doc_messages) + + +def convert_to_md_table(email_messages: list[EmailMessage], year: int): + """Converts a list of EmailMessage objects to a Markdown file with a table.""" + + output_file = f"{year}.md" + with open(output_file, "w") as f: + f.write("""\n\n""") + f.write(f"# List Of Documents Submitted To dev@beam.apache.org In {year}\n") + f.write("| No. 
| Author | Subject | Date (UTC) |\n") + f.write("|---|---|---|---|") + for eid, email in enumerate(email_messages): + if email.timestamp: + datetime_obj = datetime.datetime.fromtimestamp(email.timestamp) + formatted_date = datetime_obj.strftime("%Y-%m-%d %H:%M:%S") + else: + formatted_date = "Unknown" + doc_title = email.doc_title.replace("|", ":") + row_no = f'{eid+1}' + f.write( + f"\n| {row_no} | {email.sender} | [{doc_title}]({email.doc_url}) | {formatted_date} |" + ) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + year = sys.argv[1] + download_mbox_for_one_year(year) + docs = extract_docs_for_one_year(year) + convert_to_md_table(docs, year) + else: + print("Please provide a year as an argument.") diff --git a/examples/notebooks/beam-ml/run_inference_huggingface.ipynb b/examples/notebooks/beam-ml/run_inference_huggingface.ipynb index 71f7e3f0a3fb..2e4556fd310c 100644 --- a/examples/notebooks/beam-ml/run_inference_huggingface.ipynb +++ b/examples/notebooks/beam-ml/run_inference_huggingface.ipynb @@ -109,7 +109,7 @@ "source": [ "!pip install torch --quiet\n", "!pip install tensorflow --quiet\n", - "!pip install transformers==4.30.0 --quiet\n", + "!pip install transformers==4.44.2 --quiet\n", "!pip install apache-beam[gcp]>=2.50 --quiet" ] }, @@ -531,4 +531,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/examples/notebooks/blog/unittests_in_beam.ipynb b/examples/notebooks/blog/unittests_in_beam.ipynb index 43ebad5e5594..da3f39d02959 100644 --- a/examples/notebooks/blog/unittests_in_beam.ipynb +++ b/examples/notebooks/blog/unittests_in_beam.ipynb @@ -4,7 +4,7 @@ "metadata": { "colab": { "provenance": [], - "authorship_tag": "ABX9TyM16129G+tIfKxNIGenSDL1", + "authorship_tag": "ABX9TyNKlk6MKeCAFiaFkcs9pvkB", "include_colab_link": true }, "kernelspec": { @@ -85,126 +85,18 @@ "source": [ "**Example 1**\n", "\n", - "This `DoFn` (and corresponding pipeline) is used to convey a situation in which a `DoFn` makes an API call. Note that an error is raised here if the length of the API response (returned_record) is less than length 10." 
+ "The following example shows how to use the `Map` construct to calculate median house value per bedroom.\n" ], "metadata": { - "id": "Z8__izORM3r8" + "id": "IVjBkewt1sLA" } }, { "cell_type": "code", "source": [ - "# Fake client to simulate an external call\n", - "\n", - "import time\n", - "class Client():\n", - " def get_data(self, api):\n", - " time.sleep(3)\n", - " return [0,1,2,3,4,5,6,7,8,9]\n", - "\n", - "MyApiCall = Client()" - ], - "metadata": { - "id": "GGPF7cY3Ntyj" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#The following packages are used to run the example pipelines\n", - "\n", "import apache_beam as beam\n", "from apache_beam.io import ReadFromText, WriteToText\n", - "from apache_beam.options.pipeline_options import PipelineOptions\n", "\n", - "class MyDoFn(beam.DoFn):\n", - " def process(self,element):\n", - " returned_record = MyApiCall.get_data(\"http://my-api-call.com\")\n", - " if len(returned_record)!=10:\n", - " raise ValueError(\"Length of record does not match expected length\")\n", - " yield returned_record\n", - "\n", - "with beam.Pipeline() as p:\n", - " result = (\n", - " p\n", - " | ReadFromText(\"/content/sample_data/anscombe.json\")\n", - " | beam.ParDo(MyDoFn())\n", - " | WriteToText(\"/content/example1\")\n", - " )" - ], - "metadata": { - "id": "Ktk9EVIFzGfP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**Mocking Example**\n", - "\n", - "The following blocks of code illustrate how we can mock an API response, to test out the error message we've written. Note that we can use mocking to avoid making the actual API call in our test." - ], - "metadata": { - "id": "58GVMyMa2PwE" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install mock # Install the 'mock' module" - ], - "metadata": { - "id": "ESclJ_G-6JcW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# We import the mock package for mocking functionality.\n", - "from unittest.mock import Mock,patch\n", - "# from MyApiCall import get_data\n", - "import mock\n", - "\n", - "\n", - "# MyApiCall is a function that calls get_data to fetch some data via an API call.\n", - "@patch('MyApiCall.get_data')\n", - "def test_error_message_wrong_length(self, mock_get_data):\n", - " response = ['field1','field2']\n", - " mock_get_data.return_value = Mock()\n", - " mock_get_data.return_value.json.return_value=response\n", - "\n", - " input_elements = ['-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000'] #input length 9\n", - " with self.assertRaisesRegex(ValueError,\n", - " \"Length of record does not match expected length'\"):\n", - " p = beam.Pipeline()\n", - " result = p | beam.create(input_elements) | beam.ParDo(MyDoFn())\n", - " result\n" - ], - "metadata": { - "id": "IRuv8s8a2O8F" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**Example 2**\n", - "\n", - "The following example shows how we can use the `Map` construct to calculate median house value per bedroom.\n" - ], - "metadata": { - "id": "IVjBkewt1sLA" - } - }, - { - "cell_type": "code", - "source": [ "# The following code computes the median house value per bedroom\n", "def median_house_value_per_bedroom(element):\n", " # median_house_value is at index 8 and total_bedrooms is at index 4\n", @@ -212,9 +104,9 @@ " return float(element[8])/float(element[4])\n", "\n", "\n", - "with 
beam.Pipeline() as p2:\n", + "with beam.Pipeline() as p1:\n", " result = (\n", - " p2\n", + " p1\n", " | ReadFromText(\"/content/sample_data/california_housing_test.csv\",skip_header_lines=1)\n", " | beam.Map(median_house_value_per_bedroom)\n", " | WriteToText(\"/content/example2\")\n", @@ -229,9 +121,9 @@ { "cell_type": "markdown", "source": [ - "**Example 3**\n", + "**Example 2**\n", "\n", - "The following code is an extension of example 2, but with more complex pipeline logic. Thus, you will see that the `median_house_value_per_bedroom` function is now more complex, and involves writing to various keys." + "The following code is an extension of example 1, but with more complex pipeline logic. The `median_house_value_per_bedroom` function is now more complex, and involves writing to various keys." ], "metadata": { "id": "Mh3nZZ1_12sX" @@ -241,7 +133,7 @@ "cell_type": "code", "source": [ "import random\n", - "# The following code computes the median house value per bedroom\n", + "# The following code computes the median house value per bedroom.\n", "counter=-1 #define a counter globally\n", "\n", "\n", @@ -260,9 +152,9 @@ " return (key,value*10)\n", "\n", "\n", - "with beam.Pipeline() as p3:\n", + "with beam.Pipeline() as p2:\n", " result = (\n", - " p3\n", + " p2\n", " | ReadFromText(\"/content/sample_data/california_housing_test.csv\",skip_header_lines=1)\n", " | beam.Map(median_house_value_per_bedroom)\n", " | beam.Map(multiply_by_factor)\n", @@ -294,14 +186,14 @@ " | beam.Map(multiply_by_factor)\n", " | beam.CombinePerKey(sum))\n", "\n", - "# Define a new class that inherits from beam.PTransform\n", + "# Define a new class that inherits from beam.PTransform.\n", "class MapAndCombineTransform(beam.PTransform):\n", " def expand(self, pcoll):\n", " return transform_data_set(pcoll)\n", "\n", - "with beam.Pipeline() as p3:\n", + "with beam.Pipeline() as p2:\n", " result = (\n", - " p3\n", + " p2\n", " | ReadFromText(\"/content/sample_data/california_housing_test.csv\",skip_header_lines=1)\n", " | MapAndCombineTransform() # Use the new PTransform class\n", " | WriteToText(\"/content/example3\")\n", @@ -316,7 +208,7 @@ { "cell_type": "markdown", "source": [ - "**Unit Test for Pipeline 3**\n", + "**Unit Test for Pipeline 2**\n", "\n", "We've populated some sample records here, as well as set what we're expecting our expected value to be." ], @@ -328,7 +220,6 @@ "cell_type": "code", "source": [ "import unittest\n", - "import apache_beam as beam\n", "from apache_beam.testing.test_pipeline import TestPipeline\n", "from apache_beam.testing.util import assert_that, equal_to\n", "\n", @@ -344,9 +235,9 @@ " '122.05,100.99,24.30,40.5,56.55,42.01,11,35,75.30,92.91',\n", " '-120.05,39.37,29.00,4085.00,681.00,1557.00,626.00,6.8085,364700.00'\n", " ]\n", - " with beam.Pipeline() as p3:\n", + " with beam.Pipeline() as p2:\n", " result = (\n", - " p3\n", + " p2\n", " | beam.Create(input_elements)\n", " | beam.Map(MapAndCombineTransform())\n", " )\n", @@ -357,6 +248,123 @@ }, "execution_count": null, "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Example 3**\n", + "\n", + "This `DoFn` and the corresponding pipeline demonstrate a `DoFn` making an API call. An error occurs if the length of the API response (`returned_record`) is less than the length `10`." 
+ ], + "metadata": { + "id": "Z8__izORM3r8" + } + }, + { + "cell_type": "code", + "source": [ + "# Fake client to simulate an external call\n", + "\n", + "import time\n", + "class Client():\n", + " def get_data(self, api):\n", + " time.sleep(3)\n", + " return [0,1,2,3,4,5,6,7,8,9]\n", + "\n", + "MyApiCall = Client()" + ], + "metadata": { + "id": "GGPF7cY3Ntyj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Note:** The following cell can take about 2 minutes to run" + ], + "metadata": { + "id": "3tGnPucbzmEx" + } + }, + { + "cell_type": "code", + "source": [ + "# The following packages are used to run the example pipelines.\n", + "from apache_beam.options.pipeline_options import PipelineOptions\n", + "\n", + "class MyDoFn(beam.DoFn):\n", + " def process(self,element):\n", + " returned_record = MyApiCall.get_data(\"http://my-api-call.com\")\n", + " if len(returned_record)!=10:\n", + " raise ValueError(\"Length of record does not match expected length\")\n", + " yield returned_record\n", + "\n", + "with beam.Pipeline() as p3:\n", + " result = (\n", + " p3\n", + " | ReadFromText(\"/content/sample_data/anscombe.json\")\n", + " | beam.ParDo(MyDoFn())\n", + " | WriteToText(\"/content/example1\")\n", + " )" + ], + "metadata": { + "id": "Ktk9EVIFzGfP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Mocking Example**\n", + "\n", + "To test the error message, mock an API response, as demonstrated in the following blocks of code. Use mocking to avoid making the actual API call in the test." + ], + "metadata": { + "id": "58GVMyMa2PwE" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install mock # Install the 'mock' module." + ], + "metadata": { + "id": "ESclJ_G-6JcW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Import the mock package for mocking functionality.\n", + "from unittest.mock import Mock,patch\n", + "# from MyApiCall import get_data\n", + "import mock\n", + "\n", + "\n", + "# MyApiCall is a function that calls get_data to fetch some data by using an API call.\n", + "@patch('MyApiCall.get_data')\n", + "def test_error_message_wrong_length(self, mock_get_data):\n", + " response = ['field1','field2']\n", + " mock_get_data.return_value = Mock()\n", + " mock_get_data.return_value.json.return_value=response\n", + "\n", + " input_elements = ['-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000'] #input length 9\n", + " with self.assertRaisesRegex(ValueError,\n", + " \"Length of record does not match expected length'\"):\n", + " p3 = beam.Pipeline()\n", + " result = p3 | beam.create(input_elements) | beam.ParDo(MyDoFn())\n", + " result\n" + ], + "metadata": { + "id": "IRuv8s8a2O8F" + }, + "execution_count": null, + "outputs": [] } ] } diff --git a/learning/tour-of-beam/learning-content/introduction/introduction-concepts/pipeline-concepts/overview-pipeline/description.md b/learning/tour-of-beam/learning-content/introduction/introduction-concepts/pipeline-concepts/overview-pipeline/description.md index 5144f737524f..50955741a9f0 100644 --- a/learning/tour-of-beam/learning-content/introduction/introduction-concepts/pipeline-concepts/overview-pipeline/description.md +++ b/learning/tour-of-beam/learning-content/introduction/introduction-concepts/pipeline-concepts/overview-pipeline/description.md @@ -22,7 +22,7 @@ The Beam SDKs provide several abstractions that simplify the mechanics of 
large- → `PCollection`: A PCollection represents a distributed data set that your Beam pipeline operates on. The data set can be bounded, meaning it comes from a fixed source like a file, or unbounded, meaning it comes from a continuously updating source via a subscription or other mechanism. Your pipeline typically creates an initial PCollection by reading data from an external data source, but you can also create a PCollection from in-memory data within your driver program. From there, PCollections are the inputs and outputs for each step in your pipeline. -→ `PTransform`: A PTransform represents a data processing operation, or a step, in your pipeline. Every PTransform takes one or more PCollection objects as the input, performs a processing function that you provide on the elements of that PCollection, and then produces zero or more output PCollection objects. +→ `PTransform`: A PTransform represents a data processing operation, or a step, in your pipeline. Every PTransform takes zero or more PCollection objects as the input, performs a processing function that you provide on the elements of that PCollection, and then produces zero or more output PCollection objects. {{if (eq .Sdk "go")}} → `Scope`: The Go SDK has an explicit scope variable used to build a `Pipeline`. A Pipeline can return it’s root scope with the `Root()` method. The scope variable is then passed to `PTransform` functions that place them in the `Pipeline` that owns the `Scope`. diff --git a/learning/tour-of-beam/learning-content/introduction/introduction-concepts/runner-concepts/description.md b/learning/tour-of-beam/learning-content/introduction/introduction-concepts/runner-concepts/description.md index 3989f6de6510..6eb1c04e966a 100644 --- a/learning/tour-of-beam/learning-content/introduction/introduction-concepts/runner-concepts/description.md +++ b/learning/tour-of-beam/learning-content/introduction/introduction-concepts/runner-concepts/description.md @@ -15,7 +15,7 @@ limitations under the License. Apache Beam provides a portable API layer for building sophisticated data-parallel processing `pipelines` that may be executed across a diversity of execution engines, or `runners`. The core concepts of this layer are based upon the Beam Model (formerly referred to as the Dataflow Model), and implemented to varying degrees in each Beam `runner`. -### Direct runner +### Direct Runner The Direct Runner executes pipelines on your machine and is designed to validate that pipelines adhere to the Apache Beam model as closely as possible. Instead of focusing on efficient pipeline execution, the Direct Runner performs additional checks to ensure that users do not rely on semantics that are not guaranteed by the model. Some of these checks include: * enforcing immutability of elements @@ -61,9 +61,9 @@ In java, you need to set runner to `args` when you start the program. {{end}} {{if (eq .Sdk "python")}} -In the Python SDK , the default is runner **DirectRunner**. +In the Python SDK , the **DirectRunner** is the default runner and is used if no runner is specified. 
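For illustration, a minimal sketch (assuming only a standard `apache-beam` installation, not code from this patch) of selecting the DirectRunner explicitly in the Python SDK; since it is the default runner, the `runner` option below could be omitted entirely:

```python
# Minimal sketch: run a small pipeline on the DirectRunner.
# Passing runner="DirectRunner" is optional because it is the Python SDK default.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(runner="DirectRunner")

with beam.Pipeline(options=options) as p:
    (
        p
        | beam.Create(["hello", "beam"])
        | beam.Map(str.upper)
        | beam.Map(print)
    )
```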
-Additionally, you can read more about the Direct Runner [here](https://beam.apache.org/documentation/runners/direct/) +You can read more about the **DirectRunner** [here](https://beam.apache.org/documentation/runners/direct/) #### Run example diff --git a/learning/tour-of-beam/learning-content/introduction/introduction-guide/description.md b/learning/tour-of-beam/learning-content/introduction/introduction-guide/description.md index 9b9d7a09827e..a8fb8e750683 100644 --- a/learning/tour-of-beam/learning-content/introduction/introduction-guide/description.md +++ b/learning/tour-of-beam/learning-content/introduction/introduction-guide/description.md @@ -11,12 +11,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> -# Tour of Beam Programming Guide - -Welcome to a Tour Of Beam, a learning guide you can use to familiarize yourself with the Apache Beam. -The tour is divided into a list of modules that contain learning units covering various Apache Beam features and principles. -You can access the full list of modules by clicking ‘<<’ button on the left . For each module, learning progress is displayed next to it. -Throughout the tour, you will find learning materials, examples, exercises and challenges for you to complete. -Learning units are accompanied by code examples that you can review in the upper right pane. You can edit the code, or just run the example by clicking the ‘Run’ button. Output is displayed in the lower right pane. -Each module also contains a challenge based on the material learned. Try to solve as many as you can, and if you need help, just click on the ‘Hint’ button or examine the correct solution by clicking the ‘Solution’ button. -Now let’s start the tour by learning some core Beam principles. \ No newline at end of file +# Welcome to a Tour of Beam + +The Tour of Beam is a learning guide you can use to familiarize yourself with **Apache Beam**. + +The tour is divided into a list of modules that contain learning units covering various Apache Beam features and principles. You can access the full list of modules by clicking the ‘<<’ button on the left. For each module, learning progress is displayed next to it. + +Throughout the tour, you will find: + +- **Learning materials** +- **Examples** +- **Exercises** +- **Challenges** for you to complete + +Learning units are accompanied by code examples that you can review in the upper right pane. You can: + +- **Edit the code** +- **Run the example** + +After running the example, the output will be displayed in the lower right pane. + +Each module also contains a challenge based on the material learned. Try to solve as many as you can, and if you need help, just click on the: + +- **Hint** button +- **Solution** button to examine the correct solution + +Now, let’s start the tour by learning some core Beam principles! \ No newline at end of file diff --git a/playground/README.md b/playground/README.md index eb6ed3619bca..8f4aff1663f2 100644 --- a/playground/README.md +++ b/playground/README.md @@ -41,7 +41,7 @@ build, test, and deploy the frontend and backend services. > - buf > - sbt -1. Install Go 1.20+ +1. 
Install Go 1.23+ **Ubuntu 22.04 and newer:** ```shell diff --git a/playground/backend/containers/go/build.gradle b/playground/backend/containers/go/build.gradle index 39d79103d632..04e86eb53d3f 100644 --- a/playground/backend/containers/go/build.gradle +++ b/playground/backend/containers/go/build.gradle @@ -88,7 +88,7 @@ docker { buildArgs( ['BASE_IMAGE' : project.rootProject.hasProperty(["base-image"]) ? project.rootProject["base-image"] : - "golang:1.20-bullseye", + "golang:1-bullseye", 'SDK_TAG' : project.rootProject.hasProperty(["sdk-tag"]) ? project.rootProject["sdk-tag"] : project.rootProject.sdk_version, 'SDK_TAG_LOCAL': project.rootProject.sdk_version, diff --git a/playground/backend/containers/java/Dockerfile b/playground/backend/containers/java/Dockerfile index 18a37f6f016c..161fd3283f7b 100644 --- a/playground/backend/containers/java/Dockerfile +++ b/playground/backend/containers/java/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ############################################################################### ARG BEAM_VERSION=2.44.0 -FROM golang:1.20-bullseye AS build +FROM golang:1-bullseye AS build ARG BEAM_VERSION ARG GIT_COMMIT="" ARG GIT_TIMESTAMP="0" diff --git a/playground/backend/containers/python/Dockerfile b/playground/backend/containers/python/Dockerfile index fd7d8b7f8958..dc734fe34d2f 100644 --- a/playground/backend/containers/python/Dockerfile +++ b/playground/backend/containers/python/Dockerfile @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -ARG GO_BASE_IMAGE=golang:1.20-bullseye +ARG GO_BASE_IMAGE=golang:1-bullseye ARG SDK_TAG ARG BASE_IMAGE=apache/beam_python3.10_sdk:$SDK_TAG FROM $GO_BASE_IMAGE AS build diff --git a/playground/backend/containers/python/build.gradle b/playground/backend/containers/python/build.gradle index 4a845b516b80..cce805f393bf 100644 --- a/playground/backend/containers/python/build.gradle +++ b/playground/backend/containers/python/build.gradle @@ -75,7 +75,7 @@ docker { buildArgs( ['GO_BASE_IMAGE': project.rootProject.hasProperty(["go-base-image"]) ? project.rootProject["go-base-image"] : - "golang:1.20-bullseye", + "golang:1-bullseye", 'SDK_TAG' : project.rootProject.hasProperty(["sdk-tag"]) ? project.rootProject["sdk-tag"] : default_beam_version, 'GIT_COMMIT' : getGitCommitHash(), diff --git a/playground/backend/containers/router/Dockerfile b/playground/backend/containers/router/Dockerfile index 57717d09f8cb..863461013a45 100644 --- a/playground/backend/containers/router/Dockerfile +++ b/playground/backend/containers/router/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ############################################################################### #Dokerfile to set up the Beam Go SDK -ARG BASE_IMAGE=golang:1.20-bullseye +ARG BASE_IMAGE=golang:1-bullseye #Two-stage assembly FROM $BASE_IMAGE AS build ARG GIT_COMMIT="" diff --git a/playground/backend/containers/router/build.gradle b/playground/backend/containers/router/build.gradle index 49ae2dc5d401..48dcda7a6899 100644 --- a/playground/backend/containers/router/build.gradle +++ b/playground/backend/containers/router/build.gradle @@ -70,7 +70,7 @@ docker { tags containerImageTags() buildArgs(['BASE_IMAGE' : project.rootProject.hasProperty(["base-image"]) ? 
project.rootProject["base-image"] : - "golang:1.20-bullseye", + "golang:1-bullseye", 'GIT_COMMIT' : getGitCommitHash(), 'GIT_TIMESTAMP': getGitCommitTimestamp()]) } diff --git a/playground/backend/containers/scio/Dockerfile b/playground/backend/containers/scio/Dockerfile index 8e68404360dc..9c9e0ffa32ed 100644 --- a/playground/backend/containers/scio/Dockerfile +++ b/playground/backend/containers/scio/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ############################################################################### ARG BASE_IMAGE=openjdk:11 -FROM golang:1.20-bullseye AS build +FROM golang:1-bullseye AS build ARG GIT_COMMIT="" ARG GIT_TIMESTAMP="0" diff --git a/playground/backend/playground_functions/Dockerfile b/playground/backend/playground_functions/Dockerfile index a4045c021314..3ee2c2bdc046 100644 --- a/playground/backend/playground_functions/Dockerfile +++ b/playground/backend/playground_functions/Dockerfile @@ -18,7 +18,7 @@ # This Dockerfile is only for local testing -FROM golang:1.20-alpine as build +FROM golang:1-alpine as build COPY . /app WORKDIR /app/playground_functions diff --git a/playground/infrastructure/cloudbuild/playground_ci_examples.sh b/playground/infrastructure/cloudbuild/playground_ci_examples.sh index 962d18a0f475..437cc337faf7 100755 --- a/playground/infrastructure/cloudbuild/playground_ci_examples.sh +++ b/playground/infrastructure/cloudbuild/playground_ci_examples.sh @@ -84,7 +84,7 @@ export STEP=CI export SDK_CONFIG="$BEAM_ROOT_DIR/playground/sdks.yaml" export BEAM_EXAMPLE_CATEGORIES="$BEAM_ROOT_DIR/playground/categories.yaml" export GRADLE_VERSION=7.5.1 -export GO_VERSION=1.20 +export GO_VERSION=1.23 LogOutput "Installing python java8 and dependencies" apt-get update > /dev/null @@ -244,4 +244,4 @@ do exit 1 fi done -exit 0 \ No newline at end of file +exit 0 diff --git a/release/go-licenses/Dockerfile b/release/go-licenses/Dockerfile index 1dfddc6e3745..f37b161ab3ed 100644 --- a/release/go-licenses/Dockerfile +++ b/release/go-licenses/Dockerfile @@ -16,7 +16,7 @@ # limitations under the License. ############################################################################### -FROM golang:1.20-bookworm +FROM golang:1-bookworm WORKDIR /usr/src/app COPY go.mod ./ diff --git a/release/src/main/scripts/run_rc_validation.sh b/release/src/main/scripts/run_rc_validation.sh index 91bfa9e2f8bb..9c93ed4ef4d4 100755 --- a/release/src/main/scripts/run_rc_validation.sh +++ b/release/src/main/scripts/run_rc_validation.sh @@ -99,7 +99,7 @@ HUB_VERSION=2.12.0 HUB_ARTIFACTS_NAME=hub-linux-amd64-${HUB_VERSION} BACKUP_BASHRC=.bashrc_backup_$(date +"%Y%m%d%H%M%S") BACKUP_M2=settings_backup_$(date +"%Y%m%d%H%M%S").xml -declare -a PYTHON_VERSIONS_TO_VALIDATE=("python3.8") +declare -a PYTHON_VERSIONS_TO_VALIDATE=("python3.9") echo "" echo "====================Checking Environment & Variables=================" echo "PLEASE update RC_VALIDATE_CONFIGS in file script.config first." 
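For illustration, a small hypothetical pre-flight check (not an existing script in the repository) that the interpreters named in `PYTHON_VERSIONS_TO_VALIDATE` are installed before release validation runs; it assumes only the Python standard library, and the version list simply mirrors the shell array above:

```python
# Hypothetical helper: verify the interpreters the RC validation expects
# (now python3.9 rather than python3.8) are available on PATH.
import shutil
import sys

PYTHON_VERSIONS_TO_VALIDATE = ["python3.9"]  # mirrors run_rc_validation.sh

missing = [v for v in PYTHON_VERSIONS_TO_VALIDATE if shutil.which(v) is None]
if missing:
    sys.exit(f"Missing required interpreters: {', '.join(missing)}")
print("All required Python interpreters are available.")
```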
diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowStreamingPipelineOptions.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowStreamingPipelineOptions.java index a761d38de1ab..6a0208f1447f 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowStreamingPipelineOptions.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowStreamingPipelineOptions.java @@ -125,17 +125,15 @@ public interface DataflowStreamingPipelineOptions extends PipelineOptions { void setWindmillMessagesBetweenIsReadyChecks(int value); @Description("If true, a most a single active rpc will be used per channel.") - @Default.Boolean(false) - boolean getUseWindmillIsolatedChannels(); + Boolean getUseWindmillIsolatedChannels(); - void setUseWindmillIsolatedChannels(boolean value); + void setUseWindmillIsolatedChannels(Boolean value); @Description( "If true, separate streaming rpcs will be used for heartbeats instead of sharing streams with state reads.") - @Default.Boolean(false) - boolean getUseSeparateWindmillHeartbeatStreams(); + Boolean getUseSeparateWindmillHeartbeatStreams(); - void setUseSeparateWindmillHeartbeatStreams(boolean value); + void setUseSeparateWindmillHeartbeatStreams(Boolean value); @Description("The number of streams to use for GetData requests.") @Default.Integer(1) diff --git a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java index 37c20c61ad8e..01ceac9da585 100644 --- a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java +++ b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java @@ -1255,8 +1255,8 @@ public void testNoStagingLocationAndNoTempLocationFails() { @Test public void testApplySdkEnvironmentOverrides() throws IOException { DataflowPipelineOptions options = buildPipelineOptions(); - String dockerHubPythonContainerUrl = "apache/beam_python3.8_sdk:latest"; - String gcrPythonContainerUrl = "gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest"; + String dockerHubPythonContainerUrl = "apache/beam_python3.9_sdk:latest"; + String gcrPythonContainerUrl = "gcr.io/apache-beam-testing/beam-sdk/beam_python3.9_sdk:latest"; options.setSdkHarnessContainerImageOverrides(".*python.*," + gcrPythonContainerUrl); DataflowRunner runner = DataflowRunner.fromOptions(options); RunnerApi.Pipeline pipeline = @@ -1297,8 +1297,8 @@ public void testApplySdkEnvironmentOverrides() throws IOException { @Test public void testApplySdkEnvironmentOverridesByDefault() throws IOException { DataflowPipelineOptions options = buildPipelineOptions(); - String dockerHubPythonContainerUrl = "apache/beam_python3.8_sdk:latest"; - String gcrPythonContainerUrl = "gcr.io/cloud-dataflow/v1beta3/beam_python3.8_sdk:latest"; + String dockerHubPythonContainerUrl = "apache/beam_python3.9_sdk:latest"; + String gcrPythonContainerUrl = "gcr.io/cloud-dataflow/v1beta3/beam_python3.9_sdk:latest"; DataflowRunner runner = DataflowRunner.fromOptions(options); RunnerApi.Pipeline pipeline = RunnerApi.Pipeline.newBuilder() diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java index 47e36e498507..84f41c473fe0 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java @@ -17,37 +17,38 @@ */ package org.apache.beam.runners.dataflow.worker; -import com.google.auto.value.AutoBuilder; +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.annotations.Internal; /** Keep track of any operational limits required by the backend. */ -public class OperationalLimits { +@AutoValue +@Internal +public abstract class OperationalLimits { + + private static final long DEFAULT_MAX_WORK_ITEM_COMMIT_BYTES = 180 << 20; + // Maximum size of a commit from a single work item. - public final long maxWorkItemCommitBytes; + public abstract long getMaxWorkItemCommitBytes(); // Maximum size of a single output element's serialized key. - public final long maxOutputKeyBytes; + public abstract long getMaxOutputKeyBytes(); // Maximum size of a single output element's serialized value. - public final long maxOutputValueBytes; + public abstract long getMaxOutputValueBytes(); - OperationalLimits(long maxWorkItemCommitBytes, long maxOutputKeyBytes, long maxOutputValueBytes) { - this.maxWorkItemCommitBytes = maxWorkItemCommitBytes; - this.maxOutputKeyBytes = maxOutputKeyBytes; - this.maxOutputValueBytes = maxOutputValueBytes; - } + @AutoValue.Builder + public abstract static class Builder { - @AutoBuilder(ofClass = OperationalLimits.class) - public interface Builder { - Builder setMaxWorkItemCommitBytes(long bytes); + public abstract Builder setMaxWorkItemCommitBytes(long bytes); - Builder setMaxOutputKeyBytes(long bytes); + public abstract Builder setMaxOutputKeyBytes(long bytes); - Builder setMaxOutputValueBytes(long bytes); + public abstract Builder setMaxOutputValueBytes(long bytes); - OperationalLimits build(); + public abstract OperationalLimits build(); } - public static Builder builder() { - return new AutoBuilder_OperationalLimits_Builder() - .setMaxWorkItemCommitBytes(Long.MAX_VALUE) + public static OperationalLimits.Builder builder() { + return new AutoValue_OperationalLimits.Builder() + .setMaxWorkItemCommitBytes(DEFAULT_MAX_WORK_ITEM_COMMIT_BYTES) .setMaxOutputKeyBytes(Long.MAX_VALUE) .setMaxOutputValueBytes(Long.MAX_VALUE); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 1af677382092..ecdba404151e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -17,7 +17,6 @@ */ package org.apache.beam.runners.dataflow.worker; -import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.remoteChannel; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.services.dataflow.model.CounterUpdate; @@ -34,8 +33,6 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import 
java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; import org.apache.beam.runners.core.metrics.MetricsLogger; @@ -49,9 +46,11 @@ import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; import org.apache.beam.runners.dataflow.worker.streaming.WorkHeartbeatResponseProcessor; import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.FixedGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingApplianceComputationConfigFetcher; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEngineComputationConfigFetcher; -import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEnginePipelineConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandleImpl; import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness; import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness.GetWorkSender; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; @@ -63,7 +62,6 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.appliance.JniWindmillApplianceServer; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; @@ -79,10 +77,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCache; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingRemoteStubFactory; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.IsolationChannel; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactoryFactoryImpl; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.processing.StreamingWorkScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.FailureTracker; @@ -100,12 +95,9 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQuerySinkMetrics; import org.apache.beam.sdk.metrics.MetricsEnvironment; import org.apache.beam.sdk.util.construction.CoderTranslation; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheStats; -import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; import org.joda.time.Instant; @@ -148,6 +140,8 @@ public final class StreamingDataflowWorker { private static final int DEFAULT_STATUS_PORT = 8081; private static final Random CLIENT_ID_GENERATOR = new Random(); private static final String CHANNELZ_PATH = "/channelz"; + public static final String STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_HEARTBEAT_POOL = + "streaming_engine_use_job_settings_for_heartbeat_pool"; private final WindmillStateCache stateCache; private final StreamingWorkerStatusPages statusPages; @@ -181,7 +175,6 @@ private StreamingDataflowWorker( WorkFailureProcessor workFailureProcessor, StreamingCounters streamingCounters, MemoryMonitor memoryMonitor, - AtomicReference operationalLimits, GrpcWindmillStreamFactory windmillStreamFactory, Function executorSupplier, ConcurrentMap stageInfoMap) { @@ -237,8 +230,8 @@ private StreamingDataflowWorker( streamingCounters, hotKeyLogger, sampler, - operationalLimits, ID_GENERATOR, + configFetcher.getGlobalConfigHandle(), stageInfoMap); ThrottlingGetDataMetricTracker getDataMetricTracker = @@ -256,12 +249,24 @@ private StreamingDataflowWorker( GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream); getDataClient = new StreamPoolGetDataClient(getDataMetricTracker, getDataStreamPool); - heartbeatSender = - new StreamPoolHeartbeatSender( - options.getUseSeparateWindmillHeartbeatStreams() - ? WindmillStreamPool.create( - 1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream) - : getDataStreamPool); + // Experiment gates the logic till backend changes are rollback safe + if (!DataflowRunner.hasExperiment( + options, STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_HEARTBEAT_POOL) + || options.getUseSeparateWindmillHeartbeatStreams() != null) { + heartbeatSender = + StreamPoolHeartbeatSender.Create( + Boolean.TRUE.equals(options.getUseSeparateWindmillHeartbeatStreams()) + ? separateHeartbeatPool(windmillServer) + : getDataStreamPool); + + } else { + heartbeatSender = + StreamPoolHeartbeatSender.Create( + separateHeartbeatPool(windmillServer), + getDataStreamPool, + configFetcher.getGlobalConfigHandle()); + } + stuckCommitDurationMillis = options.getStuckCommitDurationMillis() > 0 ? 
options.getStuckCommitDurationMillis() : 0; statusPagesBuilder @@ -298,6 +303,7 @@ private StreamingDataflowWorker( .setCurrentActiveCommitBytes(workCommitter::currentActiveCommitBytes) .setGetDataStatusProvider(getDataClient::printHtml) .setWorkUnitExecutor(workUnitExecutor) + .setGlobalConfigHandle(configFetcher.getGlobalConfigHandle()) .build(); Windmill.GetWorkRequest request = @@ -328,6 +334,11 @@ private StreamingDataflowWorker( LOG.debug("LocalWindmillHostport: {}", options.getLocalWindmillHostport()); } + private static WindmillStreamPool separateHeartbeatPool( + WindmillServerStub windmillServer) { + return WindmillStreamPool.create(1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream); + } + public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions options) { long clientId = CLIENT_ID_GENERATOR.nextLong(); MemoryMonitor memoryMonitor = MemoryMonitor.fromOptions(options); @@ -335,8 +346,6 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o StreamingCounters streamingCounters = StreamingCounters.create(); WorkUnitClient dataflowServiceClient = new DataflowWorkUnitClient(options, LOG); BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); - AtomicReference operationalLimits = - new AtomicReference<>(OperationalLimits.builder().build()); WindmillStateCache windmillStateCache = WindmillStateCache.builder() .setSizeMb(options.getWorkerCacheMb()) @@ -354,7 +363,6 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o createConfigFetcherComputationStateCacheAndWindmillClient( options, dataflowServiceClient, - operationalLimits, windmillStreamFactoryBuilder, configFetcher -> ComputationStateCache.create( @@ -412,7 +420,6 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o workFailureProcessor, streamingCounters, memoryMonitor, - operationalLimits, configFetcherComputationStateCacheAndWindmillClient.windmillStreamFactory(), executorSupplier, stageInfo); @@ -428,7 +435,6 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o createConfigFetcherComputationStateCacheAndWindmillClient( DataflowWorkerHarnessOptions options, WorkUnitClient dataflowServiceClient, - AtomicReference operationalLimits, GrpcWindmillStreamFactory.Builder windmillStreamFactoryBuilder, Function computationStateCacheFactory) { ComputationConfig.Fetcher configFetcher; @@ -437,16 +443,11 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o GrpcWindmillStreamFactory windmillStreamFactory; if (options.isEnableStreamingEngine()) { GrpcDispatcherClient dispatcherClient = - GrpcDispatcherClient.create(createStubFactory(options)); + GrpcDispatcherClient.create(options, new WindmillStubFactoryFactoryImpl(options)); configFetcher = StreamingEngineComputationConfigFetcher.create( - options.getGlobalConfigRefreshPeriod().getMillis(), - dataflowServiceClient, - config -> - onPipelineConfig( - config, - dispatcherClient::consumeWindmillDispatcherEndpoints, - operationalLimits::set)); + options.getGlobalConfigRefreshPeriod().getMillis(), dataflowServiceClient); + configFetcher.getGlobalConfigHandle().registerConfigObserver(dispatcherClient::onJobConfig); computationStateCache = computationStateCacheFactory.apply(configFetcher); windmillStreamFactory = windmillStreamFactoryBuilder @@ -468,13 +469,16 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o GrpcWindmillServer.create( options, windmillStreamFactory, - 
GrpcDispatcherClient.create(createStubFactory(options))); + GrpcDispatcherClient.create(options, new WindmillStubFactoryFactoryImpl(options))); } else { windmillStreamFactory = windmillStreamFactoryBuilder.build(); windmillServer = new JniWindmillApplianceServer(options.getLocalWindmillHostport()); } - configFetcher = new StreamingApplianceComputationConfigFetcher(windmillServer::getConfig); + configFetcher = + new StreamingApplianceComputationConfigFetcher( + windmillServer::getConfig, + new FixedGlobalConfigHandle(StreamingGlobalConfig.builder().build())); computationStateCache = computationStateCacheFactory.apply(configFetcher); } @@ -494,10 +498,9 @@ static StreamingDataflowWorker forTesting( HotKeyLogger hotKeyLogger, Supplier clock, Function executorSupplier, - int localRetryTimeoutMs, - OperationalLimits limits) { + StreamingGlobalConfigHandleImpl globalConfigHandle, + int localRetryTimeoutMs) { ConcurrentMap stageInfo = new ConcurrentHashMap<>(); - AtomicReference operationalLimits = new AtomicReference<>(limits); BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); WindmillStateCache stateCache = WindmillStateCache.builder() @@ -510,13 +513,20 @@ static StreamingDataflowWorker forTesting( /* hasReceivedGlobalConfig= */ true, options.getGlobalConfigRefreshPeriod().getMillis(), workUnitClient, - executorSupplier, - config -> - onPipelineConfig( - config, - windmillServer::setWindmillServiceEndpoints, - operationalLimits::set)) - : new StreamingApplianceComputationConfigFetcher(windmillServer::getConfig); + globalConfigHandle, + executorSupplier) + : new StreamingApplianceComputationConfigFetcher( + windmillServer::getConfig, globalConfigHandle); + configFetcher + .getGlobalConfigHandle() + .registerConfigObserver( + config -> { + if (config.windmillServiceEndpoints().isEmpty()) { + LOG.warn("Received empty windmill service endpoints"); + return; + } + windmillServer.setWindmillServiceEndpoints(config.windmillServiceEndpoints()); + }); ConcurrentMap stateNameMap = new ConcurrentHashMap<>(prePopulatedStateNameMappings); ComputationStateCache computationStateCache = @@ -583,7 +593,6 @@ static StreamingDataflowWorker forTesting( workFailureProcessor, streamingCounters, memoryMonitor, - operationalLimits, options.isEnableStreamingEngine() ? 
windmillStreamFactory .setHealthCheckIntervalMillis( @@ -594,23 +603,6 @@ static StreamingDataflowWorker forTesting( stageInfo); } - private static void onPipelineConfig( - StreamingEnginePipelineConfig config, - Consumer> consumeWindmillServiceEndpoints, - Consumer operationalLimits) { - - operationalLimits.accept( - OperationalLimits.builder() - .setMaxWorkItemCommitBytes(config.maxWorkItemCommitBytes()) - .setMaxOutputKeyBytes(config.maxOutputKeyBytes()) - .setMaxOutputValueBytes(config.maxOutputValueBytes()) - .build()); - - if (!config.windmillServiceEndpoints().isEmpty()) { - consumeWindmillServiceEndpoints.accept(config.windmillServiceEndpoints()); - } - } - private static GrpcWindmillStreamFactory.Builder createGrpcwindmillStreamFactoryBuilder( DataflowWorkerHarnessOptions options, long clientId) { Duration maxBackoff = @@ -681,24 +673,6 @@ public static void main(String[] args) throws Exception { worker.start(); } - private static ChannelCachingStubFactory createStubFactory( - DataflowWorkerHarnessOptions workerOptions) { - Function channelFactory = - serviceAddress -> - remoteChannel( - serviceAddress, workerOptions.getWindmillServiceRpcChannelAliveTimeoutSec()); - ChannelCache channelCache = - ChannelCache.create( - serviceAddress -> - // IsolationChannel will create and manage separate RPC channels to the same - // serviceAddress via calling the channelFactory, else just directly return the - // RPC channel. - workerOptions.getUseWindmillIsolatedChannels() - ? IsolationChannel.create(() -> channelFactory.apply(serviceAddress)) - : channelFactory.apply(serviceAddress)); - return ChannelCachingRemoteStubFactory.create(workerOptions.getGcpCredential(), channelCache); - } - private static int chooseMaxThreads(DataflowWorkerHarnessOptions options) { if (options.getNumberOfWorkerHarnessThreads() != 0) { return options.getNumberOfWorkerHarnessThreads(); @@ -855,6 +829,7 @@ private static ConfigFetcherComputationStateCacheAndWindmillClient create( */ @AutoValue abstract static class BackgroundMemoryMonitor { + private static BackgroundMemoryMonitor create(MemoryMonitor memoryMonitor) { return new AutoValue_StreamingDataflowWorker_BackgroundMemoryMonitor( memoryMonitor, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java index f25f6294da86..f10f3b91e7aa 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java @@ -50,6 +50,7 @@ import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.ProfileScope; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInput; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputState; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; @@ -72,7 +73,6 @@ import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.FluentIterable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBasedTable; @@ -107,6 +107,7 @@ public class StreamingModeExecutionContext extends DataflowExecutionContext stateNameMap; private final WindmillStateCache.ForComputation stateCache; private final ReaderCache readerCache; + private final StreamingGlobalConfigHandle globalConfigHandle; private final boolean throwExceptionOnLargeOutput; private volatile long backlogBytes; @@ -127,7 +128,7 @@ public class StreamingModeExecutionContext extends DataflowExecutionContext metricsContainerRegistry, DataflowExecutionStateTracker executionStateTracker, StreamingModeExecutionStateRegistry executionStateRegistry, + StreamingGlobalConfigHandle globalConfigHandle, long sinkByteLimit, boolean throwExceptionOnLargeOutput) { super( @@ -163,6 +165,7 @@ public StreamingModeExecutionContext( sinkByteLimit); this.computationId = computationId; this.readerCache = readerCache; + this.globalConfigHandle = globalConfigHandle; this.sideInputCache = new HashMap<>(); this.stateNameMap = ImmutableMap.copyOf(stateNameMap); this.stateCache = stateCache; @@ -176,11 +179,11 @@ public final long getBacklogBytes() { } public long getMaxOutputKeyBytes() { - return operationalLimits.maxOutputKeyBytes; + return operationalLimits.getMaxOutputKeyBytes(); } public long getMaxOutputValueBytes() { - return operationalLimits.maxOutputValueBytes; + return operationalLimits.getMaxOutputValueBytes(); } public boolean throwExceptionsForLargeOutput() { @@ -196,13 +199,13 @@ public void start( Work work, WindmillStateReader stateReader, SideInputStateFetcher sideInputStateFetcher, - OperationalLimits operationalLimits, Windmill.WorkItemCommitRequest.Builder outputBuilder) { this.key = key; this.work = work; this.computationKey = WindmillComputationKey.create(computationId, work.getShardedKey()); this.sideInputStateFetcher = sideInputStateFetcher; - this.operationalLimits = operationalLimits; + // Snapshot the limits for entire bundle processing. + this.operationalLimits = globalConfigHandle.getConfig().operationalLimits(); this.outputBuilder = outputBuilder; this.sideInputCache.clear(); clearSinkFullHint(); @@ -300,9 +303,9 @@ private SideInput fetchSideInput( return fetchSideInputFromWindmill( view, sideInputWindow, - Preconditions.checkNotNull(stateFamily), + checkNotNull(stateFamily), state, - Preconditions.checkNotNull(scopedReadStateSupplier), + checkNotNull(scopedReadStateSupplier), tagCache); } @@ -325,15 +328,15 @@ private SideInput fetchSideInputFromWindmill( } public Iterable getSideInputNotifications() { - return work.getWorkItem().getGlobalDataIdNotificationsList(); + return getWorkItem().getGlobalDataIdNotificationsList(); } private List getFiredTimers() { - return work.getWorkItem().getTimers().getTimersList(); + return getWorkItem().getTimers().getTimersList(); } public @Nullable ByteString getSerializedKey() { - return work.getWorkItem().getKey(); + return work == null ? 
null : work.getWorkItem().getKey(); } public WindmillComputationKey getComputationKey() { @@ -341,11 +344,15 @@ public WindmillComputationKey getComputationKey() { } public long getWorkToken() { - return work.getWorkItem().getWorkToken(); + return getWorkItem().getWorkToken(); } public Windmill.WorkItem getWorkItem() { - return work.getWorkItem(); + return checkNotNull( + work, + "work is null. A call to StreamingModeExecutionContext.start(...) is required to set" + + " work for execution.") + .getWorkItem(); } public Windmill.WorkItemCommitRequest.Builder getOutputBuilder() { @@ -386,7 +393,7 @@ public void invalidateCache() { public UnboundedSource.@Nullable CheckpointMark getReaderCheckpoint( Coder coder) { try { - ByteString sourceStateState = work.getWorkItem().getSourceState().getState(); + ByteString sourceStateState = getWorkItem().getSourceState().getState(); if (sourceStateState.isEmpty()) { return null; } @@ -733,7 +740,7 @@ public void start( key, stateFamily, stateReader, - work.getWorkItem().getIsNewKey(), + getWorkItem().getIsNewKey(), cacheForKey.forFamily(stateFamily), scopedReadStateSupplier); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java index 8a00194887da..8dc681fc640c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java @@ -24,7 +24,6 @@ import org.apache.beam.runners.core.metrics.ExecutionStateTracker; import org.apache.beam.runners.dataflow.worker.DataflowMapTaskExecutor; import org.apache.beam.runners.dataflow.worker.DataflowWorkExecutor; -import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.StreamingModeExecutionContext; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.util.common.worker.ElementCounter; @@ -73,11 +72,9 @@ public final void executeWork( Work work, WindmillStateReader stateReader, SideInputStateFetcher sideInputStateFetcher, - OperationalLimits operationalLimits, Windmill.WorkItemCommitRequest.Builder outputBuilder) throws Exception { - context() - .start(key, work, stateReader, sideInputStateFetcher, operationalLimits, outputBuilder); + context().start(key, work, stateReader, sideInputStateFetcher, outputBuilder); workExecutor().execute(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/ComputationConfig.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/ComputationConfig.java index fb8bcf7edbfb..9702751aeb98 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/ComputationConfig.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/ComputationConfig.java @@ -48,12 +48,13 @@ public static ComputationConfig create( public abstract ImmutableMap stateNameMap(); /** Interface to fetch configurations for a specific computation. 
*/ - @FunctionalInterface public interface Fetcher { default void start() {} default void stop() {} Optional fetchConfig(String computationId); + + StreamingGlobalConfigHandle getGlobalConfigHandle(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FakeGlobalConfigHandle.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FakeGlobalConfigHandle.java new file mode 100644 index 000000000000..d4d73f5882b1 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FakeGlobalConfigHandle.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import java.util.function.Consumer; +import javax.annotation.Nonnull; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.sdk.annotations.Internal; + +@Internal +@ThreadSafe +/* + * Fake StreamingGlobalConfigHandle used for Tests. Allows setting fake configs. + */ +public class FakeGlobalConfigHandle implements StreamingGlobalConfigHandle { + + private final StreamingGlobalConfigHandleImpl globalConfigHandle; + + public FakeGlobalConfigHandle(StreamingGlobalConfig config) { + this.globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + this.globalConfigHandle.setConfig(config); + } + + @Override + public StreamingGlobalConfig getConfig() { + return globalConfigHandle.getConfig(); + } + + public void setConfig(StreamingGlobalConfig config) { + globalConfigHandle.setConfig(config); + } + + @Override + public void registerConfigObserver(@Nonnull Consumer callback) { + globalConfigHandle.registerConfigObserver(callback); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandle.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandle.java new file mode 100644 index 000000000000..c244ecb8c7a8 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandle.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import java.util.function.Consumer; +import javax.annotation.Nonnull; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.sdk.annotations.Internal; + +@Internal +@ThreadSafe +/* + * StreamingGlobalConfigHandle returning a fixed config + * initialized during construction. Used for Appliance and Tests. + */ +public class FixedGlobalConfigHandle implements StreamingGlobalConfigHandle { + + private final StreamingGlobalConfig config; + + public FixedGlobalConfigHandle(StreamingGlobalConfig config) { + this.config = config; + } + + @Override + public StreamingGlobalConfig getConfig() { + return config; + } + + @Override + public void registerConfigObserver(@Nonnull Consumer callback) { + callback.accept(config); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcher.java index 786ded09498a..025e66be79c1 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcher.java @@ -48,11 +48,14 @@ public final class StreamingApplianceComputationConfigFetcher implements Computa private final ApplianceComputationConfigFetcher applianceComputationConfigFetcher; private final ConcurrentHashMap systemNameToComputationIdMap; + private final StreamingGlobalConfigHandle globalConfigHandle; public StreamingApplianceComputationConfigFetcher( - ApplianceComputationConfigFetcher applianceComputationConfigFetcher) { + ApplianceComputationConfigFetcher applianceComputationConfigFetcher, + StreamingGlobalConfigHandle globalConfigHandle) { this.applianceComputationConfigFetcher = applianceComputationConfigFetcher; this.systemNameToComputationIdMap = new ConcurrentHashMap<>(); + this.globalConfigHandle = globalConfigHandle; } /** Returns a {@code Table} */ @@ -112,6 +115,11 @@ public Optional fetchConfig(String computationId) { .collect(toImmutableMap(NameMapEntry::getUserName, NameMapEntry::getSystemName))); } + @Override + public StreamingGlobalConfigHandle getGlobalConfigHandle() { + return globalConfigHandle; + } + private Optional createComputationConfig( String serializedMapTask, Table transformUserNameToStateFamilyByComputationId, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java index d230aac54c63..22b0dac6eb22 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java @@ -30,16 +30,18 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.StreamSupport; import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.WorkUnitClient; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.sdk.util.BackOffUtils; import org.apache.beam.sdk.util.FluentBackoff; import org.apache.beam.sdk.util.Sleeper; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.InvalidProtocolBufferException; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; @@ -72,33 +74,31 @@ public final class StreamingEngineComputationConfigFetcher implements Computatio private final long globalConfigRefreshPeriodMillis; private final WorkUnitClient dataflowServiceClient; private final ScheduledExecutorService globalConfigRefresher; - private final Consumer onStreamingConfig; + private final StreamingGlobalConfigHandleImpl globalConfigHandle; private final AtomicBoolean hasReceivedGlobalConfig; private StreamingEngineComputationConfigFetcher( boolean hasReceivedGlobalConfig, long globalConfigRefreshPeriodMillis, WorkUnitClient dataflowServiceClient, - ScheduledExecutorService globalConfigRefresher, - Consumer onStreamingConfig) { + StreamingGlobalConfigHandleImpl globalConfigHandle, + ScheduledExecutorService globalConfigRefresher) { this.globalConfigRefreshPeriodMillis = globalConfigRefreshPeriodMillis; this.dataflowServiceClient = dataflowServiceClient; this.globalConfigRefresher = globalConfigRefresher; - this.onStreamingConfig = onStreamingConfig; + this.globalConfigHandle = globalConfigHandle; this.hasReceivedGlobalConfig = new AtomicBoolean(hasReceivedGlobalConfig); } public static StreamingEngineComputationConfigFetcher create( - long globalConfigRefreshPeriodMillis, - WorkUnitClient dataflowServiceClient, - Consumer onStreamingConfig) { + long globalConfigRefreshPeriodMillis, WorkUnitClient dataflowServiceClient) { return new StreamingEngineComputationConfigFetcher( /* hasReceivedGlobalConfig= */ false, globalConfigRefreshPeriodMillis, dataflowServiceClient, + new StreamingGlobalConfigHandleImpl(), Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder().setNameFormat(CONFIG_REFRESHER_THREAD_NAME).build()), - onStreamingConfig); + new ThreadFactoryBuilder().setNameFormat(CONFIG_REFRESHER_THREAD_NAME).build())); } @VisibleForTesting @@ -106,14 +106,14 @@ public static StreamingEngineComputationConfigFetcher forTesting( boolean hasReceivedGlobalConfig, long globalConfigRefreshPeriodMillis, WorkUnitClient dataflowServiceClient, - Function executorSupplier, - Consumer onStreamingConfig) { + 
StreamingGlobalConfigHandleImpl globalConfigHandle, + Function executorSupplier) { return new StreamingEngineComputationConfigFetcher( hasReceivedGlobalConfig, globalConfigRefreshPeriodMillis, dataflowServiceClient, - executorSupplier.apply(CONFIG_REFRESHER_THREAD_NAME), - onStreamingConfig); + globalConfigHandle, + executorSupplier.apply(CONFIG_REFRESHER_THREAD_NAME)); } @VisibleForTesting @@ -157,11 +157,9 @@ private static Optional fetchConfigWithRetry( } } - private StreamingEnginePipelineConfig createPipelineConfig(StreamingConfigTask config) { - StreamingEnginePipelineConfig.Builder pipelineConfig = StreamingEnginePipelineConfig.builder(); - if (config.getUserStepToStateFamilyNameMap() != null) { - pipelineConfig.setUserStepToStateFamilyNameMap(config.getUserStepToStateFamilyNameMap()); - } + private StreamingGlobalConfig createPipelineConfig(StreamingConfigTask config) { + StreamingGlobalConfig.Builder pipelineConfig = StreamingGlobalConfig.builder(); + OperationalLimits.Builder operationalLimits = OperationalLimits.builder(); if (config.getWindmillServiceEndpoint() != null && !config.getWindmillServiceEndpoint().isEmpty()) { @@ -184,23 +182,36 @@ private StreamingEnginePipelineConfig createPipelineConfig(StreamingConfigTask c if (config.getMaxWorkItemCommitBytes() != null && config.getMaxWorkItemCommitBytes() > 0 && config.getMaxWorkItemCommitBytes() <= Integer.MAX_VALUE) { - pipelineConfig.setMaxWorkItemCommitBytes(config.getMaxWorkItemCommitBytes().intValue()); + operationalLimits.setMaxWorkItemCommitBytes(config.getMaxWorkItemCommitBytes().intValue()); } if (config.getOperationalLimits() != null) { if (config.getOperationalLimits().getMaxKeyBytes() != null && config.getOperationalLimits().getMaxKeyBytes() > 0 && config.getOperationalLimits().getMaxKeyBytes() <= Integer.MAX_VALUE) { - pipelineConfig.setMaxOutputKeyBytes(config.getOperationalLimits().getMaxKeyBytes()); + operationalLimits.setMaxOutputKeyBytes(config.getOperationalLimits().getMaxKeyBytes()); } if (config.getOperationalLimits().getMaxProductionOutputBytes() != null && config.getOperationalLimits().getMaxProductionOutputBytes() > 0 && config.getOperationalLimits().getMaxProductionOutputBytes() <= Integer.MAX_VALUE) { - pipelineConfig.setMaxOutputValueBytes( + operationalLimits.setMaxOutputValueBytes( config.getOperationalLimits().getMaxProductionOutputBytes()); } } + pipelineConfig.setOperationalLimits(operationalLimits.build()); + + byte[] settings_bytes = config.decodeUserWorkerRunnerV1Settings(); + if (settings_bytes != null) { + UserWorkerRunnerV1Settings settings = UserWorkerRunnerV1Settings.newBuilder().build(); + try { + settings = UserWorkerRunnerV1Settings.parseFrom(settings_bytes); + } catch (InvalidProtocolBufferException e) { + LOG.error("Parsing UserWorkerRunnerV1Settings failed", e); + } + pipelineConfig.setUserWorkerJobSettings(settings); + } + return pipelineConfig.build(); } @@ -233,6 +244,11 @@ public Optional fetchConfig(String computationId) { .flatMap(StreamingEngineComputationConfigFetcher::createComputationConfig); } + @Override + public StreamingGlobalConfigHandle getGlobalConfigHandle() { + return globalConfigHandle; + } + @Override public void stop() { // We have already shutdown or start has not been called. 
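The hunks above replace the old `Consumer<StreamingEnginePipelineConfig>` callback plumbing with a `StreamingGlobalConfigHandle` owned by the config fetcher: callers either read the latest `StreamingGlobalConfig` on demand or register an observer. As a reading aid only (not part of this change), here is a minimal consumer-side sketch; `LimitsSnapshotExample`, its `main` harness, and the `1024` limit are made up for illustration, while the handle, config, and limits types are the ones introduced in this diff.

```java
import java.util.concurrent.atomic.AtomicReference;
import org.apache.beam.runners.dataflow.worker.OperationalLimits;
import org.apache.beam.runners.dataflow.worker.streaming.config.FixedGlobalConfigHandle;
import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig;
import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle;

/** Illustrative consumer of the new config handle; not part of this change. */
public final class LimitsSnapshotExample {

  public static void main(String[] args) {
    // Appliance and tests get a fixed handle; on Streaming Engine the fetcher owns a
    // StreamingGlobalConfigHandleImpl and refreshes it from the periodic global config request.
    StreamingGlobalConfigHandle handle =
        new FixedGlobalConfigHandle(
            StreamingGlobalConfig.builder()
                .setOperationalLimits(
                    OperationalLimits.builder().setMaxOutputKeyBytes(1024).build())
                .build());

    // Option 1: observe updates. The callback is invoked with the current config when
    // registered (if one is already set) and again whenever a new config is published.
    AtomicReference<OperationalLimits> observedLimits = new AtomicReference<>();
    handle.registerConfigObserver(config -> observedLimits.set(config.operationalLimits()));

    // Option 2: read on demand, e.g. snapshotting the limits once per bundle the way
    // StreamingModeExecutionContext.start(...) now does.
    OperationalLimits perBundleLimits = handle.getConfig().operationalLimits();

    System.out.println(
        "observed maxOutputKeyBytes=" + observedLimits.get().getMaxOutputKeyBytes()
            + ", default maxWorkItemCommitBytes=" + perBundleLimits.getMaxWorkItemCommitBytes());
  }
}
```

On the Streaming Engine path the same handle is what `GrpcDispatcherClient.onJobConfig` and the new `jobSettings` status data provider observe, so a single fetch updates Windmill endpoints, operational limits, and the user-worker settings together.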
@@ -259,7 +275,7 @@ public void stop() { @SuppressWarnings("FutureReturnValueIgnored") private void schedulePeriodicGlobalConfigRequests() { globalConfigRefresher.scheduleWithFixedDelay( - () -> fetchGlobalConfig().ifPresent(onStreamingConfig), + () -> fetchGlobalConfig().ifPresent(globalConfigHandle::setConfig), 0, globalConfigRefreshPeriodMillis, TimeUnit.MILLISECONDS); @@ -272,9 +288,9 @@ private void schedulePeriodicGlobalConfigRequests() { private synchronized void fetchInitialPipelineGlobalConfig() { while (!hasReceivedGlobalConfig.get()) { LOG.info("Sending request to get initial global configuration for this worker."); - Optional globalConfig = fetchGlobalConfig(); + Optional globalConfig = fetchGlobalConfig(); if (globalConfig.isPresent()) { - onStreamingConfig.accept(globalConfig.get()); + globalConfigHandle.setConfig(globalConfig.get()); hasReceivedGlobalConfig.set(true); break; } @@ -285,13 +301,14 @@ private synchronized void fetchInitialPipelineGlobalConfig() { LOG.info("Initial global configuration received, harness is now ready"); } - private Optional fetchGlobalConfig() { + private Optional fetchGlobalConfig() { return fetchConfigWithRetry(dataflowServiceClient::getGlobalStreamingConfigWorkItem) .map(config -> createPipelineConfig(config)); } @FunctionalInterface private interface ThrowingFetchWorkItemFn { + Optional fetchWorkItem() throws IOException; } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfig.java similarity index 56% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfig.java index 8f1ff93f6a49..8f76f5ec27af 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfig.java @@ -18,8 +18,8 @@ package org.apache.beam.runners.dataflow.worker.streaming.config; import com.google.auto.value.AutoValue; -import java.util.HashMap; -import java.util.Map; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; @@ -27,41 +27,30 @@ /** Global pipeline config for pipelines running in Streaming Engine mode. 
*/ @AutoValue @Internal -public abstract class StreamingEnginePipelineConfig { +public abstract class StreamingGlobalConfig { - private static final long DEFAULT_MAX_WORK_ITEM_COMMIT_BYTES = 180 << 20; - - public static StreamingEnginePipelineConfig.Builder builder() { - return new AutoValue_StreamingEnginePipelineConfig.Builder() - .setMaxWorkItemCommitBytes(DEFAULT_MAX_WORK_ITEM_COMMIT_BYTES) - .setMaxOutputKeyBytes(Long.MAX_VALUE) - .setMaxOutputValueBytes(Long.MAX_VALUE) - .setUserStepToStateFamilyNameMap(new HashMap<>()) - .setWindmillServiceEndpoints(ImmutableSet.of()); + public static StreamingGlobalConfig.Builder builder() { + return new AutoValue_StreamingGlobalConfig.Builder() + .setWindmillServiceEndpoints(ImmutableSet.of()) + .setUserWorkerJobSettings(UserWorkerRunnerV1Settings.newBuilder().build()) + .setOperationalLimits(OperationalLimits.builder().build()); } - public abstract long maxWorkItemCommitBytes(); - - public abstract long maxOutputKeyBytes(); - - public abstract long maxOutputValueBytes(); - - public abstract Map userStepToStateFamilyNameMap(); + public abstract OperationalLimits operationalLimits(); public abstract ImmutableSet windmillServiceEndpoints(); + public abstract UserWorkerRunnerV1Settings userWorkerJobSettings(); + @AutoValue.Builder public abstract static class Builder { - public abstract Builder setMaxWorkItemCommitBytes(long value); - - public abstract Builder setMaxOutputKeyBytes(long value); - public abstract Builder setMaxOutputValueBytes(long value); + public abstract Builder setWindmillServiceEndpoints(ImmutableSet value); - public abstract Builder setUserStepToStateFamilyNameMap(Map value); + public abstract Builder setOperationalLimits(OperationalLimits operationalLimits); - public abstract Builder setWindmillServiceEndpoints(ImmutableSet value); + public abstract Builder setUserWorkerJobSettings(UserWorkerRunnerV1Settings settings); - public abstract StreamingEnginePipelineConfig build(); + public abstract StreamingGlobalConfig build(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandle.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandle.java new file mode 100644 index 000000000000..6f75ba887473 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandle.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import java.util.function.Consumer; +import javax.annotation.Nonnull; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.sdk.annotations.Internal; + +@Internal +@ThreadSafe +public interface StreamingGlobalConfigHandle { + + /** Returns the latest StreamingGlobalConfig */ + StreamingGlobalConfig getConfig(); + + /** + * Subscribe to config updates by registering a callback. Callback should be called the first time + * with settings, if any. The callback could execute inline before the method returns. + */ + void registerConfigObserver(@Nonnull Consumer callback); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImpl.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImpl.java new file mode 100644 index 000000000000..9ed5c9fcf396 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImpl.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Consumer; +import javax.annotation.Nonnull; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +@ThreadSafe +public class StreamingGlobalConfigHandleImpl implements StreamingGlobalConfigHandle { + + private static final Logger LOG = LoggerFactory.getLogger(StreamingGlobalConfigHandleImpl.class); + + private final AtomicReference streamingEngineConfig = + new AtomicReference<>(); + + private final CopyOnWriteArrayList configCallbacks = new CopyOnWriteArrayList<>(); + + @Override + @Nonnull + public StreamingGlobalConfig getConfig() { + Preconditions.checkState( + streamingEngineConfig.get() != null, + "Global config should be set before any processing is done"); + return streamingEngineConfig.get(); + } + + @Override + public void registerConfigObserver(@Nonnull Consumer callback) { + ConfigCallback configCallback = new ConfigCallback(callback); + configCallbacks.add(configCallback); + if (streamingEngineConfig.get() != null) { + configCallback.run(); + } + } + + void setConfig(@Nonnull StreamingGlobalConfig config) { + if (config.equals(streamingEngineConfig.get())) { + return; + } + streamingEngineConfig.set(config); + for (ConfigCallback configCallback : configCallbacks) { + configCallback.run(); + } + } + + private class ConfigCallback { + + private final AtomicInteger queuedOrRunning = new AtomicInteger(0); + private final Consumer configConsumer; + + private ConfigCallback(Consumer configConsumer) { + this.configConsumer = configConsumer; + } + + /** + * Runs the passed in callback with the latest config. Overlapping `run()` calls will be + * collapsed into one. If the callback is already running a new call will be scheduled to run + * after the current execution completes, on the same thread which ran the previous run. + */ + private void run() { + // If the callback is already running, + // Increment queued and return. The thread running + // the callback will run it again with the latest config. + if (queuedOrRunning.incrementAndGet() > 1) { + return; + } + // Else run the callback + while (true) { + try { + configConsumer.accept(StreamingGlobalConfigHandleImpl.this.streamingEngineConfig.get()); + } catch (Exception e) { + LOG.error("Exception running GlobalConfig callback", e); + } + if (queuedOrRunning.updateAndGet( + queuedOrRunning -> { + if (queuedOrRunning == 1) { + // If there are no queued requests stop processing. 
+ return 0; + } + // Else, clear queue, set 1 running and run the callback + return 1; + }) + == 0) { + break; + } + } + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusPages.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusPages.java index d305e25af7e5..6981312eff1d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusPages.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusPages.java @@ -28,6 +28,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import java.util.function.Supplier; import javax.servlet.http.HttpServletRequest; @@ -38,6 +39,8 @@ import org.apache.beam.runners.dataflow.worker.status.LastExceptionDataProvider; import org.apache.beam.runners.dataflow.worker.status.WorkerStatusPages; import org.apache.beam.runners.dataflow.worker.streaming.ComputationStateCache; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.ChannelzServlet; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; @@ -77,6 +80,8 @@ public final class StreamingWorkerStatusPages { private final DebugCapture.@Nullable Manager debugCapture; private final @Nullable ChannelzServlet channelzServlet; + private final AtomicReference globalConfig = new AtomicReference<>(); + StreamingWorkerStatusPages( Supplier clock, long clientId, @@ -90,7 +95,8 @@ public final class StreamingWorkerStatusPages { @Nullable GrpcWindmillStreamFactory windmillStreamFactory, Consumer getDataStatusProvider, BoundedQueueExecutor workUnitExecutor, - ScheduledExecutorService statusPageDumper) { + ScheduledExecutorService statusPageDumper, + StreamingGlobalConfigHandle globalConfigHandle) { this.clock = clock; this.clientId = clientId; this.isRunning = isRunning; @@ -104,6 +110,7 @@ public final class StreamingWorkerStatusPages { this.getDataStatusProvider = getDataStatusProvider; this.workUnitExecutor = workUnitExecutor; this.statusPageDumper = statusPageDumper; + globalConfigHandle.registerConfigObserver(globalConfig::set); } public static StreamingWorkerStatusPages.Builder builder() { @@ -150,6 +157,17 @@ private void addStreamingEngineStatusPages() { statusPages.addCapturePage(Preconditions.checkNotNull(channelzServlet)); statusPages.addStatusDataProvider( "streaming", "Streaming RPCs", Preconditions.checkNotNull(windmillStreamFactory)); + statusPages.addStatusDataProvider( + "jobSettings", + "User Worker Job Settings", + writer -> { + @Nullable StreamingGlobalConfig config = globalConfig.get(); + if (config == null) { + writer.println("Job Settings not loaded."); + return; + } + writer.println(config.userWorkerJobSettings().toString()); + }); } private boolean isStreamingEngine() { @@ -256,6 +274,8 @@ public interface Builder { Builder 
setStatusPageDumper(ScheduledExecutorService statusPageDumper); + Builder setGlobalConfigHandle(StreamingGlobalConfigHandle globalConfigHandle); + StreamingWorkerStatusPages build(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java index cf2e7260592d..f96464150d4a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java @@ -26,15 +26,20 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.DataflowRunner; +import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillMetadataServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillMetadataServiceV1Alpha1Grpc.CloudWindmillMetadataServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactoryFactory; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -46,8 +51,10 @@ /** Manages endpoints and stubs for connecting to the Windmill Dispatcher. 
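The StreamingWorkerStatusPages change above illustrates the intended consumption pattern for StreamingGlobalConfigHandle: register an observer once, cache the latest snapshot, and avoid calling getConfig() before a config could have arrived. A minimal sketch of that pattern, assuming only the handle interface introduced in this diff (the JobSettingsSnapshot class itself is hypothetical):

import java.util.concurrent.atomic.AtomicReference;
import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig;
import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle;

final class JobSettingsSnapshot {
  // Latest config pushed through the handle; stays null until the first update.
  private final AtomicReference<StreamingGlobalConfig> latest = new AtomicReference<>();

  JobSettingsSnapshot(StreamingGlobalConfigHandle handle) {
    // The observer fires immediately if a config is already present on the handle,
    // and again after every subsequent update.
    handle.registerConfigObserver(latest::set);
  }

  String render() {
    StreamingGlobalConfig config = latest.get();
    return config == null
        ? "Job Settings not loaded."
        : config.userWorkerJobSettings().toString();
  }
}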
*/ @ThreadSafe public class GrpcDispatcherClient { + private static final Logger LOG = LoggerFactory.getLogger(GrpcDispatcherClient.class); - private final WindmillStubFactory windmillStubFactory; + static final String STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_ISOLATED_CHANNELS = + "streaming_engine_use_job_settings_for_isolated_channels"; private final CountDownLatch onInitializedEndpoints; /** @@ -60,23 +67,49 @@ public class GrpcDispatcherClient { @GuardedBy("this") private final Random rand; + private final WindmillStubFactoryFactory windmillStubFactoryFactory; + + private final AtomicReference windmillStubFactory = new AtomicReference<>(); + + private final AtomicBoolean useIsolatedChannels = new AtomicBoolean(); + private final boolean reactToIsolatedChannelsJobSetting; + private GrpcDispatcherClient( - WindmillStubFactory windmillStubFactory, + DataflowWorkerHarnessOptions options, + WindmillStubFactoryFactory windmillStubFactoryFactory, DispatcherStubs initialDispatcherStubs, Random rand) { - this.windmillStubFactory = windmillStubFactory; + this.windmillStubFactoryFactory = windmillStubFactoryFactory; + if (DataflowRunner.hasExperiment( + options, STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_ISOLATED_CHANNELS)) { + if (options.getUseWindmillIsolatedChannels() != null) { + this.useIsolatedChannels.set(options.getUseWindmillIsolatedChannels()); + this.reactToIsolatedChannelsJobSetting = false; + } else { + this.useIsolatedChannels.set(false); + this.reactToIsolatedChannelsJobSetting = true; + } + } else { + this.useIsolatedChannels.set(Boolean.TRUE.equals(options.getUseWindmillIsolatedChannels())); + this.reactToIsolatedChannelsJobSetting = false; + } + this.windmillStubFactory.set( + windmillStubFactoryFactory.makeWindmillStubFactory(useIsolatedChannels.get())); this.rand = rand; this.dispatcherStubs = new AtomicReference<>(initialDispatcherStubs); this.onInitializedEndpoints = new CountDownLatch(1); } - public static GrpcDispatcherClient create(WindmillStubFactory windmillStubFactory) { - return new GrpcDispatcherClient(windmillStubFactory, DispatcherStubs.empty(), new Random()); + public static GrpcDispatcherClient create( + DataflowWorkerHarnessOptions options, WindmillStubFactoryFactory windmillStubFactoryFactory) { + return new GrpcDispatcherClient( + options, windmillStubFactoryFactory, DispatcherStubs.empty(), new Random()); } @VisibleForTesting public static GrpcDispatcherClient forTesting( - WindmillStubFactory windmillGrpcStubFactory, + DataflowWorkerHarnessOptions options, + WindmillStubFactoryFactory windmillStubFactoryFactory, List windmillServiceStubs, List windmillMetadataServiceStubs, Set dispatcherEndpoints) { @@ -84,7 +117,8 @@ public static GrpcDispatcherClient forTesting( dispatcherEndpoints.size() == windmillServiceStubs.size() && windmillServiceStubs.size() == windmillMetadataServiceStubs.size()); return new GrpcDispatcherClient( - windmillGrpcStubFactory, + options, + windmillStubFactoryFactory, DispatcherStubs.create( dispatcherEndpoints, windmillServiceStubs, windmillMetadataServiceStubs), new Random()); @@ -146,14 +180,36 @@ public boolean hasInitializedEndpoints() { return dispatcherStubs.get().hasInitializedEndpoints(); } + public void onJobConfig(StreamingGlobalConfig config) { + if (config.windmillServiceEndpoints().isEmpty()) { + LOG.warn("Dispatcher client received empty windmill service endpoints from global config"); + return; + } + boolean forceRecreateStubs = false; + if (reactToIsolatedChannelsJobSetting) { + boolean useIsolatedChannels = 
config.userWorkerJobSettings().getUseWindmillIsolatedChannels(); + if (this.useIsolatedChannels.getAndSet(useIsolatedChannels) != useIsolatedChannels) { + windmillStubFactory.set( + windmillStubFactoryFactory.makeWindmillStubFactory(useIsolatedChannels)); + forceRecreateStubs = true; + } + } + consumeWindmillDispatcherEndpoints(config.windmillServiceEndpoints(), forceRecreateStubs); + } + public synchronized void consumeWindmillDispatcherEndpoints( ImmutableSet dispatcherEndpoints) { + consumeWindmillDispatcherEndpoints(dispatcherEndpoints, /*forceRecreateStubs=*/ false); + } + + private synchronized void consumeWindmillDispatcherEndpoints( + ImmutableSet dispatcherEndpoints, boolean forceRecreateStubs) { ImmutableSet currentDispatcherEndpoints = dispatcherStubs.get().dispatcherEndpoints(); Preconditions.checkArgument( dispatcherEndpoints != null && !dispatcherEndpoints.isEmpty(), "Cannot set dispatcher endpoints to nothing."); - if (currentDispatcherEndpoints.equals(dispatcherEndpoints)) { + if (!forceRecreateStubs && currentDispatcherEndpoints.equals(dispatcherEndpoints)) { // The endpoints are equal don't recreate the stubs. return; } @@ -164,7 +220,7 @@ public synchronized void consumeWindmillDispatcherEndpoints( } LOG.info("Initializing Streaming Engine GRPC client for endpoints: {}", dispatcherEndpoints); - dispatcherStubs.set(DispatcherStubs.create(dispatcherEndpoints, windmillStubFactory)); + dispatcherStubs.set(DispatcherStubs.create(dispatcherEndpoints, windmillStubFactory.get())); onInitializedEndpoints.countDown(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java index 1fce4d238b2e..310495982679 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java @@ -53,7 +53,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactoryFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.StreamingEngineThrottleTimers; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; @@ -154,7 +154,7 @@ static GrpcWindmillServer newTestInstance( String name, List experiments, long clientId, - WindmillStubFactory windmillStubFactory) { + WindmillStubFactoryFactory windmillStubFactoryFactory) { ManagedChannel inProcessChannel = inProcessChannel(name); CloudWindmillServiceV1Alpha1Stub stub = CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); @@ -164,16 +164,18 @@ static GrpcWindmillServer newTestInstance( List windmillMetadataServiceStubs = Lists.newArrayList(metadataStub); + DataflowWorkerHarnessOptions testOptions = + testOptions(/* enableStreamingEngine= */ true, experiments); + Set 
dispatcherEndpoints = Sets.newHashSet(HostAndPort.fromHost(name)); GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.forTesting( - windmillStubFactory, + testOptions, + windmillStubFactoryFactory, windmillServiceStubs, windmillMetadataServiceStubs, dispatcherEndpoints); - DataflowWorkerHarnessOptions testOptions = - testOptions(/* enableStreamingEngine= */ true, experiments); boolean sendKeyedGetDataRequests = !testOptions.isEnableStreamingEngine() || DataflowRunner.hasExperiment( @@ -190,7 +192,7 @@ static GrpcWindmillServer newTestInstance( @VisibleForTesting static GrpcWindmillServer newApplianceTestInstance( - Channel channel, WindmillStubFactory windmillStubFactory) { + Channel channel, WindmillStubFactoryFactory windmillStubFactoryFactory) { DataflowWorkerHarnessOptions options = testOptions(/* enableStreamingEngine= */ false, new ArrayList<>()); GrpcWindmillServer testServer = @@ -198,7 +200,7 @@ static GrpcWindmillServer newApplianceTestInstance( options, GrpcWindmillStreamFactory.of(createJobHeader(options, 1)).build(), // No-op, Appliance does not use Dispatcher to call Streaming Engine. - GrpcDispatcherClient.create(windmillStubFactory)); + GrpcDispatcherClient.create(options, windmillStubFactoryFactory)); testServer.syncApplianceStub = createWindmillApplianceStubWithDeadlineInterceptor(channel); return testServer; } diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactory.java similarity index 69% rename from runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactory.java index 89f537e4f812..f7dd9a22b996 100644 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactory.java @@ -15,13 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.prism; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs; -import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.annotations.Internal; -/** Listens for {@link PipelineResult.State} changes reported by the {@link StateWatcher}. */ -interface StateListener { - - /** Callback invoked when {@link StateWatcher} discovers a {@link PipelineResult.State} change. 
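Back in the GrpcDispatcherClient constructor above, the isolated-channels behaviour is derived from two inputs: the streaming_engine_use_job_settings_for_isolated_channels experiment and the nullable getUseWindmillIsolatedChannels() pipeline option. A condensed sketch of that resolution, reusing the option and experiment APIs that appear in the diff (the helper class is illustrative, not part of the change):

import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions;

final class IsolatedChannelsDecision {
  final boolean initialValue;      // starting value for useIsolatedChannels
  final boolean reactToJobSetting; // whether onJobConfig(...) may flip it later

  private IsolatedChannelsDecision(boolean initialValue, boolean reactToJobSetting) {
    this.initialValue = initialValue;
    this.reactToJobSetting = reactToJobSetting;
  }

  static IsolatedChannelsDecision from(DataflowWorkerHarnessOptions options) {
    boolean experimentOn =
        DataflowRunner.hasExperiment(
            options, "streaming_engine_use_job_settings_for_isolated_channels");
    Boolean explicitOption = options.getUseWindmillIsolatedChannels();
    if (experimentOn && explicitOption == null) {
      // Experiment on and no explicit option: start disabled and let the
      // user-worker job setting drive the value via onJobConfig(...).
      return new IsolatedChannelsDecision(false, true);
    }
    // Otherwise the pipeline option (defaulting to false) wins and later job
    // settings never change the channel mode.
    return new IsolatedChannelsDecision(Boolean.TRUE.equals(explicitOption), false);
  }
}

When reactToJobSetting is true, a flipped setting also rebuilds the dispatcher stubs against a freshly created WindmillStubFactory, which is why consumeWindmillDispatcherEndpoints gains the forceRecreateStubs parameter above.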
*/ - void onStateChanged(PipelineResult.State state); +@Internal +public interface WindmillStubFactoryFactory { + WindmillStubFactory makeWindmillStubFactory(boolean useIsolatedChannels); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactoryImpl.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactoryImpl.java new file mode 100644 index 000000000000..f6ffb9c14519 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactoryFactoryImpl.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs; + +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.remoteChannel; + +import com.google.auth.Credentials; +import java.util.function.Function; +import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; + +public class WindmillStubFactoryFactoryImpl implements WindmillStubFactoryFactory { + + private final int windmillServiceRpcChannelAliveTimeoutSec; + private final Credentials gcpCredential; + + public WindmillStubFactoryFactoryImpl(DataflowWorkerHarnessOptions workerOptions) { + this.gcpCredential = workerOptions.getGcpCredential(); + this.windmillServiceRpcChannelAliveTimeoutSec = + workerOptions.getWindmillServiceRpcChannelAliveTimeoutSec(); + } + + @Override + public WindmillStubFactory makeWindmillStubFactory(boolean useIsolatedChannels) { + Function channelFactory = + serviceAddress -> remoteChannel(serviceAddress, windmillServiceRpcChannelAliveTimeoutSec); + ChannelCache channelCache = + ChannelCache.create( + serviceAddress -> + // IsolationChannel will create and manage separate RPC channels to the same + // serviceAddress via calling the channelFactory, else just directly return the + // RPC channel. + useIsolatedChannels + ? 
IsolationChannel.create(() -> channelFactory.apply(serviceAddress)) + : channelFactory.apply(serviceAddress)); + return ChannelCachingRemoteStubFactory.create(gcpCredential, channelCache); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/ComputationWorkExecutorFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/ComputationWorkExecutorFactory.java index 20c1247b2168..d5e0b3a24e2a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/ComputationWorkExecutorFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/ComputationWorkExecutorFactory.java @@ -47,6 +47,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.ComputationWorkExecutor; import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.util.common.worker.MapTaskExecutor; import org.apache.beam.runners.dataflow.worker.util.common.worker.OutputObjectAndByteCounter; import org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation; @@ -94,6 +95,7 @@ final class ComputationWorkExecutorFactory { private final long maxSinkBytes; private final IdGenerator idGenerator; + private final StreamingGlobalConfigHandle globalConfigHandle; private final boolean throwExceptionOnLargeOutput; ComputationWorkExecutorFactory( @@ -103,12 +105,14 @@ final class ComputationWorkExecutorFactory { Function stateCacheFactory, DataflowExecutionStateSampler sampler, CounterSet pendingDeltaCounters, - IdGenerator idGenerator) { + IdGenerator idGenerator, + StreamingGlobalConfigHandle globalConfigHandle) { this.options = options; this.mapTaskExecutorFactory = mapTaskExecutorFactory; this.readerCache = readerCache; this.stateCacheFactory = stateCacheFactory; this.idGenerator = idGenerator; + this.globalConfigHandle = globalConfigHandle; this.readerRegistry = ReaderRegistry.defaultRegistry(); this.sinkRegistry = SinkRegistry.defaultRegistry(); this.sampler = sampler; @@ -262,6 +266,7 @@ private StreamingModeExecutionContext createExecutionContext( stageInfo.metricsContainerRegistry(), executionStateTracker, stageInfo.executionStateRegistry(), + globalConfigHandle, maxSinkBytes, throwExceptionOnLargeOutput); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index 86f2cffe604c..965a29126dc2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -25,7 +25,6 @@ import java.util.Optional; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import 
java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; @@ -33,7 +32,6 @@ import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.DataflowMapTaskExecutorFactory; import org.apache.beam.runners.dataflow.worker.HotKeyLogger; -import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.ReaderCache; import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; import org.apache.beam.runners.dataflow.worker.logging.DataflowWorkerLoggingMDC; @@ -44,6 +42,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcherFactory; @@ -85,7 +84,7 @@ public final class StreamingWorkScheduler { private final HotKeyLogger hotKeyLogger; private final ConcurrentMap stageInfoMap; private final DataflowExecutionStateSampler sampler; - private final AtomicReference operationalLimits; + private final StreamingGlobalConfigHandle globalConfigHandle; public StreamingWorkScheduler( DataflowWorkerHarnessOptions options, @@ -99,7 +98,7 @@ public StreamingWorkScheduler( HotKeyLogger hotKeyLogger, ConcurrentMap stageInfoMap, DataflowExecutionStateSampler sampler, - AtomicReference operationalLimits) { + StreamingGlobalConfigHandle globalConfigHandle) { this.options = options; this.clock = clock; this.computationWorkExecutorFactory = computationWorkExecutorFactory; @@ -111,7 +110,7 @@ public StreamingWorkScheduler( this.hotKeyLogger = hotKeyLogger; this.stageInfoMap = stageInfoMap; this.sampler = sampler; - this.operationalLimits = operationalLimits; + this.globalConfigHandle = globalConfigHandle; } public static StreamingWorkScheduler create( @@ -126,8 +125,8 @@ public static StreamingWorkScheduler create( StreamingCounters streamingCounters, HotKeyLogger hotKeyLogger, DataflowExecutionStateSampler sampler, - AtomicReference operationalLimits, IdGenerator idGenerator, + StreamingGlobalConfigHandle globalConfigHandle, ConcurrentMap stageInfoMap) { ComputationWorkExecutorFactory computationWorkExecutorFactory = new ComputationWorkExecutorFactory( @@ -137,7 +136,8 @@ public static StreamingWorkScheduler create( stateCacheFactory, sampler, streamingCounters.pendingDeltaCounters(), - idGenerator); + idGenerator, + globalConfigHandle); return new StreamingWorkScheduler( options, @@ -151,7 +151,7 @@ public static StreamingWorkScheduler create( hotKeyLogger, stageInfoMap, sampler, - operationalLimits); + globalConfigHandle); } private static long computeShuffleBytesRead(Windmill.WorkItem workItem) { @@ -295,7 +295,7 @@ private Windmill.WorkItemCommitRequest validateCommitRequestSize( Windmill.WorkItemCommitRequest commitRequest, String computationId, Windmill.WorkItem workItem) { - long byteLimit = operationalLimits.get().maxWorkItemCommitBytes; + long byteLimit = globalConfigHandle.getConfig().operationalLimits().getMaxWorkItemCommitBytes(); int commitSize = commitRequest.getSerializedSize(); int estimatedCommitSize = commitSize < 0 ? 
Integer.MAX_VALUE : commitSize; @@ -380,12 +380,7 @@ private ExecuteWorkResult executeWork( // Blocks while executing work. computationWorkExecutor.executeWork( - executionKey, - work, - stateReader, - localSideInputStateFetcher, - operationalLimits.get(), - outputBuilder); + executionKey, work, stateReader, localSideInputStateFetcher, outputBuilder); if (work.isFailed()) { throw new WorkItemCancelledException(workItem.getShardingKey()); @@ -415,6 +410,7 @@ private ExecuteWorkResult executeWork( // If processing failed due to a thrown exception, close the executionState. Do not // return/release the executionState back to computationState as that will lead to this // executionState instance being reused. + LOG.info("Invalidating executor after work item {} failed with Exception:", key, t); computationWorkExecutor.invalidate(); // Re-throw the exception, it will be caught and handled by workFailureProcessor downstream. diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java index e571f89f142c..fa36b11ffe55 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java @@ -17,6 +17,9 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nonnull; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; @@ -27,19 +30,53 @@ /** StreamingEngine stream pool based implementation of {@link HeartbeatSender}. */ @Internal public final class StreamPoolHeartbeatSender implements HeartbeatSender { + private static final Logger LOG = LoggerFactory.getLogger(StreamPoolHeartbeatSender.class); - private final WindmillStreamPool heartbeatStreamPool; + @Nonnull + private final AtomicReference> + heartbeatStreamPool = new AtomicReference<>(); - public StreamPoolHeartbeatSender( + private StreamPoolHeartbeatSender( WindmillStreamPool heartbeatStreamPool) { - this.heartbeatStreamPool = heartbeatStreamPool; + this.heartbeatStreamPool.set(heartbeatStreamPool); + } + + public static StreamPoolHeartbeatSender Create( + @Nonnull WindmillStreamPool heartbeatStreamPool) { + return new StreamPoolHeartbeatSender(heartbeatStreamPool); + } + + /** + * Creates StreamPoolHeartbeatSender that switches between the passed in stream pools depending on + * global config. + * + * @param dedicatedHeartbeatPool stream to use when using separate streams for heartbeat is + * enabled. + * @param getDataPool stream to use when using separate streams for heartbeat is disabled. 
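As a usage sketch for the Create overload that follows, the caller only supplies the two pools and the config handle; the generic pool types and surrounding harness wiring are assumptions here, not taken from the diff:

// Illustration only: the pools are assumed to be built by existing harness code.
static HeartbeatSender newHeartbeatSender(
    WindmillStreamPool<WindmillStream.GetDataStream> dedicatedHeartbeatPool,
    WindmillStreamPool<WindmillStream.GetDataStream> getDataPool,
    StreamingGlobalConfigHandle configHandle) {
  // Starts on getDataPool; hops to dedicatedHeartbeatPool once a config with
  // getUseSeparateWindmillHeartbeatStreams() == true is observed, and back again
  // if the setting is later disabled. In-flight sendHeartbeats(...) calls simply
  // use whichever pool is current when they borrow a stream.
  return StreamPoolHeartbeatSender.Create(dedicatedHeartbeatPool, getDataPool, configHandle);
}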
+ */ + public static StreamPoolHeartbeatSender Create( + @Nonnull WindmillStreamPool dedicatedHeartbeatPool, + @Nonnull WindmillStreamPool getDataPool, + @Nonnull StreamingGlobalConfigHandle configHandle) { + // Use getDataPool as the default, settings callback will + // switch to the separate pool if enabled before processing any elements are processed. + StreamPoolHeartbeatSender heartbeatSender = new StreamPoolHeartbeatSender(getDataPool); + configHandle.registerConfigObserver( + streamingGlobalConfig -> + heartbeatSender.heartbeatStreamPool.set( + streamingGlobalConfig + .userWorkerJobSettings() + .getUseSeparateWindmillHeartbeatStreams() + ? dedicatedHeartbeatPool + : getDataPool)); + return heartbeatSender; } @Override public void sendHeartbeats(Heartbeats heartbeats) { try (CloseableStream closeableStream = - heartbeatStreamPool.getCloseableStream()) { + heartbeatStreamPool.get().getCloseableStream()) { closeableStream.stream().refreshActiveWork(heartbeats.heartbeatRequests().asMap()); } catch (Exception e) { LOG.warn("Error occurred sending heartbeats=[{}].", heartbeats, e); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index b41ad391d878..dadf02171235 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -102,6 +102,8 @@ import org.apache.beam.runners.dataflow.worker.streaming.ShardedKey; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandleImpl; import org.apache.beam.runners.dataflow.worker.testing.RestoreDataflowLoggingMDC; import org.apache.beam.runners.dataflow.worker.testing.TestCountingSource; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; @@ -275,6 +277,8 @@ public Long get() { @Rule public TestRule restoreMDC = new RestoreDataflowLoggingMDC(); @Rule public ErrorCollector errorCollector = new ErrorCollector(); WorkUnitClient mockWorkUnitClient = mock(WorkUnitClient.class); + StreamingGlobalConfigHandleImpl mockGlobalConfigHandle = + mock(StreamingGlobalConfigHandleImpl.class); HotKeyLogger hotKeyLogger = mock(HotKeyLogger.class); private @Nullable ComputationStateCache computationStateCache = null; @@ -750,7 +754,9 @@ private StringBuilder initializeExpectedCommitRequest( requestBuilder.append("cache_token: "); requestBuilder.append(index + 1); requestBuilder.append(" "); - if (hasSourceBytesProcessed) requestBuilder.append("source_bytes_processed: 0 "); + if (hasSourceBytesProcessed) { + requestBuilder.append("source_bytes_processed: 0 "); + } return requestBuilder; } @@ -834,6 +840,8 @@ private DataflowWorkerHarnessOptions createTestingPipelineOptions(String... 
args private StreamingDataflowWorker makeWorker( StreamingDataflowWorkerTestParams streamingDataflowWorkerTestParams) { + when(mockGlobalConfigHandle.getConfig()) + .thenReturn(streamingDataflowWorkerTestParams.streamingGlobalConfig()); StreamingDataflowWorker worker = StreamingDataflowWorker.forTesting( streamingDataflowWorkerTestParams.stateNameMappings(), @@ -847,8 +855,8 @@ private StreamingDataflowWorker makeWorker( hotKeyLogger, streamingDataflowWorkerTestParams.clock(), streamingDataflowWorkerTestParams.executorSupplier(), - streamingDataflowWorkerTestParams.localRetryTimeoutMs(), - streamingDataflowWorkerTestParams.operationalLimits()); + mockGlobalConfigHandle, + streamingDataflowWorkerTestParams.localRetryTimeoutMs()); this.computationStateCache = worker.getComputationStateCache(); return worker; } @@ -1210,8 +1218,11 @@ public void testKeyCommitTooLargeException() throws Exception { makeWorker( defaultWorkerParams() .setInstructions(instructions) - .setOperationalLimits( - OperationalLimits.builder().setMaxWorkItemCommitBytes(1000).build()) + .setStreamingGlobalConfig( + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder().setMaxWorkItemCommitBytes(1000).build()) + .build()) .publishCounters() .build()); worker.start(); @@ -1282,7 +1293,11 @@ public void testOutputKeyTooLargeException() throws Exception { makeWorker( defaultWorkerParams("--experiments=throw_exceptions_on_large_output") .setInstructions(instructions) - .setOperationalLimits(OperationalLimits.builder().setMaxOutputKeyBytes(15).build()) + .setStreamingGlobalConfig( + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder().setMaxOutputKeyBytes(15).build()) + .build()) .build()); worker.start(); @@ -1315,8 +1330,11 @@ public void testOutputValueTooLargeException() throws Exception { makeWorker( defaultWorkerParams("--experiments=throw_exceptions_on_large_output") .setInstructions(instructions) - .setOperationalLimits( - OperationalLimits.builder().setMaxOutputValueBytes(15).build()) + .setStreamingGlobalConfig( + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder().setMaxOutputValueBytes(15).build()) + .build()) .build()); worker.start(); @@ -4412,7 +4430,9 @@ Duration getLatencyAttributionDuration(long workToken, LatencyAttribution.State } boolean isActiveWorkRefresh(GetDataRequest request) { - if (request.getComputationHeartbeatRequestCount() > 0) return true; + if (request.getComputationHeartbeatRequestCount() > 0) { + return true; + } for (ComputationGetDataRequest computationRequest : request.getRequestsList()) { if (!computationRequest.getComputationId().equals(DEFAULT_COMPUTATION_ID)) { return false; @@ -4508,7 +4528,7 @@ private static StreamingDataflowWorkerTestParams.Builder builder() { .setLocalRetryTimeoutMs(-1) .setPublishCounters(false) .setClock(Instant::now) - .setOperationalLimits(OperationalLimits.builder().build()); + .setStreamingGlobalConfig(StreamingGlobalConfig.builder().build()); } abstract ImmutableMap stateNameMappings(); @@ -4525,10 +4545,11 @@ private static StreamingDataflowWorkerTestParams.Builder builder() { abstract int localRetryTimeoutMs(); - abstract OperationalLimits operationalLimits(); + abstract StreamingGlobalConfig streamingGlobalConfig(); @AutoValue.Builder abstract static class Builder { + abstract Builder setStateNameMappings(ImmutableMap value); abstract ImmutableMap.Builder stateNameMappingsBuilder(); @@ -4559,7 +4580,7 @@ final Builder publishCounters() { abstract Builder 
setLocalRetryTimeoutMs(int value); - abstract Builder setOperationalLimits(OperationalLimits operationalLimits); + abstract Builder setStreamingGlobalConfig(StreamingGlobalConfig config); abstract StreamingDataflowWorkerTestParams build(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 86ed8f552d16..a1d4210f3dbc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -59,6 +59,9 @@ import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.ProfileScope; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.streaming.config.FixedGlobalConfigHandle; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; @@ -107,6 +110,8 @@ public void setUp() { options = PipelineOptionsFactory.as(DataflowWorkerHarnessOptions.class); CounterSet counterSet = new CounterSet(); ConcurrentHashMap stateNameMap = new ConcurrentHashMap<>(); + StreamingGlobalConfigHandle globalConfigHandle = + new FixedGlobalConfigHandle(StreamingGlobalConfig.builder().build()); stateNameMap.put(NameContextsForTests.nameContextForTest().userName(), "testStateFamily"); executionContext = new StreamingModeExecutionContext( @@ -127,6 +132,7 @@ public void setUp() { PipelineOptionsFactory.create(), "test-work-item-id"), executionStateRegistry, + globalConfigHandle, Long.MAX_VALUE, /*throwExceptionOnLargeOutput=*/ false); } @@ -158,7 +164,6 @@ public void testTimerInternalsSetTimer() { Watermarks.builder().setInputDataWatermark(new Instant(1000)).build()), stateReader, sideInputStateFetcher, - OperationalLimits.builder().build(), outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); @@ -208,7 +213,6 @@ public void testTimerInternalsProcessingTimeSkew() { Watermarks.builder().setInputDataWatermark(new Instant(1000)).build()), stateReader, sideInputStateFetcher, - OperationalLimits.builder().build(), outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); assertTrue(timerTimestamp.isBefore(timerInternals.currentProcessingTime())); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java index f2e03b453fd8..8ad73a5145bc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java @@ -90,6 +90,9 @@ import 
org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.NoopProfileScope; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.streaming.config.FixedGlobalConfigHandle; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfigHandle; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.testing.TestCountingSource; import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader; @@ -594,6 +597,8 @@ public void testReadUnboundedReader() throws Exception { StreamingModeExecutionStateRegistry executionStateRegistry = new StreamingModeExecutionStateRegistry(); ReaderCache readerCache = new ReaderCache(Duration.standardMinutes(1), Runnable::run); + StreamingGlobalConfigHandle globalConfigHandle = + new FixedGlobalConfigHandle(StreamingGlobalConfig.builder().build()); StreamingModeExecutionContext context = new StreamingModeExecutionContext( counterSet, @@ -610,6 +615,7 @@ public void testReadUnboundedReader() throws Exception { PipelineOptionsFactory.create(), "test-work-item-id"), executionStateRegistry, + globalConfigHandle, Long.MAX_VALUE, /*throwExceptionOnLargeOutput=*/ false); @@ -635,7 +641,6 @@ public void testReadUnboundedReader() throws Exception { Watermarks.builder().setInputDataWatermark(new Instant(0)).build()), mock(WindmillStateReader.class), mock(SideInputStateFetcher.class), - OperationalLimits.builder().build(), Windmill.WorkItemCommitRequest.newBuilder()); @SuppressWarnings({"unchecked", "rawtypes"}) @@ -960,6 +965,8 @@ public void testFailedWorkItemsAbort() throws Exception { CounterSet counterSet = new CounterSet(); StreamingModeExecutionStateRegistry executionStateRegistry = new StreamingModeExecutionStateRegistry(); + StreamingGlobalConfigHandle globalConfigHandle = + new FixedGlobalConfigHandle(StreamingGlobalConfig.builder().build()); StreamingModeExecutionContext context = new StreamingModeExecutionContext( counterSet, @@ -979,6 +986,7 @@ public void testFailedWorkItemsAbort() throws Exception { PipelineOptionsFactory.create(), "test-work-item-id"), executionStateRegistry, + globalConfigHandle, Long.MAX_VALUE, /*throwExceptionOnLargeOutput=*/ false); @@ -1012,7 +1020,6 @@ public void testFailedWorkItemsAbort() throws Exception { dummyWork, mock(WindmillStateReader.class), mock(SideInputStateFetcher.class), - OperationalLimits.builder().build(), Windmill.WorkItemCommitRequest.newBuilder()); @SuppressWarnings({"unchecked", "rawtypes"}) diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutorTest.java new file mode 100644 index 000000000000..9146ad02fddd --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutorTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming; + +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; + +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.beam.runners.core.metrics.ExecutionStateTracker; +import org.apache.beam.runners.dataflow.worker.DataflowWorkExecutor; +import org.apache.beam.runners.dataflow.worker.StreamingModeExecutionContext; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ComputationWorkExecutorTest { + + private final DataflowWorkExecutor dataflowWorkExecutor = mock(DataflowWorkExecutor.class); + private final StreamingModeExecutionContext context = mock(StreamingModeExecutionContext.class); + private ComputationWorkExecutor computationWorkExecutor; + + @Before + public void setUp() { + computationWorkExecutor = + ComputationWorkExecutor.builder() + .setWorkExecutor(dataflowWorkExecutor) + .setContext(context) + .setExecutionStateTracker(mock(ExecutionStateTracker.class)) + .build(); + } + + @Test + public void testInvalidate_withoutCallToStart() { + // Call to invalidate w/o a call to start should not fail. + computationWorkExecutor.invalidate(); + } + + @Test + public void testInvalidate_handlesException() { + AtomicBoolean verifyContextInvalidated = new AtomicBoolean(false); + Throwable e = new RuntimeException("something bad happened 2"); + doThrow(e).when(dataflowWorkExecutor).close(); + doAnswer( + ignored -> { + verifyContextInvalidated.set(true); + return null; + }) + .when(context) + .invalidateCache(); + computationWorkExecutor.invalidate(); + assertTrue(verifyContextInvalidated.get()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandleTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandleTest.java new file mode 100644 index 000000000000..b5cb85a58c12 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/FixedGlobalConfigHandleTest.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class FixedGlobalConfigHandleTest { + + @Test + public void getConfig() { + StreamingGlobalConfig config = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(123) + .setMaxOutputKeyBytes(324) + .setMaxWorkItemCommitBytes(456) + .build()) + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(false) + .build()) + .build(); + FixedGlobalConfigHandle globalConfigHandle = new FixedGlobalConfigHandle(config); + assertEquals(config, globalConfigHandle.getConfig()); + } + + @Test + public void registerConfigObserver() throws InterruptedException { + StreamingGlobalConfig config = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(123) + .setMaxOutputKeyBytes(324) + .setMaxWorkItemCommitBytes(456) + .build()) + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(false) + .build()) + .build(); + FixedGlobalConfigHandle globalConfigHandle = new FixedGlobalConfigHandle(config); + AtomicReference configFromCallback = new AtomicReference<>(); + CountDownLatch latch = new CountDownLatch(1); + globalConfigHandle.registerConfigObserver( + cbConfig -> { + configFromCallback.set(cbConfig); + latch.countDown(); + }); + assertTrue(latch.await(10, TimeUnit.SECONDS)); + assertEquals(configFromCallback.get(), globalConfigHandle.getConfig()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcherTest.java index f39c98c61b19..2586ae2be86f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcherTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingApplianceComputationConfigFetcherTest.java @@ -137,6 +137,8 @@ public void testGetComputationConfig_onFetchConfigError() { } private StreamingApplianceComputationConfigFetcher createStreamingApplianceConfigLoader() { - return new StreamingApplianceComputationConfigFetcher(mockWindmillServer::getConfig); + return new StreamingApplianceComputationConfigFetcher( + mockWindmillServer::getConfig, + new FixedGlobalConfigHandle(StreamingGlobalConfig.builder().build())); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcherTest.java index 59fd092adcba..9fa17588c94d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcherTest.java @@ -34,7 +34,7 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.Executors; -import java.util.function.Consumer; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.WorkUnitClient; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -47,6 +47,7 @@ @RunWith(JUnit4.class) public class StreamingEngineComputationConfigFetcherTest { + private final WorkUnitClient mockDataflowServiceClient = mock(WorkUnitClient.class, new Returns(Optional.empty())); private StreamingEngineComputationConfigFetcher streamingEngineConfigFetcher; @@ -54,13 +55,13 @@ public class StreamingEngineComputationConfigFetcherTest { private StreamingEngineComputationConfigFetcher createConfigFetcher( boolean waitForInitialConfig, long globalConfigRefreshPeriod, - Consumer onPipelineConfig) { + StreamingGlobalConfigHandleImpl globalConfigHandle) { return StreamingEngineComputationConfigFetcher.forTesting( !waitForInitialConfig, globalConfigRefreshPeriod, mockDataflowServiceClient, - ignored -> Executors.newSingleThreadScheduledExecutor(), - onPipelineConfig); + globalConfigHandle, + ignored -> Executors.newSingleThreadScheduledExecutor()); } @After @@ -75,31 +76,33 @@ public void testStart_requiresInitialConfig() throws IOException, InterruptedExc .setJobId("job") .setStreamingConfigTask(new StreamingConfigTask().setMaxWorkItemCommitBytes(10L)); CountDownLatch waitForInitialConfig = new CountDownLatch(1); - Set receivedPipelineConfig = new HashSet<>(); + Set receivedPipelineConfig = new HashSet<>(); when(mockDataflowServiceClient.getGlobalStreamingConfigWorkItem()) .thenReturn(Optional.of(initialConfig)); + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + globalConfigHandle.registerConfigObserver( + config -> { + try { + receivedPipelineConfig.add(config); + waitForInitialConfig.await(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }); streamingEngineConfigFetcher = - createConfigFetcher( - /* waitForInitialConfig= */ true, - 0, - 
config -> { - try { - receivedPipelineConfig.add(config); - waitForInitialConfig.await(); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }); + createConfigFetcher(/* waitForInitialConfig= */ true, 0, globalConfigHandle); Thread asyncStartConfigLoader = new Thread(streamingEngineConfigFetcher::start); asyncStartConfigLoader.start(); waitForInitialConfig.countDown(); asyncStartConfigLoader.join(); - assertThat(receivedPipelineConfig) - .containsExactly( - StreamingEnginePipelineConfig.builder() - .setMaxWorkItemCommitBytes( - initialConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) - .build()); + StreamingGlobalConfig.Builder configBuilder = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxWorkItemCommitBytes( + initialConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + .build()); + assertThat(receivedPipelineConfig).containsExactly(configBuilder.build()); } @Test @@ -117,7 +120,7 @@ public void testStart_startsPeriodicConfigRequests() throws IOException, Interru .setJobId("job") .setStreamingConfigTask(new StreamingConfigTask().setMaxWorkItemCommitBytes(100L)); CountDownLatch numExpectedRefreshes = new CountDownLatch(3); - Set receivedPipelineConfig = new HashSet<>(); + Set receivedPipelineConfig = new HashSet<>(); when(mockDataflowServiceClient.getGlobalStreamingConfigWorkItem()) .thenReturn(Optional.of(firstConfig)) .thenReturn(Optional.of(secondConfig)) @@ -127,15 +130,15 @@ public void testStart_startsPeriodicConfigRequests() throws IOException, Interru // ConfigFetcher should not do anything with a config that doesn't contain a // StreamingConfigTask. .thenReturn(Optional.of(new WorkItem().setJobId("jobId"))); - + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + globalConfigHandle.registerConfigObserver( + config -> { + receivedPipelineConfig.add(config); + numExpectedRefreshes.countDown(); + }); streamingEngineConfigFetcher = createConfigFetcher( - /* waitForInitialConfig= */ true, - Duration.millis(100).getMillis(), - config -> { - receivedPipelineConfig.add(config); - numExpectedRefreshes.countDown(); - }); + /* waitForInitialConfig= */ true, Duration.millis(100).getMillis(), globalConfigHandle); Thread asyncStartConfigLoader = new Thread(streamingEngineConfigFetcher::start); asyncStartConfigLoader.start(); @@ -143,24 +146,34 @@ public void testStart_startsPeriodicConfigRequests() throws IOException, Interru asyncStartConfigLoader.join(); assertThat(receivedPipelineConfig) .containsExactly( - StreamingEnginePipelineConfig.builder() - .setMaxWorkItemCommitBytes( - firstConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxWorkItemCommitBytes( + firstConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + .build()) .build(), - StreamingEnginePipelineConfig.builder() - .setMaxWorkItemCommitBytes( - secondConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxWorkItemCommitBytes( + secondConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + .build()) .build(), - StreamingEnginePipelineConfig.builder() - .setMaxWorkItemCommitBytes( - thirdConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxWorkItemCommitBytes( + 
thirdConfig.getStreamingConfigTask().getMaxWorkItemCommitBytes()) + .build()) .build()); } @Test public void testGetComputationConfig() throws IOException { + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); streamingEngineConfigFetcher = - createConfigFetcher(/* waitForInitialConfig= */ false, 0, ignored -> {}); + createConfigFetcher(/* waitForInitialConfig= */ false, 0, globalConfigHandle); String computationId = "computationId"; String stageName = "stageName"; String systemName = "systemName"; @@ -193,9 +206,11 @@ public void testGetComputationConfig() throws IOException { @Test public void testGetComputationConfig_noComputationPresent() throws IOException { - Set receivedPipelineConfig = new HashSet<>(); + Set receivedPipelineConfig = new HashSet<>(); + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + globalConfigHandle.registerConfigObserver(receivedPipelineConfig::add); streamingEngineConfigFetcher = - createConfigFetcher(/* waitForInitialConfig= */ false, 0, receivedPipelineConfig::add); + createConfigFetcher(/* waitForInitialConfig= */ false, 0, globalConfigHandle); when(mockDataflowServiceClient.getStreamingConfigWorkItem(anyString())) .thenReturn(Optional.empty()); Optional pipelineConfig = @@ -206,8 +221,9 @@ public void testGetComputationConfig_noComputationPresent() throws IOException { @Test public void testGetComputationConfig_fetchConfigFromDataflowError() throws IOException { + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); streamingEngineConfigFetcher = - createConfigFetcher(/* waitForInitialConfig= */ false, 0, ignored -> {}); + createConfigFetcher(/* waitForInitialConfig= */ false, 0, globalConfigHandle); RuntimeException e = new RuntimeException("something bad happened."); when(mockDataflowServiceClient.getStreamingConfigWorkItem(anyString())).thenThrow(e); Throwable fetchConfigError = diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImplTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImplTest.java new file mode 100644 index 000000000000..059f60731a7d --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingGlobalConfigHandleImplTest.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.streaming.config; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StreamingGlobalConfigHandleImplTest { + + @Test + public void getConfig() { + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + StreamingGlobalConfig config = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(123) + .setMaxOutputKeyBytes(324) + .setMaxWorkItemCommitBytes(456) + .build()) + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(false) + .build()) + .build(); + globalConfigHandle.setConfig(config); + assertEquals(config, globalConfigHandle.getConfig()); + + StreamingGlobalConfig updatedConfig = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(324) + .setMaxOutputKeyBytes(456) + .setMaxWorkItemCommitBytes(123) + .build()) + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost1"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(true) + .build()) + .build(); + globalConfigHandle.setConfig(updatedConfig); + assertEquals(updatedConfig, globalConfigHandle.getConfig()); + } + + @Test + public void registerConfigObserver_configSetAfterRegisteringCallback() + throws InterruptedException { + CountDownLatch latch = new CountDownLatch(2); + StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl(); + StreamingGlobalConfig configToSet = + StreamingGlobalConfig.builder() + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(123) + .setMaxOutputKeyBytes(324) + .setMaxWorkItemCommitBytes(456) + .build()) + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(false) + .build()) + .build(); + AtomicReference configFromCallback1 = new AtomicReference<>(); + AtomicReference configFromCallback2 = new AtomicReference<>(); + globalConfigHandle.registerConfigObserver( + config -> { + configFromCallback1.set(config); + latch.countDown(); + }); + globalConfigHandle.registerConfigObserver( + config -> { + configFromCallback2.set(config); + latch.countDown(); + }); + globalConfigHandle.setConfig(configToSet); + assertTrue(latch.await(10, TimeUnit.SECONDS)); + assertEquals(configFromCallback1.get(), globalConfigHandle.getConfig()); + assertEquals(configFromCallback2.get(), globalConfigHandle.getConfig()); + } + 
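Alongside these StreamingGlobalConfigHandleImpl cases, the execution-context and custom-source tests earlier in the diff rely on FixedGlobalConfigHandle as a static stand-in; a compact sketch of that wiring, with an arbitrary commit-byte limit:

// FixedGlobalConfigHandle serves one immutable config, so code under test that
// calls getConfig() or registerConfigObserver(...) runs without a live backend.
StreamingGlobalConfigHandle globalConfigHandle =
    new FixedGlobalConfigHandle(
        StreamingGlobalConfig.builder()
            .setOperationalLimits(
                OperationalLimits.builder().setMaxWorkItemCommitBytes(1000).build())
            .build());
assertEquals(
    1000, globalConfigHandle.getConfig().operationalLimits().getMaxWorkItemCommitBytes());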
+  @Test
+  public void registerConfigObserver_configSetBeforeRegisteringCallback()
+      throws InterruptedException {
+    CountDownLatch latch = new CountDownLatch(2);
+    StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl();
+    StreamingGlobalConfig configToSet =
+        StreamingGlobalConfig.builder()
+            .setOperationalLimits(
+                OperationalLimits.builder()
+                    .setMaxOutputValueBytes(123)
+                    .setMaxOutputKeyBytes(324)
+                    .setMaxWorkItemCommitBytes(456)
+                    .build())
+            .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost")))
+            .setUserWorkerJobSettings(
+                UserWorkerRunnerV1Settings.newBuilder()
+                    .setUseSeparateWindmillHeartbeatStreams(false)
+                    .build())
+            .build();
+    AtomicReference<StreamingGlobalConfig> configFromCallback1 = new AtomicReference<>();
+    AtomicReference<StreamingGlobalConfig> configFromCallback2 = new AtomicReference<>();
+    globalConfigHandle.setConfig(configToSet);
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          configFromCallback1.set(config);
+          latch.countDown();
+        });
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          configFromCallback2.set(config);
+          latch.countDown();
+        });
+    assertTrue(latch.await(10, TimeUnit.SECONDS));
+    assertEquals(configFromCallback1.get(), globalConfigHandle.getConfig());
+    assertEquals(configFromCallback2.get(), globalConfigHandle.getConfig());
+  }
+
+  @Test
+  public void registerConfigObserver_configSetBeforeRegisteringCallback_callbackThrowsException()
+      throws InterruptedException {
+    CountDownLatch latch = new CountDownLatch(2);
+    StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl();
+    StreamingGlobalConfig configToSet =
+        StreamingGlobalConfig.builder()
+            .setOperationalLimits(
+                OperationalLimits.builder()
+                    .setMaxOutputValueBytes(123)
+                    .setMaxOutputKeyBytes(324)
+                    .setMaxWorkItemCommitBytes(456)
+                    .build())
+            .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost")))
+            .setUserWorkerJobSettings(
+                UserWorkerRunnerV1Settings.newBuilder()
+                    .setUseSeparateWindmillHeartbeatStreams(false)
+                    .build())
+            .build();
+    AtomicReference<StreamingGlobalConfig> configFromCallback = new AtomicReference<>();
+    globalConfigHandle.setConfig(configToSet);
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          latch.countDown();
+          throw new RuntimeException();
+        });
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          configFromCallback.set(config);
+          latch.countDown();
+        });
+    assertTrue(latch.await(10, TimeUnit.SECONDS));
+    assertEquals(configFromCallback.get(), configToSet);
+  }
+
+  @Test
+  public void registerConfigObserver_configSetAfterRegisteringCallback_callbackThrowsException()
+      throws InterruptedException {
+    CountDownLatch latch = new CountDownLatch(2);
+    StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl();
+    StreamingGlobalConfig configToSet =
+        StreamingGlobalConfig.builder()
+            .setOperationalLimits(
+                OperationalLimits.builder()
+                    .setMaxOutputValueBytes(123)
+                    .setMaxOutputKeyBytes(324)
+                    .setMaxWorkItemCommitBytes(456)
+                    .build())
+            .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost")))
+            .setUserWorkerJobSettings(
+                UserWorkerRunnerV1Settings.newBuilder()
+                    .setUseSeparateWindmillHeartbeatStreams(false)
+                    .build())
+            .build();
+    AtomicReference<StreamingGlobalConfig> configFromCallback = new AtomicReference<>();
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          latch.countDown();
+          throw new RuntimeException();
+        });
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          configFromCallback.set(config);
+          latch.countDown();
+        });
+    globalConfigHandle.setConfig(configToSet);
+    assertTrue(latch.await(10, TimeUnit.SECONDS));
+    assertEquals(configFromCallback.get(), configToSet);
+  }
+
+  @Test
+  public void registerConfigObserver_shouldNotCallCallbackIfConfigRemainsSame()
+      throws InterruptedException {
+    CountDownLatch latch = new CountDownLatch(1);
+    AtomicInteger callbackCount = new AtomicInteger(0);
+    StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl();
+    Supplier<StreamingGlobalConfig> configToSet =
+        () ->
+            StreamingGlobalConfig.builder()
+                .setOperationalLimits(
+                    OperationalLimits.builder()
+                        .setMaxOutputValueBytes(123)
+                        .setMaxOutputKeyBytes(324)
+                        .setMaxWorkItemCommitBytes(456)
+                        .build())
+                .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost")))
+                .setUserWorkerJobSettings(
+                    UserWorkerRunnerV1Settings.newBuilder()
+                        .setUseSeparateWindmillHeartbeatStreams(false)
+                        .build())
+                .build();
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          callbackCount.incrementAndGet();
+          latch.countDown();
+        });
+    globalConfigHandle.setConfig(configToSet.get());
+    // call setter again with same config
+    globalConfigHandle.setConfig(configToSet.get());
+    assertTrue(latch.await(10, TimeUnit.SECONDS));
+    // Give any duplicate notification time to arrive before asserting the callback ran only once.
+    Thread.sleep(TimeUnit.SECONDS.toMillis(10));
+    assertEquals(1, callbackCount.get());
+  }
+
+  @Test
+  public void registerConfigObserver_updateConfigWhenCallbackIsRunning()
+      throws InterruptedException {
+    CountDownLatch latch = new CountDownLatch(2);
+    StreamingGlobalConfigHandleImpl globalConfigHandle = new StreamingGlobalConfigHandleImpl();
+    StreamingGlobalConfig initialConfig =
+        StreamingGlobalConfig.builder()
+            .setOperationalLimits(OperationalLimits.builder().setMaxOutputValueBytes(4569).build())
+            .build();
+    StreamingGlobalConfig updatedConfig =
+        StreamingGlobalConfig.builder()
+            .setOperationalLimits(
+                OperationalLimits.builder()
+                    .setMaxOutputValueBytes(123)
+                    .setMaxOutputKeyBytes(324)
+                    .setMaxWorkItemCommitBytes(456)
+                    .build())
+            .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromHost("windmillHost")))
+            .setUserWorkerJobSettings(
+                UserWorkerRunnerV1Settings.newBuilder()
+                    .setUseSeparateWindmillHeartbeatStreams(false)
+                    .build())
+            .build();
+    CopyOnWriteArrayList<StreamingGlobalConfig> configsFromCallback = new CopyOnWriteArrayList<>();
+    globalConfigHandle.registerConfigObserver(
+        config -> {
+          configsFromCallback.add(config);
+          if (config.equals(initialConfig)) {
+            globalConfigHandle.setConfig(updatedConfig);
+          }
+          latch.countDown();
+        });
+    globalConfigHandle.setConfig(initialConfig);
+    assertTrue(latch.await(10, TimeUnit.SECONDS));
+    assertEquals(configsFromCallback.get(0), initialConfig);
+    assertEquals(configsFromCallback.get(1), updatedConfig);
+  }
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java
index aaa71b6598ea..ed8815c48e76 100644
--- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java
+++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java
@@ -39,6 +39,7 @@ import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors; import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillMetadataServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; @@ -54,10 +55,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory; import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactoryFactory; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetDistributor; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetSpender; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessSocketAddress; @@ -111,7 +114,11 @@ public class FanOutStreamingEngineWorkerHarnessTest { WindmillChannelFactory.inProcessChannel("StreamingEngineClientTest"))); private final GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.forTesting( - stubFactory, new ArrayList<>(), new ArrayList<>(), new HashSet<>()); + PipelineOptionsFactory.as(DataflowWorkerHarnessOptions.class), + new FakeWindmillStubFactoryFactory(stubFactory), + new ArrayList<>(), + new ArrayList<>(), + new HashSet<>()); @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private Server fakeStreamingEngineServer; private CountDownLatch getWorkerMetadataReady; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClientTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClientTest.java new file mode 100644 index 000000000000..3f746d91a868 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClientTest.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.not; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertSame; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.IsolationChannel; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactoryFactoryImpl; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; +import org.hamcrest.Matcher; +import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Enclosed.class) +public class GrpcDispatcherClientTest { + + @RunWith(JUnit4.class) + public static class RespectsJobSettingTest { + + @Test + public void createsNewStubWhenIsolatedChannelsConfigIsChanged() { + DataflowWorkerHarnessOptions options = + PipelineOptionsFactory.as(DataflowWorkerHarnessOptions.class); + options.setExperiments( + Lists.newArrayList( + GrpcDispatcherClient.STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_ISOLATED_CHANNELS)); + GrpcDispatcherClient dispatcherClient = + GrpcDispatcherClient.create(options, new WindmillStubFactoryFactoryImpl(options)); + // Create first time with Isolated channels disabled + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ false)); + CloudWindmillServiceV1Alpha1Stub stub1 = dispatcherClient.getWindmillServiceStub(); + CloudWindmillServiceV1Alpha1Stub stub2 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub2, stub1); + assertThat(stub1.getChannel(), not(instanceOf(IsolationChannel.class))); + + // Enable Isolated channels + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ true)); + CloudWindmillServiceV1Alpha1Stub stub3 = dispatcherClient.getWindmillServiceStub(); + assertNotSame(stub3, stub1); + + assertThat(stub3.getChannel(), instanceOf(IsolationChannel.class)); + CloudWindmillServiceV1Alpha1Stub stub4 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub3, stub4); + + // Disable Isolated channels + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ false)); + CloudWindmillServiceV1Alpha1Stub stub5 = dispatcherClient.getWindmillServiceStub(); + assertNotSame(stub4, stub5); + assertThat(stub5.getChannel(), not(instanceOf(IsolationChannel.class))); + } + } + + @RunWith(Parameterized.class) + public static class RespectsPipelineOptionsTest { + + @Parameters + public static Collection data() { + List list = new ArrayList<>(); + for (Boolean 
pipelineOption : new Boolean[] {true, false}) { + list.add(new Object[] {/*experimentEnabled=*/ false, pipelineOption}); + list.add(new Object[] {/*experimentEnabled=*/ true, pipelineOption}); + } + return list; + } + + @Parameter(0) + public Boolean experimentEnabled; + + @Parameter(1) + public Boolean pipelineOption; + + @Test + public void ignoresIsolatedChannelsConfigWithPipelineOption() { + DataflowWorkerHarnessOptions options = + PipelineOptionsFactory.as(DataflowWorkerHarnessOptions.class); + if (experimentEnabled) { + options.setExperiments( + Lists.newArrayList( + GrpcDispatcherClient.STREAMING_ENGINE_USE_JOB_SETTINGS_FOR_ISOLATED_CHANNELS)); + } + options.setUseWindmillIsolatedChannels(pipelineOption); + GrpcDispatcherClient dispatcherClient = + GrpcDispatcherClient.create(options, new WindmillStubFactoryFactoryImpl(options)); + Matcher classMatcher = + pipelineOption + ? instanceOf(IsolationChannel.class) + : not(instanceOf(IsolationChannel.class)); + + // Job setting disabled, PipelineOption enabled + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ false)); + CloudWindmillServiceV1Alpha1Stub stub1 = dispatcherClient.getWindmillServiceStub(); + CloudWindmillServiceV1Alpha1Stub stub2 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub2, stub1); + assertThat(stub1.getChannel(), classMatcher); + + // Job setting enabled + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ true)); + CloudWindmillServiceV1Alpha1Stub stub3 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub3, stub1); + + CloudWindmillServiceV1Alpha1Stub stub4 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub3, stub4); + + // Job setting disabled + dispatcherClient.onJobConfig(getGlobalConfig(/*useWindmillIsolatedChannels=*/ false)); + CloudWindmillServiceV1Alpha1Stub stub5 = dispatcherClient.getWindmillServiceStub(); + assertSame(stub4, stub5); + } + } + + static StreamingGlobalConfig getGlobalConfig(boolean useWindmillIsolatedChannels) { + return StreamingGlobalConfig.builder() + .setWindmillServiceEndpoints(ImmutableSet.of(HostAndPort.fromString("windmill:1234"))) + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseWindmillIsolatedChannels(useWindmillIsolatedChannels) + .build()) + .build(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java index 7e5801b65de4..239e3979a3b7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java @@ -73,6 +73,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory; import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactoryFactory; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.CallOptions; import 
org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Channel; @@ -110,6 +111,7 @@ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) }) public class GrpcWindmillServerTest { + private static final Logger LOG = LoggerFactory.getLogger(GrpcWindmillServerTest.class); private static final int STREAM_CHUNK_SIZE = 2 << 20; private final long clientId = 10L; @@ -145,8 +147,9 @@ private void startServerAndClient(List experiments) throws Exception { name, experiments, clientId, - new FakeWindmillStubFactory( - () -> grpcCleanup.register(WindmillChannelFactory.inProcessChannel(name)))); + new FakeWindmillStubFactoryFactory( + new FakeWindmillStubFactory( + () -> grpcCleanup.register(WindmillChannelFactory.inProcessChannel(name))))); } private void maybeInjectError(Stream stream) { @@ -212,7 +215,9 @@ public ClientCall interceptCall( this.client = GrpcWindmillServer.newApplianceTestInstance( - inprocessChannel, new FakeWindmillStubFactory(() -> (ManagedChannel) inprocessChannel)); + inprocessChannel, + new FakeWindmillStubFactoryFactory( + new FakeWindmillStubFactory(() -> (ManagedChannel) inprocessChannel))); Windmill.GetWorkResponse response1 = client.getWork(GetWorkRequest.getDefaultInstance()); Windmill.GetWorkResponse response2 = client.getWork(GetWorkRequest.getDefaultInstance()); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/testing/FakeWindmillStubFactoryFactory.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/testing/FakeWindmillStubFactoryFactory.java new file mode 100644 index 000000000000..51f8b8e14320 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/testing/FakeWindmillStubFactoryFactory.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.testing; + +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactoryFactory; + +public class FakeWindmillStubFactoryFactory implements WindmillStubFactoryFactory { + + private final WindmillStubFactory windmillStubFactory; + + public FakeWindmillStubFactoryFactory(WindmillStubFactory windmillStubFactory) { + this.windmillStubFactory = windmillStubFactory; + } + + @Override + public WindmillStubFactory makeWindmillStubFactory(boolean useIsolatedChannels) { + return windmillStubFactory; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSenderTest.java new file mode 100644 index 000000000000..ed915088d0a6 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSenderTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import static org.junit.Assert.assertEquals; + +import java.util.Optional; +import org.apache.beam.runners.dataflow.worker.FakeWindmillServer; +import org.apache.beam.runners.dataflow.worker.streaming.config.FakeGlobalConfigHandle; +import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingGlobalConfig; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.UserWorkerRunnerV1Settings; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.joda.time.Duration; +import org.junit.Test; +import org.junit.rules.ErrorCollector; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StreamPoolHeartbeatSenderTest { + + @Test + public void sendsHeartbeatsOnStream() { + FakeWindmillServer server = new FakeWindmillServer(new ErrorCollector(), c -> Optional.empty()); + StreamPoolHeartbeatSender heartbeatSender = + StreamPoolHeartbeatSender.Create( + WindmillStreamPool.create(1, Duration.standardSeconds(10), server::getDataStream)); + Heartbeats.Builder heartbeatsBuilder = Heartbeats.builder(); + heartbeatsBuilder + .heartbeatRequestsBuilder() + .put("key", HeartbeatRequest.newBuilder().setWorkToken(123).build()); + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + assertEquals(1, server.getGetDataRequests().size()); + } + + @Test + public void sendsHeartbeatsOnDedicatedStream() { + FakeWindmillServer dedicatedServer = + new FakeWindmillServer(new ErrorCollector(), c -> Optional.empty()); + FakeWindmillServer getDataServer = + new FakeWindmillServer(new ErrorCollector(), c -> Optional.empty()); + + FakeGlobalConfigHandle configHandle = + new FakeGlobalConfigHandle(getGlobalConfig(/*useSeparateHeartbeatStreams=*/ true)); + StreamPoolHeartbeatSender heartbeatSender = + StreamPoolHeartbeatSender.Create( + WindmillStreamPool.create( + 1, Duration.standardSeconds(10), dedicatedServer::getDataStream), + WindmillStreamPool.create( + 1, Duration.standardSeconds(10), getDataServer::getDataStream), + configHandle); + Heartbeats.Builder heartbeatsBuilder = Heartbeats.builder(); + heartbeatsBuilder + .heartbeatRequestsBuilder() + .put("key", HeartbeatRequest.newBuilder().setWorkToken(123).build()); + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + assertEquals(1, dedicatedServer.getGetDataRequests().size()); + assertEquals(0, getDataServer.getGetDataRequests().size()); + + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + assertEquals(2, dedicatedServer.getGetDataRequests().size()); + assertEquals(0, getDataServer.getGetDataRequests().size()); + + // Turn off separate heartbeats + configHandle.setConfig(getGlobalConfig(/*useSeparateHeartbeatStreams=*/ false)); + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + // request to getDataServer increases and dedicatedServer remains same + assertEquals(2, dedicatedServer.getGetDataRequests().size()); + assertEquals(1, getDataServer.getGetDataRequests().size()); + } + + private static StreamingGlobalConfig getGlobalConfig(boolean useSeparateHeartbeatStreams) { + return StreamingGlobalConfig.builder() + .setUserWorkerJobSettings( + UserWorkerRunnerV1Settings.newBuilder() + .setUseSeparateWindmillHeartbeatStreams(useSeparateHeartbeatStreams) + .build()) + .build(); + } + + @Test + public void sendsHeartbeatsOnGetDataStream() { + FakeWindmillServer 
dedicatedServer = + new FakeWindmillServer(new ErrorCollector(), c -> Optional.empty()); + FakeWindmillServer getDataServer = + new FakeWindmillServer(new ErrorCollector(), c -> Optional.empty()); + + FakeGlobalConfigHandle configHandle = + new FakeGlobalConfigHandle(getGlobalConfig(/*useSeparateHeartbeatStreams=*/ false)); + StreamPoolHeartbeatSender heartbeatSender = + StreamPoolHeartbeatSender.Create( + WindmillStreamPool.create( + 1, Duration.standardSeconds(10), dedicatedServer::getDataStream), + WindmillStreamPool.create( + 1, Duration.standardSeconds(10), getDataServer::getDataStream), + configHandle); + Heartbeats.Builder heartbeatsBuilder = Heartbeats.builder(); + heartbeatsBuilder + .heartbeatRequestsBuilder() + .put("key", HeartbeatRequest.newBuilder().setWorkToken(123).build()); + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + assertEquals(0, dedicatedServer.getGetDataRequests().size()); + assertEquals(1, getDataServer.getGetDataRequests().size()); + + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + assertEquals(0, dedicatedServer.getGetDataRequests().size()); + assertEquals(2, getDataServer.getGetDataRequests().size()); + + // Turn on separate heartbeats + configHandle.setConfig(getGlobalConfig(/*useSeparateHeartbeatStreams=*/ true)); + heartbeatSender.sendHeartbeats(heartbeatsBuilder.build()); + // request to dedicatedServer increases and getDataServer remains same + assertEquals(1, dedicatedServer.getGetDataRequests().size()); + assertEquals(2, getDataServer.getGetDataRequests().size()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto index 4677ff9dcc9a..3b3348dbc3fa 100644 --- a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto +++ b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto @@ -923,6 +923,15 @@ message WorkerMetadataResponse { reserved 4; } +// Settings to control runtime behavior of the java runner v1 user worker. +message UserWorkerRunnerV1Settings { + // If true, use separate channels for each windmill RPC. + optional bool use_windmill_isolated_channels = 1 [default = true]; + + // If true, use separate streaming RPC for windmill heartbeats and state reads. + optional bool use_separate_windmill_heartbeat_streams = 2 [default = true]; +} + service WindmillAppliance { // Gets streaming Dataflow work. rpc GetWork(.windmill.GetWorkRequest) returns (.windmill.GetWorkResponse); diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index deee8876af6f..de9a30ad8189 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -26,16 +26,11 @@ description = "Apache Beam :: Runners :: Prism :: Java" ext.summary = "Support for executing a pipeline on Prism." 
dependencies { - implementation project(path: ":model:job-management", configuration: "shadow") - implementation project(path: ":model:pipeline", configuration: "shadow") implementation project(path: ":sdks:java:core", configuration: "shadow") - implementation project(path: ":sdks:java:harness", configuration: "shadow") - implementation project(":runners:java-fn-execution") implementation project(":runners:portability:java") implementation library.java.joda_time implementation library.java.slf4j_api - implementation library.java.vendored_grpc_1_60_1 implementation library.java.vendored_guava_32_1_2_jre compileOnly library.java.hamcrest diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java deleted file mode 100644 index db56bc6047ca..000000000000 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import com.google.auto.value.AutoValue; -import java.util.Optional; -import org.apache.beam.model.pipeline.v1.RunnerApi; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.util.construction.DefaultArtifactResolver; -import org.apache.beam.sdk.util.construction.PipelineTranslation; -import org.apache.beam.sdk.util.construction.SdkComponents; - -/** - * The {@link PrismArtifactResolver} converts a {@link Pipeline} to a {@link RunnerApi.Pipeline} via - * resolving {@link RunnerApi.ArtifactInformation}. - */ -@AutoValue -abstract class PrismArtifactResolver { - - /** - * Instantiates a {@link PrismArtifactResolver} from the {@param pipeline}, applying defaults to - * the remaining dependencies. - */ - static PrismArtifactResolver of(Pipeline pipeline) { - return PrismArtifactResolver.builder().setPipeline(pipeline).build(); - } - - static Builder builder() { - return new AutoValue_PrismArtifactResolver.Builder(); - } - - /** - * Converts the {@link #getPipeline()} using {@link PipelineTranslation#toProto} and {@link - * #getDelegate()}'s {@link - * org.apache.beam.sdk.util.construction.ArtifactResolver#resolveArtifacts}. 
- */ - RunnerApi.Pipeline resolvePipelineProto() { - RunnerApi.Pipeline result = PipelineTranslation.toProto(getPipeline(), getSdkComponents()); - return getDelegate().resolveArtifacts(result); - } - - /** - * {@link PrismArtifactResolver} delegates to {@link - * org.apache.beam.sdk.util.construction.ArtifactResolver} to transform {@link - * RunnerApi.ArtifactInformation}. Defaults to {@link DefaultArtifactResolver#INSTANCE} if not - * set. - */ - abstract org.apache.beam.sdk.util.construction.ArtifactResolver getDelegate(); - - /** The {@link Pipeline} from which {@link PrismArtifactResolver#resolvePipelineProto()}. */ - abstract Pipeline getPipeline(); - - /** - * SDK objects that will be represented by {@link - * org.apache.beam.model.pipeline.v1.RunnerApi.Components}. Instantiated via {@link - * SdkComponents#create(PipelineOptions)} by default, where {@link PipelineOptions} are acquired - * from {@link #getPipeline}'s {@link Pipeline#getOptions}. - */ - abstract SdkComponents getSdkComponents(); - - @AutoValue.Builder - abstract static class Builder { - - abstract Builder setDelegate( - org.apache.beam.sdk.util.construction.ArtifactResolver artifactResolver); - - abstract Optional getDelegate(); - - abstract Builder setSdkComponents(SdkComponents sdkComponents); - - abstract Optional getSdkComponents(); - - abstract Builder setPipeline(Pipeline pipeline); - - abstract Optional getPipeline(); - - abstract PrismArtifactResolver autoBuild(); - - final PrismArtifactResolver build() { - if (!getDelegate().isPresent()) { - setDelegate(DefaultArtifactResolver.INSTANCE); - } - - if (!getSdkComponents().isPresent()) { - checkState(getPipeline().isPresent()); - setSdkComponents(SdkComponents.create(getPipeline().get().getOptions())); - } - - return autoBuild(); - } - } -} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java deleted file mode 100644 index f1d99a213eea..000000000000 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.prism; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import com.google.auto.value.AutoValue; -import java.util.Optional; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; -import org.apache.beam.model.jobmanagement.v1.ArtifactStagingServiceGrpc; -import org.apache.beam.model.jobmanagement.v1.JobApi; -import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; -import org.apache.beam.model.pipeline.v1.Endpoints; -import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; -import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; -import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Stages {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts of prepared jobs. - */ -@AutoValue -abstract class PrismArtifactStager implements AutoCloseable { - - private static final Logger LOG = LoggerFactory.getLogger(PrismArtifactStager.class); - - /** - * Instantiate a {@link PrismArtifactStager} via call to {@link #of(String, String)}, assigning - * {@link Builder#setStagingEndpoint} using {@param prepareJobResponse} {@link - * JobApi.PrepareJobResponse#getArtifactStagingEndpoint} and {@link - * JobApi.PrepareJobResponse#getStagingSessionToken}. - */ - static PrismArtifactStager of(JobApi.PrepareJobResponse prepareJobResponse) { - return of( - prepareJobResponse.getArtifactStagingEndpoint().getUrl(), - prepareJobResponse.getStagingSessionToken()); - } - - /** - * Instantiates a {@link PrismArtifactStager} from the {@param stagingEndpoint} URL and {@param - * stagingSessionToken} to instantiate the {@link #getRetrievalService}, {@link - * #getManagedChannel}, and {@link #getStagingServiceStub} defaults. See the referenced getters - * for more details. - */ - static PrismArtifactStager of(String stagingEndpoint, String stagingSessionToken) { - return PrismArtifactStager.builder() - .setStagingEndpoint(stagingEndpoint) - .setStagingSessionToken(stagingSessionToken) - .build(); - } - - static Builder builder() { - return new AutoValue_PrismArtifactStager.Builder(); - } - - /** - * Stage the {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts via {@link - * ArtifactStagingService#offer} supplying {@link #getRetrievalService}, {@link - * #getStagingServiceStub}, and {@link #getStagingSessionToken}. - */ - void stage() throws ExecutionException, InterruptedException { - LOG.info("staging artifacts at {}", getStagingEndpoint()); - ArtifactStagingService.offer( - getRetrievalService(), getStagingServiceStub(), getStagingSessionToken()); - } - - /** The URL of the {@link ArtifactStagingService}. */ - abstract String getStagingEndpoint(); - - /** - * Token associated with a staging session and acquired from a {@link - * JobServiceGrpc.JobServiceStub#prepare}'s {@link JobApi.PrepareJobResponse}. - */ - abstract String getStagingSessionToken(); - - /** - * The service that retrieves artifacts; defaults to instantiating from the default {@link - * ArtifactRetrievalService#ArtifactRetrievalService()} constructor. - */ - abstract ArtifactRetrievalService getRetrievalService(); - - /** - * Used to instantiate the {@link #getStagingServiceStub}. 
By default, instantiates using {@link - * ManagedChannelFactory#forDescriptor(Endpoints.ApiServiceDescriptor)}, where {@link - * Endpoints.ApiServiceDescriptor} is instantiated via {@link - * Endpoints.ApiServiceDescriptor.Builder#setUrl(String)} and the URL provided by {@link - * #getStagingEndpoint}. - */ - abstract ManagedChannel getManagedChannel(); - - /** - * Required by {@link ArtifactStagingService#offer}. By default, instantiates using {@link - * ArtifactStagingServiceGrpc#newStub} and {@link #getManagedChannel}. - */ - abstract ArtifactStagingServiceGrpc.ArtifactStagingServiceStub getStagingServiceStub(); - - @Override - public void close() { - LOG.info("shutting down {}", PrismArtifactStager.class); - getRetrievalService().close(); - getManagedChannel().shutdown(); - try { - getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); - } catch (InterruptedException ignored) { - } - } - - @AutoValue.Builder - abstract static class Builder { - - abstract Builder setStagingEndpoint(String stagingEndpoint); - - abstract Optional getStagingEndpoint(); - - abstract Builder setStagingSessionToken(String stagingSessionToken); - - abstract Builder setRetrievalService(ArtifactRetrievalService retrievalService); - - abstract Optional getRetrievalService(); - - abstract Builder setManagedChannel(ManagedChannel managedChannel); - - abstract Optional getManagedChannel(); - - abstract Builder setStagingServiceStub( - ArtifactStagingServiceGrpc.ArtifactStagingServiceStub stub); - - abstract Optional - getStagingServiceStub(); - - abstract PrismArtifactStager autoBuild(); - - final PrismArtifactStager build() { - - checkState(getStagingEndpoint().isPresent(), "missing staging endpoint"); - ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); - - if (!getManagedChannel().isPresent()) { - Endpoints.ApiServiceDescriptor descriptor = - Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getStagingEndpoint().get()).build(); - setManagedChannel(channelFactory.forDescriptor(descriptor)); - } - - if (!getStagingServiceStub().isPresent()) { - setStagingServiceStub(ArtifactStagingServiceGrpc.newStub(getManagedChannel().get())); - } - - if (!getRetrievalService().isPresent()) { - setRetrievalService(new ArtifactRetrievalService()); - } - - return autoBuild(); - } - } -} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java deleted file mode 100644 index e461e92c4749..000000000000 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.prism; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import com.google.auto.value.AutoValue; -import java.io.Closeable; -import java.util.Optional; -import java.util.concurrent.TimeUnit; -import org.apache.beam.model.jobmanagement.v1.JobApi; -import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; -import org.apache.beam.model.pipeline.v1.Endpoints; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; -import org.apache.beam.sdk.options.PortablePipelineOptions; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.joda.time.Duration; - -/** - * A wrapper for {@link JobServiceGrpc.JobServiceBlockingStub} that {@link #close}es when {@link - * StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is {@link - * PipelineResult.State#isTerminal}. - */ -@AutoValue -abstract class PrismJobManager implements StateListener, Closeable { - - /** - * Instantiate a {@link PrismJobManager} with {@param options}, assigning {@link #getEndpoint} - * from {@link PortablePipelineOptions#getJobEndpoint} and {@link #getTimeout} from {@link - * PortablePipelineOptions#getJobServerTimeout}. Defaults the instantiations of {@link - * #getManagedChannel} and {@link #getBlockingStub}. See respective getters for more details. - */ - static PrismJobManager of(PortablePipelineOptions options) { - return builder() - .setEndpoint(options.getJobEndpoint()) - .setTimeout(Duration.standardSeconds(options.getJobServerTimeout())) - .build(); - } - - static Builder builder() { - return new AutoValue_PrismJobManager.Builder(); - } - - /** - * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#prepare} - * method. - */ - JobApi.PrepareJobResponse prepare(JobApi.PrepareJobRequest request) { - return getBlockingStub().prepare(request); - } - - /** - * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#run} method. - */ - JobApi.RunJobResponse run(JobApi.RunJobRequest request) { - return getBlockingStub().run(request); - } - - /** The {@link JobServiceGrpc} endpoint. */ - abstract String getEndpoint(); - - /** The {@link JobServiceGrpc} timeout. */ - abstract Duration getTimeout(); - - /** The {@link #getBlockingStub}'s channel. Defaulted from the {@link #getEndpoint()}. */ - abstract ManagedChannel getManagedChannel(); - - /** The wrapped service defaulted using the {@link #getManagedChannel}. */ - abstract JobServiceGrpc.JobServiceBlockingStub getBlockingStub(); - - /** Shuts down {@link #getManagedChannel}, if not {@link #isShutdown}. */ - @Override - public void close() { - if (isShutdown()) { - return; - } - getManagedChannel().shutdown(); - try { - getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); - } catch (InterruptedException ignored) { - } - } - - /** Queries whether {@link #getManagedChannel} {@link ManagedChannel#isShutdown}. */ - boolean isShutdown() { - return getManagedChannel().isShutdown(); - } - - /** - * Override of {@link StateListener#onStateChanged}. Invokes {@link #close} when {@link - * PipelineResult.State} {@link PipelineResult.State#isTerminal}. 
- */ - @Override - public void onStateChanged(PipelineResult.State state) { - if (state.isTerminal()) { - close(); - } - } - - @AutoValue.Builder - abstract static class Builder { - - abstract Builder setEndpoint(String endpoint); - - abstract Optional getEndpoint(); - - abstract Builder setTimeout(Duration timeout); - - abstract Optional getTimeout(); - - abstract Builder setManagedChannel(ManagedChannel managedChannel); - - abstract Optional getManagedChannel(); - - abstract Builder setBlockingStub(JobServiceGrpc.JobServiceBlockingStub blockingStub); - - abstract Optional getBlockingStub(); - - abstract PrismJobManager autoBuild(); - - final PrismJobManager build() { - - checkState(getEndpoint().isPresent(), "endpoint is not set"); - checkState(getTimeout().isPresent(), "timeout is not set"); - - if (!getManagedChannel().isPresent()) { - ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); - - setManagedChannel( - channelFactory.forDescriptor( - Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getEndpoint().get()).build())); - } - - if (!getBlockingStub().isPresent()) { - setBlockingStub( - JobServiceGrpc.newBlockingStub(getManagedChannel().get()) - .withDeadlineAfter(getTimeout().get().getMillis(), TimeUnit.MILLISECONDS) - .withWaitForReady()); - } - - return autoBuild(); - } - } -} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java deleted file mode 100644 index fe9eb84a72b5..000000000000 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import com.google.auto.value.AutoValue; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.TimeUnit; -import org.apache.beam.model.jobmanagement.v1.JobApi; -import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ChannelCredentials; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.InsecureChannelCredentials; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.netty.NettyChannelBuilder; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; - -/** - * {@link StateWatcher} {@link #watch}es for and reports {@link PipelineResult.State} changes to - * {@link StateListener}s. 
- */ -@AutoValue -abstract class StateWatcher implements AutoCloseable { - - private Optional latestState = Optional.empty(); - - /** - * Instantiates a {@link StateWatcher} with {@link InsecureChannelCredentials}. {@link - * StateWatcher} will report to each {@link StateListener} of {@param listeners} of any changed - * {@link PipelineResult.State}. - */ - static StateWatcher insecure(String endpoint, StateListener... listeners) { - return StateWatcher.builder() - .setEndpoint(HostAndPort.fromString(endpoint)) - .setCredentials(InsecureChannelCredentials.create()) - .setListeners(Arrays.asList(listeners)) - .build(); - } - - /** - * Watch for a Job's {@link PipelineResult.State} change. A {@link - * org.apache.beam.model.jobmanagement.v1.JobApi.GetJobStateRequest} identifies a Job to watch via - * its {@link JobApi.GetJobStateRequest#getJobId()}. The method is blocking until the {@link - * JobApi.JobStateEvent} {@link StreamObserver#onCompleted()}. - */ - void watch(String jobId) { - JobApi.GetJobStateRequest request = - JobApi.GetJobStateRequest.newBuilder().setJobId(jobId).build(); - Iterator iterator = getJobServiceBlockingStub().getStateStream(request); - while (iterator.hasNext()) { - JobApi.JobStateEvent event = iterator.next(); - PipelineResult.State state = PipelineResult.State.valueOf(event.getState().name()); - publish(state); - } - } - - private void publish(PipelineResult.State state) { - if (latestState.isPresent() && latestState.get().equals(state)) { - return; - } - latestState = Optional.of(state); - for (StateListener listener : getListeners()) { - listener.onStateChanged(state); - } - } - - static Builder builder() { - return new AutoValue_StateWatcher.Builder(); - } - - abstract HostAndPort getEndpoint(); - - abstract ChannelCredentials getCredentials(); - - abstract List getListeners(); - - abstract ManagedChannel getManagedChannel(); - - abstract JobServiceGrpc.JobServiceBlockingStub getJobServiceBlockingStub(); - - @Override - public void close() { - getManagedChannel().shutdown(); - try { - getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); - } catch (InterruptedException ignored) { - } - } - - @AutoValue.Builder - abstract static class Builder { - - abstract Builder setEndpoint(HostAndPort endpoint); - - abstract Optional getEndpoint(); - - abstract Builder setCredentials(ChannelCredentials credentials); - - abstract Optional getCredentials(); - - abstract Builder setListeners(List listeners); - - abstract Builder setManagedChannel(ManagedChannel managedChannel); - - abstract Builder setJobServiceBlockingStub( - JobServiceGrpc.JobServiceBlockingStub jobServiceBlockingStub); - - abstract StateWatcher autoBuild(); - - final StateWatcher build() { - if (!getEndpoint().isPresent()) { - throw new IllegalStateException("missing endpoint"); - } - if (!getCredentials().isPresent()) { - throw new IllegalStateException("missing credentials"); - } - HostAndPort endpoint = getEndpoint().get(); - ManagedChannel channel = - NettyChannelBuilder.forAddress( - endpoint.getHost(), endpoint.getPort(), getCredentials().get()) - .build(); - setManagedChannel(channel); - setJobServiceBlockingStub(JobServiceGrpc.newBlockingStub(channel)); - - return autoBuild(); - } - } -} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java deleted file mode 100644 index 289ffac64f8a..000000000000 --- 
a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; - -import org.apache.beam.fn.harness.ExternalWorkerService; -import org.apache.beam.model.pipeline.v1.Endpoints; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.fn.server.GrpcFnServer; -import org.apache.beam.sdk.options.PortablePipelineOptions; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; -import org.checkerframework.checker.nullness.qual.MonotonicNonNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An {@link ExternalWorkerService} {@link GrpcFnServer} encapsulation that {@link #stop}s when - * {@link StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is - * {@link PipelineResult.State#isTerminal}. - */ -class WorkerService implements StateListener { - - private static final Logger LOG = LoggerFactory.getLogger(WorkerService.class); - - private final ExternalWorkerService worker; - private @MonotonicNonNull GrpcFnServer server; - - WorkerService(PortablePipelineOptions options) { - this.worker = new ExternalWorkerService(options); - } - - /** Start the {@link ExternalWorkerService}. */ - void start() throws Exception { - if (server != null && !server.getServer().isShutdown()) { - return; - } - - server = worker.start(); - LOG.info("Starting worker service at {}", getApiServiceDescriptorUrl()); - } - - /** - * Queries whether the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server} is - * running. - */ - boolean isRunning() { - if (server == null) { - return false; - } - return !server.getServer().isShutdown(); - } - - /** - * Queries the {@link Endpoints.ApiServiceDescriptor#getUrl} of the {@link ExternalWorkerService} - * {@link GrpcFnServer}'s {@link Server}. Throws an exception if the {@link WorkerService} has not - * {@link WorkerService#start}ed. - */ - String getApiServiceDescriptorUrl() { - return checkStateNotNull(server, "worker service not started") - .getApiServiceDescriptor() - .getUrl(); - } - - /** - * Updates {@link PortablePipelineOptions#getDefaultEnvironmentConfig} with {@link - * #getApiServiceDescriptorUrl}. Throws an exception if the {@link WorkerService} has not {@link - * WorkerService#start}ed. - */ - PortablePipelineOptions updateDefaultEnvironmentConfig(PortablePipelineOptions options) { - options.setDefaultEnvironmentConfig(getApiServiceDescriptorUrl()); - return options; - } - - /** - * Overrides {@link StateListener#onStateChanged}, invoking {@link #stop} when {@link - * PipelineResult.State#isTerminal}. 
- */ - @Override - public void onStateChanged(PipelineResult.State state) { - if (state.isTerminal()) { - stop(); - } - } - - /** - * Stops the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server}. If not {@link - * WorkerService#isRunning()}, then calling stop is a noop. - */ - void stop() { - if (server == null || server.getServer().isShutdown()) { - return; - } - LOG.info("Stopping worker service at {}", getApiServiceDescriptorUrl()); - try { - server.close(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } -} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java deleted file mode 100644 index ef4646f02347..000000000000 --- a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static com.google.common.truth.Truth.assertThat; - -import org.apache.beam.model.pipeline.v1.RunnerApi; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.transforms.Impulse; -import org.apache.beam.sdk.util.construction.BeamUrns; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for {@link PrismArtifactResolver}. */ -@RunWith(JUnit4.class) -public class PrismArtifactResolverTest { - @Test - public void resolvesPipeline() { - Pipeline pipeline = Pipeline.create(); - pipeline.apply(Impulse.create()); - PrismArtifactResolver underTest = PrismArtifactResolver.of(pipeline); - RunnerApi.Pipeline pipelineProto = underTest.resolvePipelineProto(); - RunnerApi.Components components = pipelineProto.getComponents(); - assertThat(components.getTransformsMap()).containsKey("Impulse"); - assertThat(components.getCodersMap()).containsKey("ByteArrayCoder"); - assertThat(components.getEnvironmentsMap()) - .containsKey(BeamUrns.getUrn(RunnerApi.StandardEnvironments.Environments.DOCKER)); - } -} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java deleted file mode 100644 index d3ac8a72eafb..000000000000 --- a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static com.google.common.truth.Truth.assertThat; -import static org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService.EMBEDDED_ARTIFACT_URN; -import static org.junit.Assert.assertThrows; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; -import org.apache.beam.model.pipeline.v1.RunnerApi; -import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; -import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.apache.commons.io.output.ByteArrayOutputStream; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for {@link PrismArtifactStager}. 
*/ -@RunWith(JUnit4.class) -public class PrismArtifactStagerTest { - - @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - - final ArtifactStagingService stagingService = - new ArtifactStagingService(new TestDestinationProvider()); - - @Test - public void givenValidArtifacts_stages() - throws IOException, ExecutionException, InterruptedException { - PrismArtifactStager underTest = prismArtifactStager(validArtifacts()); - assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); - underTest.stage(); - assertThat(stagingService.getStagedArtifacts(underTest.getStagingSessionToken())).isNotEmpty(); - underTest.close(); - assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); - } - - @Test - public void givenErrors_performsGracefulCleanup() throws IOException { - PrismArtifactStager underTest = prismArtifactStager(invalidArtifacts()); - assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); - ExecutionException error = assertThrows(ExecutionException.class, underTest::stage); - assertThat(error.getMessage()).contains("Unexpected artifact type: invalid-type-urn"); - assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); - underTest.close(); - assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); - } - - private PrismArtifactStager prismArtifactStager( - Map<String, List<RunnerApi.ArtifactInformation>> artifacts) throws IOException { - String serverName = InProcessServerBuilder.generateName(); - ArtifactRetrievalService retrievalService = new ArtifactRetrievalService(); - String stagingToken = "staging-token"; - stagingService.registerJob(stagingToken, artifacts); - - grpcCleanup.register( - InProcessServerBuilder.forName(serverName) - .directExecutor() - .addService(stagingService) - .addService(retrievalService) - .build() - .start()); - - ManagedChannel channel = - grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); - - return PrismArtifactStager.builder() - .setStagingEndpoint("ignore") - .setStagingSessionToken(stagingToken) - .setManagedChannel(channel) - .build(); - } - - private Map<String, List<RunnerApi.ArtifactInformation>> validArtifacts() { - return ImmutableMap.of( - "env1", - Collections.singletonList( - RunnerApi.ArtifactInformation.newBuilder() - .setTypeUrn(EMBEDDED_ARTIFACT_URN) - .setTypePayload( - RunnerApi.EmbeddedFilePayload.newBuilder() - .setData(ByteString.copyFromUtf8("type-payload")) - .build() - .toByteString()) - .setRoleUrn("role-urn") - .build())); - } - - private Map<String, List<RunnerApi.ArtifactInformation>> invalidArtifacts() { - return ImmutableMap.of( - "env1", - Collections.singletonList( - RunnerApi.ArtifactInformation.newBuilder() - .setTypeUrn("invalid-type-urn") - .setTypePayload( - RunnerApi.EmbeddedFilePayload.newBuilder() - .setData(ByteString.copyFromUtf8("type-payload")) - .build() - .toByteString()) - .setRoleUrn("role-urn") - .build())); - } - - private static class TestDestinationProvider - implements ArtifactStagingService.ArtifactDestinationProvider { - - @Override - public ArtifactStagingService.ArtifactDestination getDestination( - String stagingToken, String name) throws IOException { - return ArtifactStagingService.ArtifactDestination.create( - EMBEDDED_ARTIFACT_URN, ByteString.EMPTY, new ByteArrayOutputStream()); - } - - @Override - public void removeStagedArtifacts(String stagingToken) throws IOException {} - } -} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java deleted file mode 100644 index 1e38e4f8d12e..000000000000 ---
a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static com.google.common.truth.Truth.assertThat; -import static org.junit.Assert.assertThrows; - -import java.io.IOException; -import java.util.Optional; -import org.apache.beam.model.jobmanagement.v1.JobApi; -import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; -import org.apache.beam.model.pipeline.v1.Endpoints; -import org.apache.beam.model.pipeline.v1.RunnerApi; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.transforms.Impulse; -import org.apache.beam.sdk.util.construction.PipelineTranslation; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; -import org.joda.time.Duration; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for {@link PrismJobManager}. 
*/ -@RunWith(JUnit4.class) -public class PrismJobManagerTest { - @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - - @Rule public TestName testName = new TestName(); - - @Test - public void givenPrepareError_forwardsException_canGracefulShutdown() { - TestJobService service = - new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); - PrismJobManager underTest = prismJobManager(service); - assertThat(underTest.isShutdown()).isFalse(); - assertThrows( - RuntimeException.class, - () -> - underTest.prepare( - JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build())); - assertThat(underTest.isShutdown()).isFalse(); - underTest.close(); - assertThat(underTest.isShutdown()).isTrue(); - } - - @Test - public void givenPrepareSuccess_forwardsResponse_canGracefulShutdown() { - TestJobService service = - new TestJobService() - .withPrepareJobResponse( - JobApi.PrepareJobResponse.newBuilder() - .setStagingSessionToken("token") - .setPreparationId("preparationId") - .setArtifactStagingEndpoint( - Endpoints.ApiServiceDescriptor.newBuilder() - .setUrl("localhost:1234") - .build()) - .build()); - PrismJobManager underTest = prismJobManager(service); - assertThat(underTest.isShutdown()).isFalse(); - JobApi.PrepareJobResponse response = - underTest.prepare(JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build()); - assertThat(underTest.isShutdown()).isFalse(); - assertThat(response.getStagingSessionToken()).isEqualTo("token"); - assertThat(response.getPreparationId()).isEqualTo("preparationId"); - underTest.close(); - assertThat(underTest.isShutdown()).isTrue(); - } - - @Test - public void givenRunError_forwardsException_canGracefulShutdown() { - TestJobService service = - new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); - PrismJobManager underTest = prismJobManager(service); - assertThat(underTest.isShutdown()).isFalse(); - assertThrows( - RuntimeException.class, - () -> - underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("prepareId").build())); - assertThat(underTest.isShutdown()).isFalse(); - underTest.close(); - assertThat(underTest.isShutdown()).isTrue(); - } - - @Test - public void givenRunSuccess_forwardsResponse_canGracefulShutdown() { - TestJobService service = - new TestJobService() - .withRunJobResponse(JobApi.RunJobResponse.newBuilder().setJobId("jobId").build()); - PrismJobManager underTest = prismJobManager(service); - assertThat(underTest.isShutdown()).isFalse(); - JobApi.RunJobResponse runJobResponse = - underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("preparationId").build()); - assertThat(underTest.isShutdown()).isFalse(); - assertThat(runJobResponse.getJobId()).isEqualTo("jobId"); - underTest.close(); - assertThat(underTest.isShutdown()).isTrue(); - } - - @Test - public void givenTerminalState_closes() { - PrismJobManager underTest = prismJobManager(new TestJobService()); - assertThat(underTest.isShutdown()).isFalse(); - underTest.onStateChanged(PipelineResult.State.RUNNING); - assertThat(underTest.isShutdown()).isFalse(); - underTest.onStateChanged(PipelineResult.State.RUNNING); - assertThat(underTest.isShutdown()).isFalse(); - underTest.onStateChanged(PipelineResult.State.CANCELLED); - assertThat(underTest.isShutdown()).isTrue(); - - underTest.close(); - } - - private PrismJobManager prismJobManager(TestJobService service) { - String serverName = InProcessServerBuilder.generateName(); - try { - grpcCleanup.register( - 
InProcessServerBuilder.forName(serverName) - .directExecutor() - .addService(service) - .build() - .start()); - } catch (IOException e) { - throw new RuntimeException(e); - } - - ManagedChannel channel = - grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); - - return PrismJobManager.builder() - .setTimeout(Duration.millis(3000L)) - .setEndpoint("ignore") - .setManagedChannel(channel) - .build(); - } - - private static class TestJobService extends JobServiceGrpc.JobServiceImplBase { - - private Optional<JobApi.PrepareJobResponse> prepareJobResponse = Optional.empty(); - private Optional<JobApi.RunJobResponse> runJobResponse = Optional.empty(); - private Optional<RuntimeException> error = Optional.empty(); - - TestJobService withPrepareJobResponse(JobApi.PrepareJobResponse prepareJobResponse) { - this.prepareJobResponse = Optional.of(prepareJobResponse); - return this; - } - - TestJobService withRunJobResponse(JobApi.RunJobResponse runJobResponse) { - this.runJobResponse = Optional.of(runJobResponse); - return this; - } - - TestJobService withErrorResponse(RuntimeException error) { - this.error = Optional.of(error); - return this; - } - - @Override - public void prepare( - JobApi.PrepareJobRequest request, - StreamObserver<JobApi.PrepareJobResponse> responseObserver) { - if (prepareJobResponse.isPresent()) { - responseObserver.onNext(prepareJobResponse.get()); - responseObserver.onCompleted(); - } - if (error.isPresent()) { - responseObserver.onError(error.get()); - } - } - - @Override - public void run( - JobApi.RunJobRequest request, StreamObserver<JobApi.RunJobResponse> responseObserver) { - if (runJobResponse.isPresent()) { - responseObserver.onNext(runJobResponse.get()); - responseObserver.onCompleted(); - } - if (error.isPresent()) { - responseObserver.onError(error.get()); - } - } - } - - private static RunnerApi.Pipeline pipelineOf() { - Pipeline pipeline = Pipeline.create(); - pipeline.apply(Impulse.create()); - return PipelineTranslation.toProto(pipeline); - } -} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismLocatorTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismLocatorTest.java index 982a8bfd657c..9054e9c99a04 100644 --- a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismLocatorTest.java +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismLocatorTest.java @@ -31,6 +31,7 @@ import java.nio.file.attribute.BasicFileAttributes; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -111,6 +112,7 @@ public void givenGithubTagPrismLocationOption_thenThrows() { } @Test + @Ignore // TODO: use mock site. Currently failing with response code 500 instead of 404 public void givenPrismLocation404_thenThrows() { PrismPipelineOptions options = options(); options.setPrismLocation("https://example.com/i/dont/exist.zip"); diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java deleted file mode 100644 index cfc420046206..000000000000 --- a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership.
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static com.google.common.truth.Truth.assertThat; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import org.apache.beam.model.jobmanagement.v1.JobApi; -import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Grpc; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.InsecureServerCredentials; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -@RunWith(JUnit4.class) -public class StateWatcherTest { - - @Test - public void givenSingleListener_watches() { - Server server = serverOf(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - TestStateListener listener = new TestStateListener(); - try (StateWatcher underTest = StateWatcher.insecure("0.0.0.0:" + server.getPort(), listener)) { - underTest.watch("job-001"); - assertThat(listener.states) - .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - shutdown(server); - } - } - - @Test - public void givenMultipleListeners_watches() { - Server server = serverOf(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - TestStateListener listenerA = new TestStateListener(); - TestStateListener listenerB = new TestStateListener(); - try (StateWatcher underTest = - StateWatcher.insecure("0.0.0.0:" + server.getPort(), listenerA, listenerB)) { - underTest.watch("job-001"); - assertThat(listenerA.states) - .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - assertThat(listenerB.states) - .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - shutdown(server); - } - } - - @Test - public void publishesOnlyChangedState() { - Server server = - serverOf( - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.RUNNING, - PipelineResult.State.DONE); - TestStateListener listener = new TestStateListener(); - try (StateWatcher underTest = StateWatcher.insecure("0.0.0.0:" + server.getPort(), listener)) { - underTest.watch("job-001"); - assertThat(listener.states) - .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); - shutdown(server); - } - } - - private static class TestStateListener implements StateListener { - private final List<PipelineResult.State> states = new ArrayList<>(); - - @Override - public void onStateChanged(PipelineResult.State state) { - states.add(state); - } - } - - private static class TestJobServiceStateStream extends JobServiceGrpc.JobServiceImplBase { - private final List<PipelineResult.State> states; - - TestJobServiceStateStream(PipelineResult.State...
states) { - this.states = Arrays.asList(states); - } - - @Override - public void getStateStream( - JobApi.GetJobStateRequest request, StreamObserver<JobApi.JobStateEvent> responseObserver) { - for (PipelineResult.State state : states) { - responseObserver.onNext( - JobApi.JobStateEvent.newBuilder() - .setState(JobApi.JobState.Enum.valueOf(state.name())) - .build()); - } - responseObserver.onCompleted(); - } - } - - private static Server serverOf(PipelineResult.State... states) { - try { - return Grpc.newServerBuilderForPort(0, InsecureServerCredentials.create()) - .addService(new TestJobServiceStateStream(states)) - .build() - .start(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private static void shutdown(Server server) { - server.shutdownNow(); - try { - server.awaitTermination(); - } catch (InterruptedException ignored) { - } - } -} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java deleted file mode 100644 index 7fc05d7747cd..000000000000 --- a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.prism; - -import static com.google.common.truth.Truth.assertThat; -import static org.junit.Assert.assertThrows; - -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.PortablePipelineOptions; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for {@link WorkerService}.
*/ -@RunWith(JUnit4.class) -public class WorkerServiceTest { - @Test - public void testStartStop() throws Exception { - PortablePipelineOptions options = - PipelineOptionsFactory.create().as(PortablePipelineOptions.class); - WorkerService underTest = new WorkerService(options); - underTest.start(); - assertThat(underTest.isRunning()).isTrue(); - assertThat(underTest.getApiServiceDescriptorUrl()).matches("localhost:\\d+"); - underTest.stop(); - assertThat(underTest.isRunning()).isFalse(); - } - - @Test - public void givenStarted_updateDefaultEnvironmentConfig() throws Exception { - PortablePipelineOptions options = - PipelineOptionsFactory.create().as(PortablePipelineOptions.class); - assertThat(options.getDefaultEnvironmentConfig()).isNull(); - WorkerService underTest = new WorkerService(options); - underTest.start(); - options = underTest.updateDefaultEnvironmentConfig(options); - assertThat(options.getDefaultEnvironmentConfig()) - .isEqualTo(underTest.getApiServiceDescriptorUrl()); - underTest.stop(); - } - - @Test - public void givenNotStarted_updateDefaultEnvironmentConfig_throws() { - PortablePipelineOptions options = - PipelineOptionsFactory.create().as(PortablePipelineOptions.class); - WorkerService underTest = new WorkerService(options); - assertThrows( - IllegalStateException.class, () -> underTest.updateDefaultEnvironmentConfig(options)); - } - - @Test - public void whenStateIsTerminal_thenStop() throws Exception { - PortablePipelineOptions options = - PipelineOptionsFactory.create().as(PortablePipelineOptions.class); - WorkerService underTest = new WorkerService(options); - assertThat(underTest.isRunning()).isFalse(); - underTest.start(); - assertThat(underTest.isRunning()).isTrue(); - - underTest.onStateChanged(PipelineResult.State.RUNNING); - assertThat(underTest.isRunning()).isTrue(); - - underTest.onStateChanged(PipelineResult.State.RUNNING); - assertThat(underTest.isRunning()).isTrue(); - - underTest.onStateChanged(PipelineResult.State.CANCELLED); - assertThat(underTest.isRunning()).isFalse(); - } -} diff --git a/sdks/go.mod b/sdks/go.mod index a4d7b69af70d..26c951679cce 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,19 +23,19 @@ module github.com/apache/beam/sdks/v2 go 1.21 require ( - cloud.google.com/go/bigquery v1.62.0 - cloud.google.com/go/bigtable v1.29.0 - cloud.google.com/go/datastore v1.17.1 + cloud.google.com/go/bigquery v1.63.0 + cloud.google.com/go/bigtable v1.31.0 + cloud.google.com/go/datastore v1.19.0 cloud.google.com/go/profiler v0.4.1 - cloud.google.com/go/pubsub v1.41.0 + cloud.google.com/go/pubsub v1.43.0 cloud.google.com/go/spanner v1.67.0 cloud.google.com/go/storage v1.43.0 - github.com/aws/aws-sdk-go-v2 v1.30.4 - github.com/aws/aws-sdk-go-v2/config v1.27.28 - github.com/aws/aws-sdk-go-v2/credentials v1.17.28 - github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.12 - github.com/aws/aws-sdk-go-v2/service/s3 v1.60.0 - github.com/aws/smithy-go v1.20.4 + github.com/aws/aws-sdk-go-v2 v1.31.0 + github.com/aws/aws-sdk-go-v2/config v1.27.37 + github.com/aws/aws-sdk-go-v2/credentials v1.17.35 + github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.23 + github.com/aws/aws-sdk-go-v2/service/s3 v1.63.1 + github.com/aws/smithy-go v1.21.0 github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.8.1 @@ -52,15 +52,15 @@ require ( github.com/tetratelabs/wazero v1.8.0 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c - go.mongodb.org/mongo-driver 
v1.16.1 - golang.org/x/net v0.28.0 - golang.org/x/oauth2 v0.22.0 + go.mongodb.org/mongo-driver v1.17.0 + golang.org/x/net v0.29.0 + golang.org/x/oauth2 v0.23.0 golang.org/x/sync v0.8.0 - golang.org/x/sys v0.24.0 - golang.org/x/text v0.17.0 - google.golang.org/api v0.192.0 - google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf - google.golang.org/grpc v1.65.0 + golang.org/x/sys v0.25.0 + golang.org/x/text v0.18.0 + google.golang.org/api v0.197.0 + google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 + google.golang.org/grpc v1.66.1 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -73,10 +73,10 @@ require ( ) require ( - cel.dev/expr v0.15.0 // indirect - cloud.google.com/go/auth v0.8.1 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect - cloud.google.com/go/monitoring v1.20.3 // indirect + cel.dev/expr v0.16.0 // indirect + cloud.google.com/go/auth v0.9.3 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.4 // indirect + cloud.google.com/go/monitoring v1.21.0 // indirect dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect @@ -88,7 +88,6 @@ require ( github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/minio/highwayhash v1.0.3 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect @@ -97,56 +96,57 @@ require ( github.com/nats-io/jwt/v2 v2.5.8 // indirect github.com/nats-io/nkeys v0.4.7 // indirect github.com/nats-io/nuid v1.0.1 // indirect + github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/shirou/gopsutil/v3 v3.23.12 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect github.com/yusufpapurcu/wmi v1.2.3 // indirect - go.einride.tech/aip v0.67.1 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect - go.opentelemetry.io/otel v1.24.0 // indirect + go.einride.tech/aip v0.68.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect + go.opentelemetry.io/otel v1.29.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect - go.opentelemetry.io/otel/metric v1.24.0 // indirect - go.opentelemetry.io/otel/sdk v1.24.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.24.0 // indirect - go.opentelemetry.io/otel/trace v1.24.0 // indirect + go.opentelemetry.io/otel/metric v1.29.0 // indirect + go.opentelemetry.io/otel/sdk v1.29.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.29.0 // indirect + go.opentelemetry.io/otel/trace v1.29.0 // indirect golang.org/x/time v0.6.0 // indirect ) require ( - cloud.google.com/go v0.115.0 // indirect + cloud.google.com/go v0.115.1 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect - cloud.google.com/go/iam v1.1.12 // indirect - cloud.google.com/go/longrunning v0.5.11 // indirect + cloud.google.com/go/iam 
v1.2.0 // indirect + cloud.google.com/go/longrunning v0.6.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.4 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.12 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.16 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.16 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.18 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.18 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.16 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.4 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.18 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.18 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.16 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.22.5 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.5 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.30.4 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.5 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.20 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.23.1 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.27.1 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.31.1 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b // indirect + github.com/cncf/xds/go v0.0.0-20240822171458-6449f94b4d59 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect - github.com/docker/docker v27.1.2+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v27.2.1+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect - github.com/envoyproxy/go-control-plane v0.12.0 // indirect - github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect + github.com/envoyproxy/go-control-plane v0.13.0 // indirect + github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/goccy/go-json v0.10.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -156,7 +156,7 @@ require ( github.com/google/pprof v0.0.0-20240528025155-186aa0362fba // indirect github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.8 // indirect - github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go/v2 v2.13.0 // 
indirect github.com/gorilla/handlers v1.5.2 // indirect github.com/gorilla/mux v1.8.1 // indirect @@ -181,13 +181,13 @@ require ( github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect - github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.26.0 // indirect - golang.org/x/mod v0.18.0 // indirect - golang.org/x/tools v0.22.0 // indirect - golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect + golang.org/x/crypto v0.27.0 // indirect + golang.org/x/mod v0.20.0 // indirect + golang.org/x/tools v0.24.0 // indirect + golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 4da0ea7b208a..6bbfabfbaf14 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.15.0 h1:O1jzfJCQBfL5BFoYktaxwIhuttaQPsVWerH9/EEKx0w= -cel.dev/expr v0.15.0/go.mod h1:TRSuuV7DlVCE/uwv5QbAiW/v8l5O8C4eEPHeu7gf7Sg= +cel.dev/expr v0.16.0 h1:yloc84fytn4zmJX2GU3TkXGsaieaV7dQ057Qs4sIG2Y= +cel.dev/expr v0.16.0/go.mod h1:TRSuuV7DlVCE/uwv5QbAiW/v8l5O8C4eEPHeu7gf7Sg= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -38,8 +38,8 @@ cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRY cloud.google.com/go v0.105.0/go.mod h1:PrLgOJNe5nfE9UMxKxgXj4mD3voiP+YQ6gdt6KMFOKM= cloud.google.com/go v0.107.0/go.mod h1:wpc2eNrD7hXUTy8EKS10jkxpZBjASrORK7goS+3YX2I= cloud.google.com/go v0.110.0/go.mod h1:SJnCLqQ0FCFGSZMUNUf84MV3Aia54kn7pi8st7tMzaY= -cloud.google.com/go v0.115.0 h1:CnFSK6Xo3lDYRoBKEcAtia6VSC837/ZkJuRduSFnr14= -cloud.google.com/go v0.115.0/go.mod h1:8jIM5vVgoAEoiVxQ/O4BFTfHqulPZgs/ufEzMcFMdWU= +cloud.google.com/go v0.115.1 h1:Jo0SM9cQnSkYfp44+v+NQXHpcHqlnRJk2qxh6yvxxxQ= +cloud.google.com/go v0.115.1/go.mod h1:DuujITeaufu3gL68/lOFIirVNJwQeyf5UXyi+Wbgknc= cloud.google.com/go/accessapproval v1.4.0/go.mod h1:zybIuC3KpDOvotz59lFe5qxRZx6C75OtwbisN56xYB4= cloud.google.com/go/accessapproval v1.5.0/go.mod h1:HFy3tuiGvMdcd/u+Cu5b9NkO1pEICJ46IR82PoUdplw= cloud.google.com/go/accessapproval v1.6.0/go.mod h1:R0EiYnwV5fsRFiKZkPHr6mwyk2wxUJ30nL4j2pcFY2E= @@ -101,10 +101,10 @@ cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVo cloud.google.com/go/assuredworkloads v1.8.0/go.mod h1:AsX2cqyNCOvEQC8RMPnoc0yEarXQk6WEKkxYfL6kGIo= cloud.google.com/go/assuredworkloads v1.9.0/go.mod h1:kFuI1P78bplYtT77Tb1hi0FMxM0vVpRC7VVoJC3ZoT0= cloud.google.com/go/assuredworkloads v1.10.0/go.mod h1:kwdUQuXcedVdsIaKgKTp9t0UJkE5+PAVNhdQm4ZVq2E= -cloud.google.com/go/auth v0.8.1 h1:QZW9FjC5lZzN864p13YxvAtGUlQ+KgRL+8Sg45Z6vxo= -cloud.google.com/go/auth v0.8.1/go.mod h1:qGVp/Y3kDRSDZ5gFD/XPUfYQ9xW1iI7q8RIRoCyBbJc= -cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= 
-cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= +cloud.google.com/go/auth v0.9.3 h1:VOEUIAADkkLtyfr3BLa3R8Ed/j6w1jTBmARx+wb5w5U= +cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842BgCsmTk= +cloud.google.com/go/auth/oauth2adapt v0.2.4 h1:0GWE/FUsXhf6C+jAkWgYm7X9tK8cuEIfy19DBn6B6bY= +cloud.google.com/go/auth/oauth2adapt v0.2.4/go.mod h1:jC/jOpwFP6JBxhB3P5Rr0a9HLMC/Pe3eaL4NmdvqPtc= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/automl v1.6.0/go.mod h1:ugf8a6Fx+zP0D59WLhqgTDsQI9w07o64uf/Is3Nh5p8= cloud.google.com/go/automl v1.7.0/go.mod h1:RL9MYCCsJEOmt0Wf3z9uzG0a7adTT1fe+aObgSpkCt8= @@ -133,10 +133,10 @@ cloud.google.com/go/bigquery v1.47.0/go.mod h1:sA9XOgy0A8vQK9+MWhEQTY6Tix87M/Zur cloud.google.com/go/bigquery v1.48.0/go.mod h1:QAwSz+ipNgfL5jxiaK7weyOhzdoAy1zFm0Nf1fysJac= cloud.google.com/go/bigquery v1.49.0/go.mod h1:Sv8hMmTFFYBlt/ftw2uN6dFdQPzBlREY9yBh7Oy7/4Q= cloud.google.com/go/bigquery v1.50.0/go.mod h1:YrleYEh2pSEbgTBZYMJ5SuSr0ML3ypjRB1zgf7pvQLU= -cloud.google.com/go/bigquery v1.62.0 h1:SYEA2f7fKqbSRRBHb7g0iHTtZvtPSPYdXfmqsjpsBwo= -cloud.google.com/go/bigquery v1.62.0/go.mod h1:5ee+ZkF1x/ntgCsFQJAQTM3QkAZOecfCmvxhkJsWRSA= -cloud.google.com/go/bigtable v1.29.0 h1:2CnFjKPwjpZMZdTi2RpppvxzD80zKzDYrLYEQw/NnAs= -cloud.google.com/go/bigtable v1.29.0/go.mod h1:5p909nNdWaNUcWs6KGZO8mI5HUovstlmrIi7+eA5PTQ= +cloud.google.com/go/bigquery v1.63.0 h1:yQFuJXdDukmBkiUUpjX0i1CtHLFU62HqPs/VDvSzaZo= +cloud.google.com/go/bigquery v1.63.0/go.mod h1:TQto6OR4kw27bqjNTGkVk1Vo5PJlTgxvDJn6YEIZL/E= +cloud.google.com/go/bigtable v1.31.0 h1:/uVLxGVRbK4mxK/iO89VqXcL/zoTSmkltVfIDYVBluQ= +cloud.google.com/go/bigtable v1.31.0/go.mod h1:N/mwZO+4TSHOeyiE1JxO+sRPnW4bnR7WLn9AEaiJqew= cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= cloud.google.com/go/billing v1.5.0/go.mod h1:mztb1tBc3QekhjSgmpf/CV4LzWXLzCArwpLmP2Gm88s= cloud.google.com/go/billing v1.6.0/go.mod h1:WoXzguj+BeHXPbKfNWkqVtDdzORazmCjraY+vrxcyvI= @@ -210,8 +210,8 @@ cloud.google.com/go/datacatalog v1.8.0/go.mod h1:KYuoVOv9BM8EYz/4eMFxrr4DUKhGIOX cloud.google.com/go/datacatalog v1.8.1/go.mod h1:RJ58z4rMp3gvETA465Vg+ag8BGgBdnRPEMMSTr5Uv+M= cloud.google.com/go/datacatalog v1.12.0/go.mod h1:CWae8rFkfp6LzLumKOnmVh4+Zle4A3NXLzVJ1d1mRm0= cloud.google.com/go/datacatalog v1.13.0/go.mod h1:E4Rj9a5ZtAxcQJlEBTLgMTphfP11/lNaAshpoBgemX8= -cloud.google.com/go/datacatalog v1.20.5 h1:Cosg/L60myEbpP1HoNv77ykV7zWe7hqSwY4uUDmhx/I= -cloud.google.com/go/datacatalog v1.20.5/go.mod h1:DB0QWF9nelpsbB0eR/tA0xbHZZMvpoFD1XFy3Qv/McI= +cloud.google.com/go/datacatalog v1.22.0 h1:7e5/0B2LYbNx0BcUJbiCT8K2wCtcB5993z/v1JeLIdc= +cloud.google.com/go/datacatalog v1.22.0/go.mod h1:4Wff6GphTY6guF5WphrD76jOdfBiflDiRGFAxq7t//I= cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= cloud.google.com/go/dataflow v0.7.0/go.mod h1:PX526vb4ijFMesO1o202EaUmouZKBpjHsTlCtB4parQ= cloud.google.com/go/dataflow v0.8.0/go.mod h1:Rcf5YgTKPtQyYz8bLYhFoIV/vP39eL7fWNcSOyFfLJE= @@ -240,8 +240,8 @@ cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7 cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/datastore v1.10.0/go.mod h1:PC5UzAmDEkAmkfaknstTYbNpgE49HAgW2J1gcgUfmdM= cloud.google.com/go/datastore v1.11.0/go.mod h1:TvGxBIHCS50u8jzG+AW/ppf87v1of8nwzFNgEZU1D3c= -cloud.google.com/go/datastore v1.17.1 
h1:6Me8ugrAOAxssGhSo8im0YSuy4YvYk4mbGvCadAH5aE= -cloud.google.com/go/datastore v1.17.1/go.mod h1:mtzZ2HcVtz90OVrEXXGDc2pO4NM1kiBQy8YV4qGe0ZM= +cloud.google.com/go/datastore v1.19.0 h1:p5H3bUQltOa26GcMRAxPoNwoqGkq5v8ftx9/ZBB35MI= +cloud.google.com/go/datastore v1.19.0/go.mod h1:KGzkszuj87VT8tJe67GuB+qLolfsOt6bZq/KFuWaahc= cloud.google.com/go/datastream v1.2.0/go.mod h1:i/uTP8/fZwgATHS/XFu0TcNUhuA0twZxxQ3EyCUQMwo= cloud.google.com/go/datastream v1.3.0/go.mod h1:cqlOX8xlyYF/uxhiKn6Hbv6WjwPPuI9W2M9SAXwaLLQ= cloud.google.com/go/datastream v1.4.0/go.mod h1:h9dpzScPhDTs5noEMQVWP8Wx8AFBRyS0s8KWPx/9r0g= @@ -327,8 +327,8 @@ cloud.google.com/go/iam v0.8.0/go.mod h1:lga0/y3iH6CX7sYqypWJ33hf7kkfXJag67naqGE cloud.google.com/go/iam v0.11.0/go.mod h1:9PiLDanza5D+oWFZiH1uG+RnRCfEGKoyl6yo4cgWZGY= cloud.google.com/go/iam v0.12.0/go.mod h1:knyHGviacl11zrtZUoDuYpDgLjvr28sLQaG0YB2GYAY= cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= -cloud.google.com/go/iam v1.1.12 h1:JixGLimRrNGcxvJEQ8+clfLxPlbeZA6MuRJ+qJNQ5Xw= -cloud.google.com/go/iam v1.1.12/go.mod h1:9LDX8J7dN5YRyzVHxwQzrQs9opFFqn0Mxs9nAeB+Hhg= +cloud.google.com/go/iam v1.2.0 h1:kZKMKVNk/IsSSc/udOb83K0hL/Yh/Gcqpz+oAkoIFN8= +cloud.google.com/go/iam v1.2.0/go.mod h1:zITGuWgsLZxd8OwAlX+eMFgZDXzBm7icj1PVTYG766Q= cloud.google.com/go/iap v1.4.0/go.mod h1:RGFwRJdihTINIe4wZ2iCP0zF/qu18ZwyKxrhMhygBEc= cloud.google.com/go/iap v1.5.0/go.mod h1:UH/CGgKd4KyohZL5Pt0jSKE4m3FR51qg6FKQ/z/Ix9A= cloud.google.com/go/iap v1.6.0/go.mod h1:NSuvI9C/j7UdjGjIde7t7HBz+QTwBcapPE07+sSRcLk= @@ -348,8 +348,8 @@ cloud.google.com/go/kms v1.8.0/go.mod h1:4xFEhYFqvW+4VMELtZyxomGSYtSQKzM178ylFW4 cloud.google.com/go/kms v1.9.0/go.mod h1:qb1tPTgfF9RQP8e1wq4cLFErVuTJv7UsSC915J8dh3w= cloud.google.com/go/kms v1.10.0/go.mod h1:ng3KTUtQQU9bPX3+QGLsflZIHlkbn8amFAMY63m8d24= cloud.google.com/go/kms v1.10.1/go.mod h1:rIWk/TryCkR59GMC3YtHtXeLzd634lBbKenvyySAyYI= -cloud.google.com/go/kms v1.18.4 h1:dYN3OCsQ6wJLLtOnI8DGUwQ5shMusXsWCCC+s09ATsk= -cloud.google.com/go/kms v1.18.4/go.mod h1:SG1bgQ3UWW6/KdPo9uuJnzELXY5YTTMJtDYvajiQ22g= +cloud.google.com/go/kms v1.19.0 h1:x0OVJDl6UH1BSX4THKlMfdcFWoE4ruh90ZHuilZekrU= +cloud.google.com/go/kms v1.19.0/go.mod h1:e4imokuPJUc17Trz2s6lEXFDt8bgDmvpVynH39bdrHM= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/language v1.7.0/go.mod h1:DJ6dYN/W+SQOjF8e1hLQXMF21AkH2w9wiPzPCJa2MIE= @@ -363,8 +363,8 @@ cloud.google.com/go/logging v1.7.0/go.mod h1:3xjP2CjkM3ZkO73aj4ASA5wRPGGCRrPIAeN cloud.google.com/go/longrunning v0.1.1/go.mod h1:UUFxuDWkv22EuY93jjmDMFT5GPQKeFVJBIF6QlTqdsE= cloud.google.com/go/longrunning v0.3.0/go.mod h1:qth9Y41RRSUE69rDcOn6DdK3HfQfsUI0YSmW3iIlLJc= cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= -cloud.google.com/go/longrunning v0.5.11 h1:Havn1kGjz3whCfoD8dxMLP73Ph5w+ODyZB9RUsDxtGk= -cloud.google.com/go/longrunning v0.5.11/go.mod h1:rDn7//lmlfWV1Dx6IB4RatCPenTwwmqXuiP0/RgoEO4= +cloud.google.com/go/longrunning v0.6.0 h1:mM1ZmaNsQsnb+5n1DNPeL0KwQd9jQRqSqSDEkBZr+aI= +cloud.google.com/go/longrunning v0.6.0/go.mod h1:uHzSZqW89h7/pasCWNYdUpwGz3PcVWhrWupreVPYLts= cloud.google.com/go/managedidentities v1.3.0/go.mod h1:UzlW3cBOiPrzucO5qWkNkh0w33KFtBJU281hacNvsdE= cloud.google.com/go/managedidentities v1.4.0/go.mod h1:NWSBYbEMgqmbZsLIyKvxrYbtqOsxY1ZrGM+9RgDqInM= cloud.google.com/go/managedidentities v1.5.0/go.mod 
h1:+dWcZ0JlUmpuxpIDfyP5pP5y0bLdRwOS4Lp7gMni/LA= @@ -388,8 +388,8 @@ cloud.google.com/go/monitoring v1.7.0/go.mod h1:HpYse6kkGo//7p6sT0wsIC6IBDET0RhI cloud.google.com/go/monitoring v1.8.0/go.mod h1:E7PtoMJ1kQXWxPjB6mv2fhC5/15jInuulFdYYtlcvT4= cloud.google.com/go/monitoring v1.12.0/go.mod h1:yx8Jj2fZNEkL/GYZyTLS4ZtZEZN8WtDEiEqG4kLK50w= cloud.google.com/go/monitoring v1.13.0/go.mod h1:k2yMBAB1H9JT/QETjNkgdCGD9bPF712XiLTVr+cBrpw= -cloud.google.com/go/monitoring v1.20.3 h1:v/7MXFxYrhXLEZ9sSfwXdlTLLB/xrU7xTyYjY5acynQ= -cloud.google.com/go/monitoring v1.20.3/go.mod h1:GPIVIdNznIdGqEjtRKQWTLcUeRnPjZW85szouimiczU= +cloud.google.com/go/monitoring v1.21.0 h1:EMc0tB+d3lUewT2NzKC/hr8cSR9WsUieVywzIHetGro= +cloud.google.com/go/monitoring v1.21.0/go.mod h1:tuJ+KNDdJbetSsbSGTqnaBvbauS5kr3Q/koy3Up6r+4= cloud.google.com/go/networkconnectivity v1.4.0/go.mod h1:nOl7YL8odKyAOtzNX73/M5/mGZgqqMeryi6UPZTk/rA= cloud.google.com/go/networkconnectivity v1.5.0/go.mod h1:3GzqJx7uhtlM3kln0+x5wyFvuVH1pIBJjhCpjzSt75o= cloud.google.com/go/networkconnectivity v1.6.0/go.mod h1:OJOoEXW+0LAxHh89nXd64uGG+FbQoeH8DtxCHVOMlaM= @@ -449,8 +449,8 @@ cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcd cloud.google.com/go/pubsub v1.27.1/go.mod h1:hQN39ymbV9geqBnfQq6Xf63yNhUAhv9CZhzp5O6qsW0= cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8= cloud.google.com/go/pubsub v1.30.0/go.mod h1:qWi1OPS0B+b5L+Sg6Gmc9zD1Y+HaM0MdUr7LsupY1P4= -cloud.google.com/go/pubsub v1.41.0 h1:ZPaM/CvTO6T+1tQOs/jJ4OEMpjtel0PTLV7j1JK+ZrI= -cloud.google.com/go/pubsub v1.41.0/go.mod h1:g+YzC6w/3N91tzG66e2BZtp7WrpBBMXVa3Y9zVoOGpk= +cloud.google.com/go/pubsub v1.43.0 h1:s3Qx+F96J7Kwey/uVHdK3QxFLIlOvvw4SfMYw2jFjb4= +cloud.google.com/go/pubsub v1.43.0/go.mod h1:LNLfqItblovg7mHWgU5g84Vhza4J8kTxx0YqIeTzcXY= cloud.google.com/go/pubsublite v1.5.0/go.mod h1:xapqNQ1CuLfGi23Yda/9l4bBCKz/wC3KIJ5gKcxveZg= cloud.google.com/go/pubsublite v1.6.0/go.mod h1:1eFCS0U11xlOuMFV/0iBqw3zP12kddMeCbj/F3FSj9k= cloud.google.com/go/pubsublite v1.7.0/go.mod h1:8hVMwRXfDfvGm3fahVbtDbiLePT3gpoiJYJY+vxWxVM= @@ -677,56 +677,56 @@ github.com/aws/aws-sdk-go v1.30.19/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.34.0 h1:brux2dRrlwCF5JhTL7MUT3WUwo9zfDHZZp3+g3Mvlmo= github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= -github.com/aws/aws-sdk-go-v2 v1.30.4 h1:frhcagrVNrzmT95RJImMHgabt99vkXGslubDaDagTk8= -github.com/aws/aws-sdk-go-v2 v1.30.4/go.mod h1:CT+ZPWXbYrci8chcARI3OmI/qgd+f6WtuLOoaIA8PR0= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.4 h1:70PVAiL15/aBMh5LThwgXdSQorVr91L127ttckI9QQU= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.4/go.mod h1:/MQxMqci8tlqDH+pjmoLu1i0tbWCUP1hhyMRuFxpQCw= +github.com/aws/aws-sdk-go-v2 v1.31.0 h1:3V05LbxTSItI5kUqNwhJrrrY1BAXxXt0sN0l72QmG5U= +github.com/aws/aws-sdk-go-v2 v1.31.0/go.mod h1:ztolYtaEUtdpf9Wftr31CJfLVjOnD/CVRkKOOYgF8hA= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 h1:xDAuZTn4IMm8o1LnBZvmrL8JA1io4o3YWNXgohbf20g= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5/go.mod h1:wYSv6iDS621sEFLfKvpPE2ugjTuGlAG7iROg0hLOkfc= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.27.28 h1:OTxWGW/91C61QlneCtnD62NLb4W616/NM1jA8LhJqbg= -github.com/aws/aws-sdk-go-v2/config v1.27.28/go.mod 
h1:uzVRVtJSU5EFv6Fu82AoVFKozJi2ZCY6WRCXj06rbvs= +github.com/aws/aws-sdk-go-v2/config v1.27.37 h1:xaoIwzHVuRWRHFI0jhgEdEGc8xE1l91KaeRDsWEIncU= +github.com/aws/aws-sdk-go-v2/config v1.27.37/go.mod h1:S2e3ax9/8KnMSyRVNd3sWTKs+1clJ2f1U6nE0lpvQRg= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.17.28 h1:m8+AHY/ND8CMHJnPoH7PJIRakWGa4gbfbxuY9TGTUXM= -github.com/aws/aws-sdk-go-v2/credentials v1.17.28/go.mod h1:6TF7dSc78ehD1SL6KpRIPKMA1GyyWflIkjqg+qmf4+c= +github.com/aws/aws-sdk-go-v2/credentials v1.17.35 h1:7QknrZhYySEB1lEXJxGAmuD5sWwys5ZXNr4m5oEz0IE= +github.com/aws/aws-sdk-go-v2/credentials v1.17.35/go.mod h1:8Vy4kk7at4aPSmibr7K+nLTzG6qUQAUO4tW49fzUV4E= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.12 h1:yjwoSyDZF8Jth+mUk5lSPJCkMC0lMy6FaCD51jm6ayE= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.12/go.mod h1:fuR57fAgMk7ot3WcNQfb6rSEn+SUffl7ri+aa8uKysI= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14 h1:C/d03NAmh8C4BZXhuRNboF/DqhBkBCeDiJDcaqIT5pA= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14/go.mod h1:7I0Ju7p9mCIdlrfS+JCgqcYD0VXz/N4yozsox+0o078= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.12 h1:i7cJ1izNlox4ka6cvbHPTztYGtbpW4Je/jyQIKOIU4A= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.12/go.mod h1:lHnam/4CTEVHaANZD54IrpE80VLK+lUU84WEeJ1FJ8M= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.16 h1:TNyt/+X43KJ9IJJMjKfa3bNTiZbUP7DeCxfbTROESwY= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.16/go.mod h1:2DwJF39FlNAUiX5pAc0UNeiz16lK2t7IaFcm0LFHEgc= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.16 h1:jYfy8UPmd+6kJW5YhY0L1/KftReOGxI/4NtVSTh9O/I= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.16/go.mod h1:7ZfEPZxkW42Afq4uQB8H2E2e6ebh6mXTueEpYzjCzcs= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.23 h1:DIheXDgLzIUyZNB9BKM+9OGbvwbxitX0N6b6qNbMmNU= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.23/go.mod h1:5QQZmD2ttfnDs7GzIjdQTcF2fo27mecoEIL63H8IDBE= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.18 h1:kYQ3H1u0ANr9KEKlGs/jTLrBFPo8P8NaH/w7A01NeeM= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.18/go.mod h1:r506HmK5JDUh9+Mw4CfGJGSSoqIiLCndAuqXuhbv67Y= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.18 h1:Z7IdFUONvTcvS7YuhtVxN99v2cCoHRXOS4mTr0B/pUc= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.18/go.mod h1:DkKMmksZVVyat+Y+r1dEOgJEfUeA7UngIHWeKsi0yNc= github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.16 h1:mimdLQkIX1zr8GIPY1ZtALdBQGxcASiBd2MOp8m/dMc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.16/go.mod h1:YHk6owoSwrIsok+cAH9PENCOGoH5PU2EllX4vLtSrsY= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18 h1:OWYvKL53l1rbsUmW7bQyJVsYU/Ii3bbAAQIIFNbM0Tk= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18/go.mod h1:CUx0G1v3wG6l01tUB+j7Y8kclA8NSqK4ef0YG79a4cg= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod 
h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.4 h1:KypMCbLPPHEmf9DgMGw51jMj77VfGPAN2Kv4cfhlfgI= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.4/go.mod h1:Vz1JQXliGcQktFTN/LN6uGppAIRoLBR2bMvIMP0gOjc= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.18 h1:GckUnpm4EJOAio1c8o25a+b3lVfwVzC9gnSBqiiNmZM= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.18/go.mod h1:Br6+bxfG33Dk3ynmkhsW2Z/t9D4+lRqdLDNCKi85w0U= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.5 h1:QFASJGfT8wMXtuP3D5CRmMjARHv9ZmzFUMJznHDOY3w= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.5/go.mod h1:QdZ3OmoIjSX+8D1OPAzPxDfjXASbBMDsz9qvtyIhtik= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20 h1:rTWjG6AvWekO2B1LHeM3ktU7MqyX9rzWQ7hgzneZW7E= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20/go.mod h1:RGW2DDpVc8hu6Y6yG8G5CHVmVOAn1oV8rNKOHRJyswg= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.18 h1:tJ5RnkHCiSH0jyd6gROjlJtNwov0eGYNz8s8nFcR0jQ= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.18/go.mod h1:++NHzT+nAF7ZPrHPsA+ENvsXkOO8wEu+C6RXltAG4/c= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.20 h1:Xbwbmk44URTiHNx6PNo0ujDE6ERlsCKJD3u1zfnzAPg= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.20/go.mod h1:oAfOFzUB14ltPZj1rWwRc3d/6OgD76R8KlvU3EqM9Fg= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.16 h1:jg16PhLPUiHIj8zYIW6bqzeQSuHVEiWnGA0Brz5Xv2I= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.16/go.mod h1:Uyk1zE1VVdsHSU7096h/rwnXDzOzYQVl+FNPhPw7ShY= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18 h1:eb+tFOIl9ZsUe2259/BKPeniKuz4/02zZFH/i4Nf8Rg= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18/go.mod h1:GVCC2IJNJTmdlyEsSmofEy7EfJncP7DNnXDzRjJ5Keg= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.60.0 h1:2QXGJvG19QwqXUvgcdoCOZPyLuvZf8LiXPCN4P53TdI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.60.0/go.mod h1:BSPI0EfnYUuNHPS0uqIo5VrRwzie+Fp+YhQOUs16sKI= +github.com/aws/aws-sdk-go-v2/service/s3 v1.63.1 h1:TR96r56VwELV0qguNFCuz+/bEpRfnR3ZsS9/IG05C7Q= +github.com/aws/aws-sdk-go-v2/service/s3 v1.63.1/go.mod h1:NLTqRLe3pUNu3nTEHI6XlHLKYmc8fbHUdMxAB6+s41Q= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= -github.com/aws/aws-sdk-go-v2/service/sso v1.22.5 h1:zCsFCKvbj25i7p1u94imVoO447I/sFv8qq+lGJhRN0c= -github.com/aws/aws-sdk-go-v2/service/sso v1.22.5/go.mod h1:ZeDX1SnKsVlejeuz41GiajjZpRSWR7/42q/EyA/QEiM= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.5 h1:SKvPgvdvmiTWoi0GAJ7AsJfOz3ngVkD/ERbs5pUnHNI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.5/go.mod h1:20sz31hv/WsPa3HhU3hfrIet2kxM4Pe0r20eBZ20Tac= +github.com/aws/aws-sdk-go-v2/service/sso v1.23.1 h1:2jrVsMHqdLD1+PA4BA6Nh1eZp0Gsy3mFSB5MxDvcJtU= +github.com/aws/aws-sdk-go-v2/service/sso v1.23.1/go.mod h1:XRlMvmad0ZNL+75C5FYdMvbbLkd6qiqz6foR1nA1PXY= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.27.1 h1:0L7yGCg3Hb3YQqnSgBTZM5wepougtL1aEccdcdYhHME= 
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.27.1/go.mod h1:FnvDM4sfa+isJ3kDXIzAB9GAwVSzFzSy97uZ3IsHo4E= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.30.4 h1:iAckBT2OeEK/kBDyN/jDtpEExhjeeA/Im2q4X0rJZT8= -github.com/aws/aws-sdk-go-v2/service/sts v1.30.4/go.mod h1:vmSqFK+BVIwVpDAGZB3CoCXHzurt4qBE8lf+I/kRTh0= +github.com/aws/aws-sdk-go-v2/service/sts v1.31.1 h1:8K0UNOkZiK9Uh3HIF6Bx0rcNCftqGCeKmOaR7Gp5BSo= +github.com/aws/aws-sdk-go-v2/service/sts v1.31.1/go.mod h1:yMWe0F+XG0DkRZK5ODZhG7BEFYhLXi2dqGsv6tX0cgI= github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= -github.com/aws/smithy-go v1.20.4 h1:2HK1zBdPgRbjFOHlfeQZfpC4r72MOb9bZkiFwggKO+4= -github.com/aws/smithy-go v1.20.4/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= +github.com/aws/smithy-go v1.21.0 h1:H7L8dtDRk0P1Qm6y0ji7MCYMQObJ5R9CRpyPhRUkLYA= +github.com/aws/smithy-go v1.21.0/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= @@ -757,8 +757,8 @@ github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20220314180256-7f1daf1720fc/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b h1:ga8SEFjZ60pxLcmhnThWgvH2wg8376yUJmPhEH4H3kw= -github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20240822171458-6449f94b4d59 h1:fLZ97KE86ELjEYJCEUVzmbhfzDxHHGwBrDVMd4XL6Bs= +github.com/cncf/xds/go v0.0.0-20240822171458-6449f94b4d59/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/colinmarc/hdfs/v2 v2.1.1/go.mod h1:M3x+k8UKKmxtFu++uAZ0OtDU8jR3jnaZIAc6yK4Ue0c= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= @@ -775,8 +775,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v27.1.2+incompatible h1:AhGzR1xaQIy53qCkxARaFluI00WPGtXn0AJuoQsVYTY= -github.com/docker/docker v27.1.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v27.2.1+incompatible h1:fQdiLfW7VLscyoeYEBz7/J8soYFDZV1u6VW6gJEjNMI= +github.com/docker/docker v27.2.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= @@ -796,14 +796,14 @@ github.com/envoyproxy/go-control-plane 
v0.9.10-0.20210907150352-cf90f659a021/go. github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/go-control-plane v0.10.3/go.mod h1:fJJn/j26vwOu972OllsvAgJJM//w9BV6Fxbg2LuVd34= github.com/envoyproxy/go-control-plane v0.11.1-0.20230524094728-9239064ad72f/go.mod h1:sfYdkwUW4BA3PbKjySwjJy+O4Pu0h62rlqCMHNk+K+Q= -github.com/envoyproxy/go-control-plane v0.12.0 h1:4X+VP1GHd1Mhj6IB5mMeGbLCleqxjletLK6K0rbxyZI= -github.com/envoyproxy/go-control-plane v0.12.0/go.mod h1:ZBTaoJ23lqITozF0M6G4/IragXCQKCnYbmlmtHvwRG0= +github.com/envoyproxy/go-control-plane v0.13.0 h1:HzkeUz1Knt+3bK+8LG1bxOO/jzWZmdxpwC51i202les= +github.com/envoyproxy/go-control-plane v0.13.0/go.mod h1:GRaKG3dwvFoTg4nj7aXdZnvMg4d7nvT/wl9WgVXn3Q8= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/envoyproxy/protoc-gen-validate v0.6.7/go.mod h1:dyJXwwfPK2VSqiB9Klm1J6romD608Ba7Hij42vrOBCo= github.com/envoyproxy/protoc-gen-validate v0.9.1/go.mod h1:OKNgG7TCp5pF4d6XftA0++PMirau2/yoOwVac3AbF2w= github.com/envoyproxy/protoc-gen-validate v0.10.1/go.mod h1:DRjgyB0I43LtJapqN6NiRwroiAU2PaFuvk/vjgh61ss= -github.com/envoyproxy/protoc-gen-validate v1.0.4 h1:gVPz/FMfvh57HdSJQyvBtF00j8JU4zdyUgIUNhlgg0A= -github.com/envoyproxy/protoc-gen-validate v1.0.4/go.mod h1:qys6tmnRsYrQqIhm2bvKZH4Blx/1gTIZ2UKVY1M+Yew= +github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6Uu2PdjCQwWCJ3bM= +github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= @@ -888,8 +888,8 @@ github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= -github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg= @@ -951,8 +951,8 @@ github.com/googleapis/enterprise-certificate-proxy v0.1.0/go.mod h1:17drOmN3MwGY github.com/googleapis/enterprise-certificate-proxy v0.2.0/go.mod h1:8C0jb7/mgJe/9KK8Lm7X9ctZC2t60YyIpYEI16jx0Qg= github.com/googleapis/enterprise-certificate-proxy v0.2.1/go.mod h1:AwSRAtLfXpU5Nm3pW+v7rGDHp09LsPtGY9MduiEsR9k= github.com/googleapis/enterprise-certificate-proxy v0.2.3/go.mod h1:AwSRAtLfXpU5Nm3pW+v7rGDHp09LsPtGY9MduiEsR9k= -github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs= -github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod 
h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0= +github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= +github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gax-go/v2 v2.1.0/go.mod h1:Q3nei7sK6ybPYH7twZdmQpAd1MKb7pfu6SK+H1/DsU0= @@ -1100,6 +1100,8 @@ github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZ github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= github.com/pkg/xattr v0.4.9 h1:5883YPCtkSd8LFbs13nXplj9g9tlrwoJRjgpgMu1/fE= github.com/pkg/xattr v0.4.9/go.mod h1:di8WF84zAKk8jzR1UBTEWh9AUlIZZ7M/JNt8e9B6ktU= +github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= +github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= @@ -1181,8 +1183,8 @@ github.com/xitongsys/parquet-go-source v0.0.0-20190524061010-2b72cbee77d5/go.mod github.com/xitongsys/parquet-go-source v0.0.0-20200817004010-026bad9b25d0/go.mod h1:HYhIKsdns7xz80OgkbgJYrtQY7FjHWHKH6cvN7+czGE= github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c h1:UDtocVeACpnwauljUbeHD9UOjjcvF5kLUHruww7VT9A= github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c/go.mod h1:qLb2Itmdcp7KPa5KZKvhE9U1q5bYSOmgeOckF/H2rQA= -github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA= -github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -1196,11 +1198,11 @@ github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -go.einride.tech/aip v0.67.1 h1:d/4TW92OxXBngkSOwWS2CH5rez869KpKMaN44mdxkFI= -go.einride.tech/aip v0.67.1/go.mod h1:ZGX4/zKw8dcgzdLsrvpOOGxfxI2QSk12SlP7d6c0/XI= +go.einride.tech/aip v0.68.0 h1:4seM66oLzTpz50u4K1zlJyOXQ3tCzcJN7I22tKkjipw= +go.einride.tech/aip v0.68.0/go.mod h1:7y9FF8VtPWqpxuAxl0KQWqaULxW4zFIesD6zF5RIHHg= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= -go.mongodb.org/mongo-driver v1.16.1 h1:rIVLL3q0IHM39dvE+z2ulZLp9ENZKThVfuvN/IiN4l8= -go.mongodb.org/mongo-driver v1.16.1/go.mod 
h1:oB6AhJQvFQL4LEHyXi6aJzQJtBiTQHiAd83l0GdFaiw= +go.mongodb.org/mongo-driver v1.17.0 h1:Hp4q2MCjvY19ViwimTs00wHi7G4yzxh4/2+nTx8r40k= +go.mongodb.org/mongo-driver v1.17.0/go.mod h1:wwWm/+BuOddhcq3n68LKRmgk2wXzmF6s0SFOa0GINL4= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -1210,24 +1212,24 @@ go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= -go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= -go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 h1:r6I7RJCN86bpD/FQwedZ0vSixDpwuWREjW9oRMsmqDc= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0/go.mod h1:B9yO6b04uB80CzjedvewuqDhxJxi11s7/GtiGa8bAjI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8= +go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= +go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 h1:IeMeyr1aBvBiPVYihXIaeIZba6b8E1bYp7lbdxK8CQg= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h1:oVdCUtjq9MK9BlS7TtucsQwUcXcymNiEDjgDD2jMtZU= -go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= -go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= -go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= -go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= -go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= -go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= -go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= -go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= +go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= +go.opentelemetry.io/otel/metric v1.29.0/go.mod 
h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= +go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= +go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= +go.opentelemetry.io/otel/sdk/metric v1.29.0 h1:K2CfmJohnRgvZ9UAj2/FhIf/okdWcNdBwe1m8xFXiSY= +go.opentelemetry.io/otel/sdk/metric v1.29.0/go.mod h1:6zZLdCl2fkauYoZIOn/soQIDSWFmNSRcICarHfuhNJQ= +go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= +go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= @@ -1245,8 +1247,8 @@ golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= -golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1306,8 +1308,8 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= -golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1367,8 +1369,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod 
h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1398,8 +1400,8 @@ golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= -golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= -golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= +golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -1505,8 +1507,8 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= -golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1515,8 +1517,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= -golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= +golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= +golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1533,8 +1535,8 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text 
v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= -golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1607,8 +1609,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.3.0/go.mod h1:/rWhSS2+zyEVwoJf8YAX6L2f0ntZ7Kn/mGgAWcipA5k= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA= -golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1617,8 +1619,8 @@ golang.org/x/xerrors v0.0.0-20220411194840-2f41105eb62f/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= golang.org/x/xerrors v0.0.0-20220609144429-65e65417b02f/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= -golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 h1:LLhsEBxRTBLuKlQxFBYUOU8xyFgXv6cOTp2HASDlsDk= -golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= +golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= +golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= @@ -1686,8 +1688,8 @@ google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/ google.golang.org/api v0.110.0/go.mod h1:7FC4Vvx1Mooxh8C5HWjzZHcavuS2f6pmJpZx60ca7iI= google.golang.org/api v0.111.0/go.mod h1:qtFHvU9mhgTJegR31csQ+rwxyUTHOKFqCKWp1J0fdw0= google.golang.org/api v0.114.0/go.mod h1:ifYI2ZsFK6/uGddGfAD5BMxlnkBqCmqHSDUVi45N5Yg= -google.golang.org/api v0.192.0 h1:PljqpNAfZaaSpS+TnANfnNAXKdzHM/B9bKhwRlo7JP0= -google.golang.org/api v0.192.0/go.mod h1:9VcphjvAxPKLmSxVSzPlSRXy/5ARMEw5bf58WoVXafQ= +google.golang.org/api v0.197.0 h1:x6CwqQLsFiA5JKAiGyGBjc2bNtHtLddhJCE2IKuhhcQ= +google.golang.org/api v0.197.0/go.mod h1:AuOuo20GoQ331nq7DquGHlU6d+2wN2fZ8O0ta60nRNw= 
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1827,12 +1829,12 @@ google.golang.org/genproto v0.0.0-20230323212658-478b75c54725/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230330154414-c0448cd141ea/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= -google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf h1:OqdXDEakZCVtDiZTjcxfwbHPCT11ycCEsTKesBVKvyY= -google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:mCr1K1c8kX+1iSBREvU3Juo11CB+QOEWxbRS01wWl5M= -google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f h1:b1Ln/PG8orm0SsBbHZWke8dDp2lrCD4jSmfglFpTZbk= -google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:AHT0dDg3SoMOgZGnZk29b5xTbPHMoEC8qthmBLJCpys= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf h1:liao9UHurZLtiEwBgT9LMOnKYsHze6eA6w1KQCMVN2Q= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 h1:BulPr26Jqjnd4eYDVe+YvyR7Yc2vJGkO5/0UxD0/jZU= +google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:hL97c3SYopEHblzpxRL4lSs523++l8DYxGM1FQiYmb4= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1874,8 +1876,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= -google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= -google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/grpc v1.66.1 h1:hO5qAXR19+/Z44hmvIM4dQFMSYX9XcWsByfoxutBpAM= +google.golang.org/grpc v1.66.1/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git 
a/sdks/go/pkg/beam/core/runtime/graphx/translate.go b/sdks/go/pkg/beam/core/runtime/graphx/translate.go index 65280ef6b930..1e30d4258507 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate.go @@ -494,6 +494,7 @@ func (m *marshaller) addMultiEdge(edge NamedEdge) ([]string, error) { m.requirements[URNRequiresSplittableDoFn] = true } if _, ok := edge.Edge.DoFn.ProcessElementFn().BundleFinalization(); ok { + payload.RequestsFinalization = true m.requirements[URNRequiresBundleFinalization] = true } if _, ok := edge.Edge.DoFn.ProcessElementFn().StateProvider(); ok { diff --git a/sdks/go/pkg/beam/core/typex/special.go b/sdks/go/pkg/beam/core/typex/special.go index edc1249fe763..af36ba92d280 100644 --- a/sdks/go/pkg/beam/core/typex/special.go +++ b/sdks/go/pkg/beam/core/typex/special.go @@ -69,8 +69,18 @@ type Window interface { Equals(o Window) bool } -// BundleFinalization allows registering callbacks to be performed after the runner durably persists bundle results. +// BundleFinalization allows registering callbacks for the runner to invoke after the bundle completes and the runner +// commits the output. The parameter is accessible in the DoFn's StartBundle, ProcessElement, and FinishBundle methods. +// However, if your DoFn implementation requires BundleFinalization in StartBundle or FinishBundle, it must also appear in the +// ProcessElement signature, even if it is not invoked there. +// A common use case for BundleFinalization is performing work after the elements in a bundle have been processed. +// See beam.ParDo for documentation on these DoFn lifecycle methods. type BundleFinalization interface { + + // RegisterCallback registers the runner to invoke func() after the runner persists the bundle of processed elements. + // The time.Duration configures the callback expiration, after which the runner will not invoke func(). + // Returning an error communicates to the runner that bundle finalization failed and the runner may choose to attempt + // finalization again. RegisterCallback(time.Duration, func() error) } diff --git a/sdks/go/pkg/beam/forward.go b/sdks/go/pkg/beam/forward.go index 210c39ab4e49..b2f610b703e9 100644 --- a/sdks/go/pkg/beam/forward.go +++ b/sdks/go/pkg/beam/forward.go @@ -204,6 +204,7 @@ type Window = typex.Window // BundleFinalization represents the parameter used to register callbacks to // be run once the runner has durably persisted output for a bundle. +// See typex.BundleFinalization for more details. type BundleFinalization = typex.BundleFinalization // These are the reflect.Type instances of the universal types, which are used diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go index 2d3425af33c6..13e9b6f1b79d 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go @@ -78,11 +78,7 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb } // Lets check for and remove anything that makes things less simple. - if pdo.OnWindowExpirationTimerFamilySpec == "" && - !pdo.RequestsFinalization && - !pdo.RequiresStableInput && - !pdo.RequiresTimeSortedInput && - pdo.RestrictionCoderId == "" { + if pdo.RestrictionCoderId == "" { // Which inputs are Side inputs don't change the graph further, // so they're not included here. Any nearly any ParDo can have them. 
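For context on the BundleFinalization semantics documented in special.go above, the following is a minimal, illustrative Go sketch (not part of this patch) of a user DoFn that opts into bundle finalization. The package name, the flushingFn type, the five-minute expiration, and the "flush an external system" comment are assumptions for illustration only; the registration helper, RegisterCallback signature, and pipeline wiring follow the API shown elsewhere in this diff.

package example

import (
	"time"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
)

func init() {
	// Register the DoFn so portable runners can serialize and execute it.
	register.DoFn2x0[beam.BundleFinalization, []byte]((*flushingFn)(nil))
}

type flushingFn struct{}

// ProcessElement declares beam.BundleFinalization as its first parameter, which is what
// causes the SDK to set RequestsFinalization on the ParDo payload (see translate.go above)
// so the runner knows to issue a FinalizeBundle call for the bundle.
func (fn *flushingFn) ProcessElement(bf beam.BundleFinalization, _ []byte) {
	bf.RegisterCallback(5*time.Minute, func() error {
		// Runs only after the runner durably commits the bundle's output; a typical
		// use is acknowledging or flushing an external system at this point.
		return nil
	})
}

// BuildPipeline wires the DoFn into a trivial pipeline rooted at an Impulse.
func BuildPipeline(s beam.Scope) {
	imp := beam.Impulse(s)
	beam.ParDo0(s, &flushingFn{}, imp)
}

In loopback integration tests such as the ones added in pardo_test.go, the callback's side effects can be observed in-process (the patch uses an atomic counter for exactly this purpose).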
diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go index 6cde48ded9ac..1407feafe325 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/job.go @@ -44,6 +44,7 @@ import ( var supportedRequirements = map[string]struct{}{ urns.RequirementSplittableDoFn: {}, urns.RequirementStatefulProcessing: {}, + urns.RequirementBundleFinalization: {}, } // TODO, move back to main package, and key off of executor handlers? diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go index ed7f168e36ee..7de32f85b7ee 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go @@ -445,6 +445,7 @@ func finalizeStage(stg *stage, comps *pipepb.Components, pipelineFacts *fusionFa if err := (proto.UnmarshalOptions{}).Unmarshal(t.GetSpec().GetPayload(), pardo); err != nil { return fmt.Errorf("unable to decode ParDoPayload for %v", link.Transform) } + stg.finalize = pardo.RequestsFinalization if len(pardo.GetTimerFamilySpecs())+len(pardo.GetStateSpecs())+len(pardo.GetOnWindowExpirationTimerFamilySpec()) > 0 { stg.stateful = true } diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go index 1c35794658d4..56879a3455f2 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go @@ -134,7 +134,11 @@ func Test_preprocessor_preProcessGraph(t *testing.T) { }}) gotStages := pre.preProcessGraph(test.input, nil) - if diff := cmp.Diff(test.wantStages, gotStages, cmp.AllowUnexported(stage{}, link{}), cmpopts.EquateEmpty()); diff != "" { + if diff := cmp.Diff(test.wantStages, gotStages, + cmp.AllowUnexported(stage{}, link{}), + cmpopts.EquateEmpty(), + cmpopts.IgnoreFields(stage{}, "baseProgTick"), + ); diff != "" { t.Errorf("preProcessGraph(%q) stages diff (-want,+got)\n%v", test.name, diff) } diff --git a/sdks/go/pkg/beam/runners/prism/internal/stage.go b/sdks/go/pkg/beam/runners/prism/internal/stage.go index da23ca8ccce1..f33754b2ca0a 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/stage.go +++ b/sdks/go/pkg/beam/runners/prism/internal/stage.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "io" + "runtime/debug" + "sync/atomic" "time" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" @@ -62,6 +64,7 @@ type stage struct { sideInputs []engine.LinkID // Non-parallel input PCollections and their consumers internalCols []string // PCollections that escape. Used for precise coder sending. envID string + finalize bool stateful bool // hasTimers indicates the transform+timerfamily pairs that need to be waited on for // the stage to be considered complete. @@ -76,13 +79,36 @@ type stage struct { SinkToPCollection map[string]string OutputsToCoders map[string]engine.PColInfo + + // Stage specific progress and splitting interval. + baseProgTick atomic.Value // time.Duration +} + +// The minimum and maximum durations between each ProgressBundleRequest and split evaluation. 
+const ( + minimumProgTick = 100 * time.Millisecond + maximumProgTick = 30 * time.Second +) + +func clampTick(dur time.Duration) time.Duration { + switch { + case dur < minimumProgTick: + return minimumProgTick + case dur > maximumProgTick: + return maximumProgTick + default: + return dur + } } func (s *stage) Execute(ctx context.Context, j *jobservices.Job, wk *worker.W, comps *pipepb.Components, em *engine.ElementManager, rb engine.RunBundle) (err error) { + if s.baseProgTick.Load() == nil { + s.baseProgTick.Store(minimumProgTick) + } defer func() { // Convert execution panics to errors to fail the bundle. if e := recover(); e != nil { - err = fmt.Errorf("panic in stage.Execute bundle processing goroutine: %v, stage: %+v", e, s) + err = fmt.Errorf("panic in stage.Execute bundle processing goroutine: %v, stage: %+v,stackTrace:\n%s", e, s, debug.Stack()) } }() slog.Debug("Execute: starting bundle", "bundle", rb) @@ -142,7 +168,9 @@ func (s *stage) Execute(ctx context.Context, j *jobservices.Job, wk *worker.W, c previousTotalCount := int64(-2) // Total count of all pcollection elements. unsplit := true - progTick := time.NewTicker(100 * time.Millisecond) + baseTick := s.baseProgTick.Load().(time.Duration) + ticked := false + progTick := time.NewTicker(baseTick) defer progTick.Stop() var dataFinished, bundleFinished bool // If we have no data outputs, we still need to have progress & splits @@ -170,6 +198,7 @@ progress: break progress // exit progress loop on close. } case <-progTick.C: + ticked = true resp, err := b.Progress(ctx, wk) if err != nil { slog.Debug("SDK Error from progress, aborting progress", "bundle", rb, "error", err.Error()) @@ -196,6 +225,7 @@ progress: unsplit = false continue progress } + // TODO sort out rescheduling primary Roots on bundle failure. var residuals []engine.Residual for _, rr := range sr.GetResidualRoots() { @@ -220,12 +250,28 @@ progress: Data: residuals, }) } + + // Any split means we're processing slower than desired, but splitting should increase + // throughput. Back off for this and other bundles for this stage + baseTime := s.baseProgTick.Load().(time.Duration) + newTime := clampTick(baseTime * 4) + if s.baseProgTick.CompareAndSwap(baseTime, newTime) { + progTick.Reset(newTime) + } else { + progTick.Reset(s.baseProgTick.Load().(time.Duration)) + } } else { previousIndex = index["index"] previousTotalCount = index["totalCount"] } } } + // If we never received any progress ticks, we may have too long a time, shrink it for new runs instead. + if !ticked { + newTick := clampTick(baseTick - minimumProgTick) + // If it's otherwise unchanged, apply the new duration. + s.baseProgTick.CompareAndSwap(baseTick, newTick) + } // Tentative Data is ready, commit it to the main datastore. slog.Debug("Execute: committing data", "bundle", rb, slog.Any("outputsWithData", maps.Keys(b.OutputData.Raw)), slog.Any("outputs", maps.Keys(s.OutputsToCoders))) @@ -278,6 +324,14 @@ progress: slog.Debug("returned empty residual application", "bundle", rb, slog.Int("numResiduals", l), slog.String("pcollection", s.primaryInput)) } em.PersistBundle(rb, s.OutputsToCoders, b.OutputData, s.inputInfo, residuals) + if s.finalize { + _, err := b.Finalize(ctx, wk) + if err != nil { + slog.Error("SDK Error from bundle finalization", "bundle", rb, "error", err.Error()) + panic(err) + } + slog.Info("finalized bundle", "bundle", rb) + } b.OutputData = engine.TentativeData{} // Clear the data. 
return nil } diff --git a/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go b/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go index 6afb04521af0..f8917c72ccde 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go @@ -83,6 +83,7 @@ func TestImplemented(t *testing.T) { {pipeline: primitives.Checkpoints}, {pipeline: primitives.CoGBK}, {pipeline: primitives.ReshuffleKV}, + {pipeline: primitives.ParDoProcessElementBundleFinalizer}, // The following have been "allowed" to unblock further development // But it's not clear these tests truly validate the expected behavior diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go index 50e427ca36f5..3ccafdb81e9a 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go @@ -206,6 +206,17 @@ func (b *B) Cleanup(wk *W) { wk.mu.Unlock() } +func (b *B) Finalize(ctx context.Context, wk *W) (*fnpb.FinalizeBundleResponse, error) { + resp := wk.sendInstruction(ctx, &fnpb.InstructionRequest{ + Request: &fnpb.InstructionRequest_FinalizeBundle{ + FinalizeBundle: &fnpb.FinalizeBundleRequest{ + InstructionId: b.InstID, + }, + }, + }) + return resp.GetFinalizeBundle(), nil +} + // Progress sends a progress request for the given bundle to the passed in worker, blocking on the response. func (b *B) Progress(ctx context.Context, wk *W) (*fnpb.ProcessBundleProgressResponse, error) { resp := wk.sendInstruction(ctx, &fnpb.InstructionRequest{ diff --git a/sdks/go/pkg/beam/transforms/periodic/periodic.go b/sdks/go/pkg/beam/transforms/periodic/periodic.go index 5a7b4d0cf536..cc9c342b9125 100644 --- a/sdks/go/pkg/beam/transforms/periodic/periodic.go +++ b/sdks/go/pkg/beam/transforms/periodic/periodic.go @@ -61,6 +61,15 @@ func NewSequenceDefinition(start, end time.Time, interval time.Duration) Sequenc } } +// Calculates size of the output that the sequence should have emitted up to now. 
+func calculateSequenceByteSize(now time.Time, sd SequenceDefinition, rest offsetrange.Restriction) int64 { + nowIndex := int64(now.Sub(mtime.Time(sd.Start).ToTime()) / sd.Interval) + if nowIndex < rest.Start { + return 0 + } + return 8 * (min(rest.End, nowIndex) - rest.Start) +} + type sequenceGenDoFn struct{} func (fn *sequenceGenDoFn) CreateInitialRestriction(sd SequenceDefinition) offsetrange.Restriction { @@ -75,8 +84,8 @@ func (fn *sequenceGenDoFn) CreateTracker(rest offsetrange.Restriction) *sdf.Lock return sdf.NewLockRTracker(offsetrange.NewTracker(rest)) } -func (fn *sequenceGenDoFn) RestrictionSize(_ SequenceDefinition, rest offsetrange.Restriction) float64 { - return rest.Size() +func (fn *sequenceGenDoFn) RestrictionSize(sd SequenceDefinition, rest offsetrange.Restriction) float64 { + return float64(calculateSequenceByteSize(time.Now(), sd, rest)) } func (fn *sequenceGenDoFn) SplitRestriction(_ SequenceDefinition, rest offsetrange.Restriction) []offsetrange.Restriction { diff --git a/sdks/go/pkg/beam/transforms/periodic/periodic_test.go b/sdks/go/pkg/beam/transforms/periodic/periodic_test.go index a34edf8d07b8..26f94be5b4e9 100644 --- a/sdks/go/pkg/beam/transforms/periodic/periodic_test.go +++ b/sdks/go/pkg/beam/transforms/periodic/periodic_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/apache/beam/sdks/v2/go/pkg/beam" + "github.com/apache/beam/sdks/v2/go/pkg/beam/io/rtrackers/offsetrange" "github.com/apache/beam/sdks/v2/go/pkg/beam/options/jobopts" _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" @@ -56,3 +57,38 @@ func TestImpulse(t *testing.T) { passert.Count(s, out, "SecondsInMinute", 60) ptest.RunAndValidate(t, p) } + +func TestSize(t *testing.T) { + sd := SequenceDefinition{ + Interval: 10 * time.Second, + Start: 0, + End: 1000 * time.Minute.Milliseconds(), + } + end := int64((1000 * time.Minute) / (10 * time.Second)) + + sizeTests := []struct { + now, startIndex, endIndex, want int64 + }{ + {100, 10, end, 0}, + {100, 9, end, 8}, + {100, 8, end, 16}, + {101, 9, end, 8}, + {10000, 0, end, 8 * 10000 / 10}, + {10000, 1002, 1003, 0}, + {10100, 1002, 1003, 8}, + } + + for _, test := range sizeTests { + got := calculateSequenceByteSize( + time.Unix(test.now, 0), + sd, + offsetrange.Restriction{ + Start: int64(test.startIndex), + End: int64(test.endIndex), + }) + if got != test.want { + t.Errorf("TestBytes(%v, %v, %v) = %v, want %v", + test.now, test.startIndex, test.endIndex, got, test.want) + } + } +} diff --git a/sdks/go/test/integration/integration.go b/sdks/go/test/integration/integration.go index aec69036eeb5..de782daa2d5d 100644 --- a/sdks/go/test/integration/integration.go +++ b/sdks/go/test/integration/integration.go @@ -104,6 +104,9 @@ var directFilters = []string{ "TestSetState", "TestSetStateClear", "TestTimers.*", // no timer support for the go direct runner. + + // no support for BundleFinalizer + "TestParDoBundleFinalizer.*", } var portableFilters = []string{ @@ -134,6 +137,9 @@ var portableFilters = []string{ // The portable runner does not uniquify timers. (data elements re-fired) "TestTimers.*", + + // no support for BundleFinalizer + "TestParDoBundleFinalizer.*", } var prismFilters = []string{ @@ -190,6 +196,9 @@ var flinkFilters = []string{ "TestTimers_EventTime_Unbounded", // (failure when comparing on side inputs (NPE on window lookup)) "TestTimers_ProcessingTime.*", // Flink doesn't support processing time timers. 
+ + // no support for BundleFinalizer + "TestParDoBundleFinalizer.*", } var samzaFilters = []string{ @@ -231,6 +240,9 @@ var samzaFilters = []string{ // Samza does not support state. "TestTimers.*", + + // no support for BundleFinalizer + "TestParDoBundleFinalizer.*", } var sparkFilters = []string{ @@ -265,6 +277,9 @@ var sparkFilters = []string{ "TestTimers_EventTime_Unbounded", // Side inputs in executable stage not supported. "TestTimers_ProcessingTime_Infinity", // Spark doesn't support test stream. + + // no support for BundleFinalizer + "TestParDoBundleFinalizer.*", } var dataflowFilters = []string{ diff --git a/sdks/go/test/integration/primitives/pardo.go b/sdks/go/test/integration/primitives/pardo.go index 2c2383ea90ba..dc59d8f67b80 100644 --- a/sdks/go/test/integration/primitives/pardo.go +++ b/sdks/go/test/integration/primitives/pardo.go @@ -18,6 +18,8 @@ package primitives import ( "flag" "fmt" + "sync/atomic" + "time" "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/register" @@ -32,6 +34,9 @@ func init() { register.Function3x2(asymJoinFn) register.Function5x0(splitByName) register.Function2x0(emitPipelineOptions) + register.DoFn2x0[beam.BundleFinalization, []byte]((*processElemBundleFinalizer)(nil)) + register.DoFn2x0[beam.BundleFinalization, []byte]((*finalizerInFinishBundle)(nil)) + register.DoFn2x0[beam.BundleFinalization, []byte]((*finalizerInAll)(nil)) register.Iter1[int]() register.Iter2[int, int]() @@ -192,3 +197,78 @@ func emitPipelineOptions(_ []byte, emit func(string)) { emit(fmt.Sprintf("%s: %s", "B", beam.PipelineOptions.Get("B"))) emit(fmt.Sprintf("%s: %s", "C", beam.PipelineOptions.Get("C"))) } + +var CountInvokeBundleFinalizer atomic.Int32 + +const ( + BundleFinalizerStart = 1 + BundleFinalizerProcess = 2 + BundleFinalizerFinish = 4 +) + +// ParDoProcessElementBundleFinalizer creates a beam.Pipeline with a beam.ParDo0 that processes a DoFn with a +// beam.BundleFinalization in its ProcessElement method. +func ParDoProcessElementBundleFinalizer(s beam.Scope) { + imp := beam.Impulse(s) + beam.ParDo0(s, &processElemBundleFinalizer{}, imp) +} + +type processElemBundleFinalizer struct { +} + +func (fn *processElemBundleFinalizer) ProcessElement(bf beam.BundleFinalization, _ []byte) { + bf.RegisterCallback(time.Second, func() error { + CountInvokeBundleFinalizer.Add(BundleFinalizerProcess) + return nil + }) +} + +// ParDoFinishBundleFinalizer creates a beam.Pipeline with a beam.ParDo0 that processes a DoFn containing a noop +// beam.BundleFinalization in its ProcessElement method and a beam.BundleFinalization in its FinishBundle method. +func ParDoFinishBundleFinalizer(s beam.Scope) { + imp := beam.Impulse(s) + beam.ParDo0(s, &finalizerInFinishBundle{}, imp) +} + +type finalizerInFinishBundle struct{} + +// ProcessElement requires beam.BundleFinalization in its method signature in order for FinishBundle's +// beam.BundleFinalization to be invoked. +func (fn *finalizerInFinishBundle) ProcessElement(_ beam.BundleFinalization, _ []byte) {} + +func (fn *finalizerInFinishBundle) FinishBundle(bf beam.BundleFinalization) { + bf.RegisterCallback(time.Second, func() error { + CountInvokeBundleFinalizer.Add(BundleFinalizerFinish) + return nil + }) +} + +// ParDoFinalizerInAll creates a beam.Pipeline with a beam.ParDo0 that processes a DoFn containing a beam.BundleFinalization +// in all three lifecycle methods StartBundle, ProcessElement, FinishBundle. 
+func ParDoFinalizerInAll(s beam.Scope) { + imp := beam.Impulse(s) + beam.ParDo0(s, &finalizerInAll{}, imp) +} + +type finalizerInAll struct{} + +func (fn *finalizerInAll) StartBundle(bf beam.BundleFinalization) { + bf.RegisterCallback(time.Second, func() error { + CountInvokeBundleFinalizer.Add(BundleFinalizerStart) + return nil + }) +} + +func (fn *finalizerInAll) ProcessElement(bf beam.BundleFinalization, _ []byte) { + bf.RegisterCallback(time.Second, func() error { + CountInvokeBundleFinalizer.Add(BundleFinalizerProcess) + return nil + }) +} + +func (fn *finalizerInAll) FinishBundle(bf beam.BundleFinalization) { + bf.RegisterCallback(time.Second, func() error { + CountInvokeBundleFinalizer.Add(BundleFinalizerFinish) + return nil + }) +} diff --git a/sdks/go/test/integration/primitives/pardo_test.go b/sdks/go/test/integration/primitives/pardo_test.go index d2ad57b350b3..aa6cb3de2008 100644 --- a/sdks/go/test/integration/primitives/pardo_test.go +++ b/sdks/go/test/integration/primitives/pardo_test.go @@ -18,6 +18,8 @@ package primitives import ( "testing" + "github.com/apache/beam/sdks/v2/go/pkg/beam" + "github.com/apache/beam/sdks/v2/go/pkg/beam/options/jobopts" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest" "github.com/apache/beam/sdks/v2/go/test/integration" ) @@ -46,3 +48,44 @@ func TestParDoPipelineOptions(t *testing.T) { integration.CheckFilters(t) ptest.RunAndValidate(t, ParDoPipelineOptions()) } + +func TestParDoBundleFinalizer(t *testing.T) { + integration.CheckFilters(t) + if !jobopts.IsLoopback() { + t.Skip("Only Loopback mode is supported") + } + for _, tt := range []struct { + name string + pipelineFn func(s beam.Scope) + want int32 + }{ + { + name: "InProcessElement", + pipelineFn: ParDoProcessElementBundleFinalizer, + want: BundleFinalizerProcess, + }, + { + name: "InFinishBundle", + pipelineFn: ParDoFinishBundleFinalizer, + want: BundleFinalizerFinish, + }, + { + name: "InStartProcessFinishBundle", + pipelineFn: ParDoFinalizerInAll, + want: BundleFinalizerStart + BundleFinalizerProcess + BundleFinalizerFinish, + }, + } { + t.Run(tt.name, func(t *testing.T) { + CountInvokeBundleFinalizer.Store(0) + p, s := beam.NewPipelineWithRoot() + tt.pipelineFn(s) + _, err := ptest.RunWithMetrics(p) + if err != nil { + t.Fatalf("Failed to execute job: %v", err) + } + if got := CountInvokeBundleFinalizer.Load(); got != tt.want { + t.Errorf("BundleFinalization RegisterCallback not invoked as expected via proxy counts, got: %v, want: %v", got, tt.want) + } + }) + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoder.java index 9121b60666aa..8fa46dbbd259 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoder.java @@ -25,6 +25,7 @@ import org.apache.beam.sdk.transforms.SerializableFunctions; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; /** A sub-class of SchemaCoder that can only encode {@link Row} instances. */ @@ -35,7 +36,12 @@ public static RowCoder of(Schema schema) { /** Override encoding positions for the given schema. 
*/ public static void overrideEncodingPositions(UUID uuid, Map encodingPositions) { - SchemaCoder.overrideEncodingPositions(uuid, encodingPositions); + RowCoderGenerator.overrideEncodingPositions(uuid, encodingPositions); + } + + @VisibleForTesting + static void clearGeneratedRowCoders() { + RowCoderGenerator.clearRowCoderCache(); } private RowCoder(Schema schema) { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoderGenerator.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoderGenerator.java index e3bd218945bf..7a1b16d7e91f 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoderGenerator.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/RowCoderGenerator.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.UUID; import javax.annotation.Nullable; +import javax.annotation.concurrent.GuardedBy; import net.bytebuddy.ByteBuddy; import net.bytebuddy.description.modifier.FieldManifestation; import net.bytebuddy.description.modifier.Ownership; @@ -53,10 +54,14 @@ import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.util.StringUtils; import org.apache.beam.sdk.util.common.ReflectHelpers; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A utility for automatically generating a {@link Coder} for {@link Row} objects corresponding to a @@ -109,21 +114,99 @@ public abstract class RowCoderGenerator { private static final String CODERS_FIELD_NAME = "FIELD_CODERS"; private static final String POSITIONS_FIELD_NAME = "FIELD_ENCODING_POSITIONS"; + static class WithStackTrace { + private final T value; + private final String stackTrace; + + public WithStackTrace(T value, String stackTrace) { + this.value = value; + this.stackTrace = stackTrace; + } + + public T getValue() { + return value; + } + + public String getStackTrace() { + return stackTrace; + } + } + // Cache for Coder class that are already generated. 
- private static final Map> GENERATED_CODERS = Maps.newConcurrentMap(); - private static final Map> ENCODING_POSITION_OVERRIDES = - Maps.newConcurrentMap(); + @GuardedBy("cacheLock") + private static final Map>> GENERATED_CODERS = Maps.newHashMap(); + + @GuardedBy("cacheLock") + private static final Map>> ENCODING_POSITION_OVERRIDES = + Maps.newHashMap(); + + private static final Object cacheLock = new Object(); + + private static final Logger LOG = LoggerFactory.getLogger(RowCoderGenerator.class); + + private static String getStackTrace() { + return StringUtils.arrayToNewlines(Thread.currentThread().getStackTrace(), 10); + } public static void overrideEncodingPositions(UUID uuid, Map encodingPositions) { - ENCODING_POSITION_OVERRIDES.put(uuid, encodingPositions); + final String stackTrace = getStackTrace(); + synchronized (cacheLock) { + @Nullable + WithStackTrace> previousEncodingPositions = + ENCODING_POSITION_OVERRIDES.put( + uuid, new WithStackTrace<>(encodingPositions, stackTrace)); + @Nullable WithStackTrace> existingCoder = GENERATED_CODERS.get(uuid); + if (previousEncodingPositions == null) { + if (existingCoder != null) { + LOG.error( + "Received encoding positions for uuid {} too late after creating RowCoder. Created: {}\n Override: {}", + uuid, + existingCoder.getStackTrace(), + stackTrace); + } else { + LOG.info("Received encoding positions {} for uuid {}.", encodingPositions, uuid); + } + } else if (!previousEncodingPositions.getValue().equals(encodingPositions)) { + if (existingCoder == null) { + LOG.error( + "Received differing encoding positions for uuid {} before coder creation. Was {} at {}\n Now {} at {}", + uuid, + previousEncodingPositions.getValue(), + encodingPositions, + previousEncodingPositions.getStackTrace(), + stackTrace); + } else { + LOG.error( + "Received differing encoding positions for uuid {} after coder creation at {}\n. " + + "Was {} at {}\n Now {} at {}\n", + uuid, + existingCoder.getStackTrace(), + previousEncodingPositions.getValue(), + encodingPositions, + previousEncodingPositions.getStackTrace(), + stackTrace); + } + } + } + } + + @VisibleForTesting + static void clearRowCoderCache() { + synchronized (cacheLock) { + GENERATED_CODERS.clear(); + } } @SuppressWarnings("unchecked") public static Coder generate(Schema schema) { - // Using ConcurrentHashMap::computeIfAbsent here would deadlock in case of nested - // coders. Using HashMap::computeIfAbsent generates ConcurrentModificationExceptions in Java 11. - Coder rowCoder = GENERATED_CODERS.get(schema.getUUID()); - if (rowCoder == null) { + String stackTrace = getStackTrace(); + UUID uuid = Preconditions.checkNotNull(schema.getUUID()); + // Avoid using computeIfAbsent which may cause issues with nested schemas. + synchronized (cacheLock) { + @Nullable WithStackTrace> existingRowCoder = GENERATED_CODERS.get(uuid); + if (existingRowCoder != null) { + return existingRowCoder.getValue(); + } TypeDescription.Generic coderType = TypeDescription.Generic.Builder.parameterizedType(Coder.class, Row.class).build(); DynamicType.Builder builder = @@ -131,8 +214,13 @@ public static Coder generate(Schema schema) { builder = implementMethods(schema, builder); int[] encodingPosToRowIndex = new int[schema.getFieldCount()]; + @Nullable + WithStackTrace> existingEncodingPositions = + ENCODING_POSITION_OVERRIDES.get(uuid); Map encodingPositions = - ENCODING_POSITION_OVERRIDES.getOrDefault(schema.getUUID(), schema.getEncodingPositions()); + existingEncodingPositions == null + ? 
schema.getEncodingPositions() + : existingEncodingPositions.getValue(); for (int recordIndex = 0; recordIndex < schema.getFieldCount(); ++recordIndex) { String name = schema.getField(recordIndex).getName(); int encodingPosition = encodingPositions.get(name); @@ -163,6 +251,7 @@ public static Coder generate(Schema schema) { .withParameters(Coder[].class, int[].class) .intercept(new GeneratedCoderConstructor()); + Coder rowCoder; try { rowCoder = builder @@ -179,9 +268,14 @@ public static Coder generate(Schema schema) { | InvocationTargetException e) { throw new RuntimeException("Unable to generate coder for schema " + schema, e); } - GENERATED_CODERS.put(schema.getUUID(), rowCoder); + GENERATED_CODERS.put(uuid, new WithStackTrace<>(rowCoder, stackTrace)); + LOG.debug( + "Created row coder for uuid {} with encoding positions {} at {}", + uuid, + encodingPositions, + stackTrace); + return rowCoder; } - return rowCoder; } private static class GeneratedCoderConstructor implements Implementation { @@ -326,7 +420,7 @@ static void encodeDelegate( } // Encode a bitmap for the null fields to save having to encode a bunch of nulls. - NULL_LIST_CODER.encode(scanNullFields(fieldValues), outputStream); + NULL_LIST_CODER.encode(scanNullFields(fieldValues, encodingPosToIndex), outputStream); for (int encodingPos = 0; encodingPos < fieldValues.length; ++encodingPos) { @Nullable Object fieldValue = fieldValues[encodingPosToIndex[encodingPos]]; if (fieldValue != null) { @@ -348,14 +442,15 @@ static void encodeDelegate( // Figure out which fields of the Row are null, and returns a BitSet. This allows us to save // on encoding each null field separately. - private static BitSet scanNullFields(Object[] fieldValues) { + private static BitSet scanNullFields(Object[] fieldValues, int[] encodingPosToIndex) { + Preconditions.checkState(fieldValues.length == encodingPosToIndex.length); BitSet nullFields = new BitSet(fieldValues.length); - for (int idx = 0; idx < fieldValues.length; ++idx) { - if (fieldValues[idx] == null) { - nullFields.set(idx); + for (int encodingPos = 0; encodingPos < encodingPosToIndex.length; ++encodingPos) { + int fieldIndex = encodingPosToIndex[encodingPos]; + if (fieldValues[fieldIndex] == null) { + nullFields.set(encodingPos); } } - return nullFields; } } @@ -425,7 +520,7 @@ static Row decodeDelegate( // in which case we drop the extra fields. 
if (encodingPos < coders.length) { int rowIndex = encodingPosToIndex[encodingPos]; - if (nullFields.get(rowIndex)) { + if (nullFields.get(encodingPos)) { fieldValues[rowIndex] = null; } else { Object fieldValue = coders[encodingPos].decode(inputStream); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Schema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Schema.java index 255d411028f9..5af59356b174 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Schema.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Schema.java @@ -41,6 +41,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.BiMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBiMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableBiMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; @@ -90,7 +91,12 @@ public String toString() { } } // A mapping between field names an indices. - private final BiMap fieldIndices = HashBiMap.create(); + private final BiMap fieldIndices; + + // Encoding positions can be used to maintain encoded byte compatibility between schemas with + // different field ordering or with added/removed fields. Such positions affect the encoding + // and decoding of Rows performed by RowCoderGenerator. They are stored within Schemas to + // facilitate plumbing to coders, display data etc but do not affect schema equality / uuid etc. private Map encodingPositions = Maps.newHashMap(); private boolean encodingPositionsOverridden = false; @@ -312,17 +318,20 @@ public Schema(List fields) { } public Schema(List fields, Options options) { - this.fields = fields; + this.fields = ImmutableList.copyOf(fields); int index = 0; - for (Field field : fields) { + BiMap fieldIndicesMutable = HashBiMap.create(); + for (Field field : this.fields) { Preconditions.checkArgument( - fieldIndices.get(field.getName()) == null, + fieldIndicesMutable.get(field.getName()) == null, "Duplicate field " + field.getName() + " added to schema"); encodingPositions.put(field.getName(), index); - fieldIndices.put(field.getName(), index++); + fieldIndicesMutable.put(field.getName(), index++); } - this.hashCode = Objects.hash(fieldIndices, fields); + this.fieldIndices = ImmutableBiMap.copyOf(fieldIndicesMutable); this.options = options; + this.hashCode = Objects.hash(this.fieldIndices, this.fields, this.options); + this.uuid = UUID.randomUUID(); } public static Schema of(Field... fields) { @@ -334,29 +343,24 @@ public static Schema of(Field... fields) { * fields. */ public Schema sorted() { - // Create a new schema and copy over the appropriate Schema object attributes: - // {fields, uuid, options} - // Note: encoding positions are not copied over because generally they should align with the - // ordering of field indices. Otherwise, problems may occur when encoding/decoding Rows of - // this schema. 
- Schema sortedSchema = - this.fields.stream() - .sorted(Comparator.comparing(Field::getName)) - .map( - field -> { - FieldType innerType = field.getType(); - if (innerType.getRowSchema() != null) { - Schema innerSortedSchema = innerType.getRowSchema().sorted(); - innerType = innerType.toBuilder().setRowSchema(innerSortedSchema).build(); - return field.toBuilder().setType(innerType).build(); - } - return field; - }) - .collect(Schema.toSchema()) - .withOptions(getOptions()); - sortedSchema.setUUID(getUUID()); - - return sortedSchema; + // Create a new schema and copy over the appropriate Schema object attributes: {fields, options} + // Note: uuid is not copied as the Schema field ordering is changed. encoding positions are not + // copied over because generally they should align with the ordering of field indices. + // Otherwise, problems may occur when encoding/decoding Rows of this schema. + return this.fields.stream() + .sorted(Comparator.comparing(Field::getName)) + .map( + field -> { + FieldType innerType = field.getType(); + if (innerType.getRowSchema() != null) { + Schema innerSortedSchema = innerType.getRowSchema().sorted(); + innerType = innerType.toBuilder().setRowSchema(innerSortedSchema).build(); + return field.toBuilder().setType(innerType).build(); + } + return field; + }) + .collect(Schema.toSchema()) + .withOptions(getOptions()); } /** Returns a copy of the Schema with the options set. */ @@ -405,11 +409,14 @@ public boolean equals(@Nullable Object o) { return false; } Schema other = (Schema) o; - // If both schemas have a UUID set, we can simply compare the UUIDs. - if (uuid != null && other.uuid != null) { - if (Objects.equals(uuid, other.uuid)) { - return true; - } + // If both schemas have a UUID set, we can short-circuit deep comparison if the + // UUIDs are equal. + if (uuid != null && other.uuid != null && Objects.equals(uuid, other.uuid)) { + return true; + } + // Utilize hash-code pre-calculation for cheap negative comparison. + if (this.hashCode != other.hashCode) { + return false; } return Objects.equals(fieldIndices, other.fieldIndices) && Objects.equals(getFields(), other.getFields()) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaCoder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaCoder.java index 323f4e98dc55..b93b64f7dbe8 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaCoder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaCoder.java @@ -164,7 +164,10 @@ public String toString() { } // Sets the schema id, and then recursively ensures that all schemas have ids set. 
- private static void setSchemaIds(Schema schema) { + private static void setSchemaIds(@Nullable Schema schema) { + if (schema == null) { + return; + } if (schema.getUUID() == null) { schema.setUUID(UUID.randomUUID()); } @@ -187,7 +190,7 @@ private static void setSchemaIds(FieldType fieldType) { return; case ARRAY: - case ITERABLE:; + case ITERABLE: setSchemaIds(fieldType.getCollectionElementType()); return; diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java index 1d3f3348f1ed..5253f82d15b9 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java @@ -115,7 +115,12 @@ private static String getLogicalTypeUrn(String identifier) { .build(); public static SchemaApi.Schema schemaToProto(Schema schema, boolean serializeLogicalType) { - String uuid = schema.getUUID() != null ? schema.getUUID().toString() : ""; + return schemaToProto(schema, serializeLogicalType, true); + } + + public static SchemaApi.Schema schemaToProto( + Schema schema, boolean serializeLogicalType, boolean serializeUUID) { + String uuid = schema.getUUID() != null && serializeUUID ? schema.getUUID().toString() : ""; SchemaApi.Schema.Builder builder = SchemaApi.Schema.newBuilder().setId(uuid); for (Field field : schema.getFields()) { SchemaApi.Field protoField = @@ -123,7 +128,8 @@ public static SchemaApi.Schema schemaToProto(Schema schema, boolean serializeLog field, schema.indexOf(field.getName()), schema.getEncodingPositions().get(field.getName()), - serializeLogicalType); + serializeLogicalType, + serializeUUID); builder.addFields(protoField); } builder.addAllOptions(optionsToProto(schema.getOptions())); @@ -131,11 +137,11 @@ public static SchemaApi.Schema schemaToProto(Schema schema, boolean serializeLog } private static SchemaApi.Field fieldToProto( - Field field, int fieldId, int position, boolean serializeLogicalType) { + Field field, int fieldId, int position, boolean serializeLogicalType, boolean serializeUUID) { return SchemaApi.Field.newBuilder() .setName(field.getName()) .setDescription(field.getDescription()) - .setType(fieldTypeToProto(field.getType(), serializeLogicalType)) + .setType(fieldTypeToProto(field.getType(), serializeLogicalType, serializeUUID)) .setId(fieldId) .setEncodingPosition(position) .addAllOptions(optionsToProto(field.getOptions())) @@ -143,34 +149,46 @@ private static SchemaApi.Field fieldToProto( } @VisibleForTesting - static SchemaApi.FieldType fieldTypeToProto(FieldType fieldType, boolean serializeLogicalType) { + static SchemaApi.FieldType fieldTypeToProto( + FieldType fieldType, boolean serializeLogicalType, boolean serializeUUID) { SchemaApi.FieldType.Builder builder = SchemaApi.FieldType.newBuilder(); switch (fieldType.getTypeName()) { case ROW: builder.setRowType( SchemaApi.RowType.newBuilder() - .setSchema(schemaToProto(fieldType.getRowSchema(), serializeLogicalType))); + .setSchema( + schemaToProto(fieldType.getRowSchema(), serializeLogicalType, serializeUUID))); break; case ARRAY: builder.setArrayType( SchemaApi.ArrayType.newBuilder() .setElementType( - fieldTypeToProto(fieldType.getCollectionElementType(), serializeLogicalType))); + fieldTypeToProto( + fieldType.getCollectionElementType(), + serializeLogicalType, + serializeUUID))); break; case ITERABLE: builder.setIterableType( SchemaApi.IterableType.newBuilder() .setElementType( - 
fieldTypeToProto(fieldType.getCollectionElementType(), serializeLogicalType))); + fieldTypeToProto( + fieldType.getCollectionElementType(), + serializeLogicalType, + serializeUUID))); break; case MAP: builder.setMapType( SchemaApi.MapType.newBuilder() - .setKeyType(fieldTypeToProto(fieldType.getMapKeyType(), serializeLogicalType)) - .setValueType(fieldTypeToProto(fieldType.getMapValueType(), serializeLogicalType)) + .setKeyType( + fieldTypeToProto( + fieldType.getMapKeyType(), serializeLogicalType, serializeUUID)) + .setValueType( + fieldTypeToProto( + fieldType.getMapValueType(), serializeLogicalType, serializeUUID)) .build()); break; @@ -186,12 +204,14 @@ static SchemaApi.FieldType fieldTypeToProto(FieldType fieldType, boolean seriali .setUrn(logicalType.getIdentifier()) .setPayload(ByteString.copyFrom(((UnknownLogicalType) logicalType).getPayload())) .setRepresentation( - fieldTypeToProto(logicalType.getBaseType(), serializeLogicalType)); + fieldTypeToProto( + logicalType.getBaseType(), serializeLogicalType, serializeUUID)); if (logicalType.getArgumentType() != null) { logicalTypeBuilder .setArgumentType( - fieldTypeToProto(logicalType.getArgumentType(), serializeLogicalType)) + fieldTypeToProto( + logicalType.getArgumentType(), serializeLogicalType, serializeUUID)) .setArgument( fieldValueToProto(logicalType.getArgumentType(), logicalType.getArgument())); } @@ -200,13 +220,15 @@ static SchemaApi.FieldType fieldTypeToProto(FieldType fieldType, boolean seriali logicalTypeBuilder = SchemaApi.LogicalType.newBuilder() .setRepresentation( - fieldTypeToProto(logicalType.getBaseType(), serializeLogicalType)) + fieldTypeToProto( + logicalType.getBaseType(), serializeLogicalType, serializeUUID)) .setUrn(urn); if (logicalType.getArgumentType() != null) { logicalTypeBuilder = logicalTypeBuilder .setArgumentType( - fieldTypeToProto(logicalType.getArgumentType(), serializeLogicalType)) + fieldTypeToProto( + logicalType.getArgumentType(), serializeLogicalType, serializeUUID)) .setArgument( fieldValueToProto( logicalType.getArgumentType(), logicalType.getArgument())); @@ -226,7 +248,8 @@ static SchemaApi.FieldType fieldTypeToProto(FieldType fieldType, boolean seriali builder.setLogicalType( SchemaApi.LogicalType.newBuilder() .setUrn(URN_BEAM_LOGICAL_MILLIS_INSTANT) - .setRepresentation(fieldTypeToProto(FieldType.INT64, serializeLogicalType)) + .setRepresentation( + fieldTypeToProto(FieldType.INT64, serializeLogicalType, serializeUUID)) .build()); break; case DECIMAL: @@ -235,7 +258,8 @@ static SchemaApi.FieldType fieldTypeToProto(FieldType fieldType, boolean seriali builder.setLogicalType( SchemaApi.LogicalType.newBuilder() .setUrn(URN_BEAM_LOGICAL_DECIMAL) - .setRepresentation(fieldTypeToProto(FieldType.BYTES, serializeLogicalType)) + .setRepresentation( + fieldTypeToProto(FieldType.BYTES, serializeLogicalType, serializeUUID)) .build()); break; case BYTE: @@ -288,14 +312,14 @@ public static Schema schemaFromProto(SchemaApi.Schema protoSchema) { Schema schema = builder.build(); Preconditions.checkState(encodingLocationMap.size() == schema.getFieldCount()); - long dinstictEncodingPositions = encodingLocationMap.values().stream().distinct().count(); - Preconditions.checkState(dinstictEncodingPositions <= schema.getFieldCount()); - if (dinstictEncodingPositions < schema.getFieldCount() && schema.getFieldCount() > 0) { + long distinctEncodingPositions = encodingLocationMap.values().stream().distinct().count(); + Preconditions.checkState(distinctEncodingPositions <= schema.getFieldCount()); + if 
(distinctEncodingPositions < schema.getFieldCount() && schema.getFieldCount() > 0) { // This means that encoding positions were not specified in the proto. Generally, we don't // expect this to happen, // but if it does happen, we expect none to be specified - in which case the should all be // zero. - Preconditions.checkState(dinstictEncodingPositions == 1); + Preconditions.checkState(distinctEncodingPositions == 1); } else if (protoSchema.getEncodingPositionsSet()) { schema.setEncodingPositions(encodingLocationMap); } @@ -771,7 +795,8 @@ private static List optionsToProto(Schema.Options options) { protoOptions.add( SchemaApi.Option.newBuilder() .setName(name) - .setType(fieldTypeToProto(Objects.requireNonNull(options.getType(name)), false)) + .setType( + fieldTypeToProto(Objects.requireNonNull(options.getType(name)), false, false)) .setValue( fieldValueToProto( Objects.requireNonNull(options.getType(name)), options.getValue(name))) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/OneOfType.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/OneOfType.java index 4a7573b036e2..31b6c8db2fed 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/OneOfType.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/OneOfType.java @@ -65,7 +65,8 @@ private OneOfType(List fields, @Nullable Map enumMap) { enumerationType = EnumerationType.create(enumValues); } oneOfSchema = Schema.builder().addFields(nullableFields).build(); - schemaProtoRepresentation = SchemaTranslation.schemaToProto(oneOfSchema, false).toByteArray(); + schemaProtoRepresentation = + SchemaTranslation.schemaToProto(oneOfSchema, false, false).toByteArray(); } /** Create an {@link OneOfType} logical type. */ diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java index 12cbecd04b02..9ad3141f9666 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java @@ -33,6 +33,7 @@ import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimators; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @@ -164,7 +165,9 @@ public void checkDone() throws IllegalStateException { @Override public IsBounded isBounded() { - return IsBounded.BOUNDED; + return range.getTo() == BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis() + ? 
IsBounded.UNBOUNDED + : IsBounded.BOUNDED; } @Override @@ -213,6 +216,13 @@ public RestrictionTracker.TruncateResult truncate() { return null; } + @GetSize + public double getSize( + @Element SequenceDefinition sequence, @Restriction OffsetRange offsetRange) { + long nowMilliSec = Instant.now().getMillis(); + return sequenceBacklogBytes(sequence.durationMilliSec, nowMilliSec, offsetRange); + } + @ProcessElement public ProcessContinuation processElement( @Element SequenceDefinition srcElement, @@ -257,4 +267,26 @@ public ProcessContinuation processElement( public PCollection expand(PCollection input) { return input.apply(ParDo.of(new PeriodicSequenceFn())); } + + private static final int ENCODED_INSTANT_BYTES = 8; + + private static long ceilDiv(long a, long b) { + long result = Math.floorDiv(a, b); + if (a % b != 0) { + ++result; + } + return result; + } + + @VisibleForTesting + static long sequenceBacklogBytes( + long durationMilliSec, long nowMilliSec, OffsetRange offsetRange) { + // Find the # of outputs expected for overlap of offsetRange and [-inf, now) + long start = ceilDiv(offsetRange.getFrom(), durationMilliSec); + long end = ceilDiv(Math.min(nowMilliSec, offsetRange.getTo() - 1), durationMilliSec); + if (start >= end) { + return 0; + } + return ENCODED_INSTANT_BYTES * (end - start); + } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowFilter.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowFilter.java new file mode 100644 index 000000000000..4e0d9d3ff30d --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowFilter.java @@ -0,0 +1,423 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.util; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** + * A utility that filters fields from Beam {@link Row}s. This filter can be configured to indicate + * what fields you would like to either keep or drop. You may also + * specify a singular {@link Row} field to extract with only. 
Afterward, call + * {@link #filter(Row)} on a Schema-compatible Row to filter it. An un-configured filter will simply + * return the input row untouched. + * + *

A configured {@link RowFilter} will naturally produce {@link Row}s with a new Beam {@link + * Schema}. You can access this new Schema via the filter's {@link #outputSchema()}. + * + *

Configure a {@link RowFilter} as follows: + * + *

{@code
+ * // this is an un-configured filter
+ * RowFilter unconfigured = new RowFilter(beamSchema);
+ *
+ * // this filter will exclusively keep these fields and drop everything else
+ * List<String> fields = Arrays.asList("foo", "bar", "baz");
+ * RowFilter keepingFilter = new RowFilter(beamSchema).keeping(fields);
+ *
+ * // this filter will drop these fields
+ * RowFilter droppingFilter = new RowFilter(beamSchema).dropping(fields);
+ *
+ * // this filter will only output the contents of row field "my_record"
+ * String field = "my_record";
+ * RowFilter onlyFilter = new RowFilter(beamSchema).only(field);
+ *
+ * // produces a filtered row
+ * Row outputRow = keepingFilter.filter(row);
+ * }
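Outside Javadoc, the new utility can be exercised roughly as follows. This is a sketch with made-up field names; RowFilter sits in the internal org.apache.beam.sdk.util package, so it is not presented here as a public API guarantee.

import java.util.Arrays;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.util.RowFilter;
import org.apache.beam.sdk.values.Row;

public class RowFilterSketch {
  public static void main(String[] args) {
    Schema schema =
        Schema.builder().addStringField("foo").addInt32Field("bar").addStringField("baz").build();
    Row row = Row.withSchema(schema).addValues("a", 1, "b").build();

    // An un-configured filter passes the Row through untouched.
    Row untouched = new RowFilter(schema).filter(row);

    // Keep only "foo" and "baz"; "bar" is dropped from both the Row and its Schema.
    RowFilter keepingFilter = new RowFilter(schema).keeping(Arrays.asList("foo", "baz"));
    Row filtered = keepingFilter.filter(row);

    System.out.println(untouched.equals(row)); // true
    System.out.println(keepingFilter.outputSchema().getFieldNames()); // [foo, baz] (order not guaranteed)
    String foo = filtered.getValue("foo");
    System.out.println(foo); // a
  }
}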
+ * + * Check the documentation for {@link #keeping(List)}, {@link #dropping(List)}, and {@link + * #only(String)} for further details on what an output Row can look like. + */ +public class RowFilter implements Serializable { + private final Schema rowSchema; + private @Nullable Schema transformedSchema; + // for 'only' case + private @Nullable String onlyField; + + public RowFilter(Schema rowSchema) { + this.rowSchema = rowSchema; + } + + /** + * Configures this {@link RowFilter} to filter {@link Row}s by keeping only the specified fields. + * Nested fields can be specified using dot-notation. + * + *

For example, if we want to keep the list of fields {@code ["foo", "baz"]}, for the input + * {@link Row}: + * + *

{@code
+   * foo: 123
+   * bar: 456
+   * baz:
+   *   nested_1: abc
+   *   nested_2: xyz
+   * }
+ * + * we will get the following output {@link Row}: + * + *
{@code
+   * foo: 123
+   * baz:
+   *   nested_1: abc
+   *   nested_2: xyz
+   * }
+ */ + public RowFilter keeping(List fields) { + checkUnconfigured(); + verifyNoNestedFields(fields, "keep"); + validateSchemaContainsFields(rowSchema, fields, "keep"); + transformedSchema = keepFields(rowSchema, fields); + return this; + } + + /** + * Configures this {@link RowFilter} to filter {@link Row} by removing the specified fields. + * Nested fields can be specified using dot-notation. + * + *

For example, if we want to drop the list of fields {@code ["foo", "baz"]}, for this input + * {@link Row}: + * + *

{@code
+   * foo: 123
+   * bar: 456
+   * baz:
+   *   nested_1: abc
+   *   nested_2: xyz
+   * }
+ * + * we will get the following output {@link Row}: + * + *
{@code
+   * bar: 456
+   * }
+ */ + public RowFilter dropping(List fields) { + checkUnconfigured(); + verifyNoNestedFields(fields, "drop"); + validateSchemaContainsFields(rowSchema, fields, "drop"); + transformedSchema = dropFields(rowSchema, fields); + return this; + } + + /** + * Configures this {@link RowFilter} to only output the contents of a single row field. + * + *

For example, if we want to only extract the contents of field "foo" for this input {@link + * Row}: + * + *

{@code
+   * abc: 123
+   * bar: my_str
+   * foo:
+   *   xyz:
+   *     baz: 456
+   *     qwe: 789
+   * }
+ * + * we will get the following output {@link Row}: + * + *
{@code
+   * xyz:
+   *   baz: 456
+   *   qwe: 789
+   * }
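A corresponding sketch for the only() path, again with hypothetical field names: the configured filter unwraps the single ROW-typed field, and outputSchema() reports the nested field's schema.

import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.util.RowFilter;
import org.apache.beam.sdk.values.Row;

public class RowFilterOnlySketch {
  public static void main(String[] args) {
    Schema inner = Schema.builder().addInt32Field("baz").addInt32Field("qwe").build();
    Schema outer = Schema.builder().addInt32Field("abc").addRowField("foo", inner).build();

    Row innerRow = Row.withSchema(inner).addValues(456, 789).build();
    Row outerRow = Row.withSchema(outer).addValues(123, innerRow).build();

    // only() unwraps a single ROW-typed field; the filter's output schema is the nested schema.
    RowFilter onlyFilter = new RowFilter(outer).only("foo");

    System.out.println(onlyFilter.outputSchema().equals(inner)); // true
    System.out.println(onlyFilter.filter(outerRow).equals(innerRow)); // true
  }
}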
+ * + *

Note that this will fail if the field is not of type {@link Row}, e.g. if {@code "abc"} is + * specified for the example above. + */ + public RowFilter only(String field) { + checkUnconfigured(); + validateSchemaContainsFields(rowSchema, Collections.singletonList(field), "only"); + Schema.Field rowField = rowSchema.getField(field); + Preconditions.checkArgument( + rowField.getType().getTypeName().equals(Schema.TypeName.ROW), + "Expected type '%s' for field '%s', but instead got type '%s'.", + Schema.TypeName.ROW, + rowField.getName(), + rowField.getType().getTypeName()); + + transformedSchema = rowField.getType().getRowSchema(); + onlyField = field; + return this; + } + + /** + * Performs a filter operation (keep or drop) on the input {@link Row}. Must have already + * configured a filter operation with {@link #dropping(List)} or {@link #keeping(List)} for this + * {@link RowFilter}. + * + *

If not yet configured, will simply return the same {@link Row}. + */ + public Row filter(Row row) { + if (transformedSchema == null) { + return row; + } + + Preconditions.checkState( + row.getSchema().assignableTo(rowSchema), + "Encountered Row with schema that is incompatible with this RowFilter's schema." + + "\nRow schema: %s" + + "\nSchema used to initialize this RowFilter: %s", + row.getSchema(), + rowSchema); + + // 'only' case + if (onlyField != null) { + return checkStateNotNull(row.getRow(onlyField)); + } + + // 'keep' and 'drop' + return Preconditions.checkNotNull(copyWithNewSchema(row, outputSchema())); + } + + /** Returns the output {@link Row}'s {@link Schema}. */ + public Schema outputSchema() { + return transformedSchema != null ? transformedSchema : rowSchema; + } + + private void checkUnconfigured() { + Preconditions.checkState( + transformedSchema == null, + "This RowFilter has already been configured to filter to the following Schema: %s", + transformedSchema); + } + + /** Verifies that this selection contains no nested fields. */ + private void verifyNoNestedFields(List fields, String operation) { + List nestedFields = new ArrayList<>(); + for (String field : fields) { + if (field.contains(".")) { + nestedFields.add(field); + } + } + if (!nestedFields.isEmpty()) { + throw new IllegalArgumentException( + String.format( + "RowFilter does not support specifying nested fields to %s: %s", + operation, nestedFields)); + } + } + + /** + * Checks whether a {@link Schema} contains a list of field names. Nested fields can be expressed + * with dot-notation. Throws a helpful error in the case where a field doesn't exist, or if a + * nested field could not be reached. + */ + @VisibleForTesting + static void validateSchemaContainsFields( + Schema schema, List specifiedFields, String operation) { + Set notFound = new HashSet<>(); + Set notRowField = new HashSet<>(); + + for (String field : specifiedFields) { + List levels = Splitter.on(".").splitToList(field); + + Schema currentSchema = schema; + + for (int i = 0; i < levels.size(); i++) { + String currentFieldName = String.join(".", levels.subList(0, i + 1)); + + if (!currentSchema.hasField(levels.get(i))) { + notFound.add(currentFieldName); + break; + } + + if (i + 1 < levels.size()) { + Schema.Field nextField = currentSchema.getField(levels.get(i)); + if (!nextField.getType().getTypeName().equals(Schema.TypeName.ROW)) { + notRowField.add(currentFieldName); + break; + } + currentSchema = Preconditions.checkNotNull(nextField.getType().getRowSchema()); + } + } + } + + if (!notFound.isEmpty() || !notRowField.isEmpty()) { + String message = "Validation failed for '" + operation + "'."; + if (!notFound.isEmpty()) { + message += "\nRow Schema does not contain the following specified fields: " + notFound; + } + if (!notRowField.isEmpty()) { + message += + "\nThe following specified fields are not of type Row. Their nested fields could not be reached: " + + notRowField; + } + throw new IllegalArgumentException(message); + } + } + + /** + * Creates a field tree, separating each top-level field from its (potential) nested fields. E.g. 
+ * ["foo.bar.baz", "foo.abc", "xyz"] --> {"foo": ["bar.baz", "abc"], "xyz": []} + */ + @VisibleForTesting + static Map> getFieldTree(List fields) { + Map> fieldTree = Maps.newHashMap(); + + for (String field : fields) { + List components = Splitter.on(".").splitToList(field); + String root = components.get(0); + fieldTree.computeIfAbsent(root, r -> new ArrayList<>()); + + if (components.size() > 1) { + String nestedFields = String.join(".", components.subList(1, components.size())); + Preconditions.checkNotNull(fieldTree.get(root)).add(nestedFields); + } + } + return fieldTree; + } + + /** + * Returns a new {@link Row} containing only the fields that intersect with the new {@link Schema} + * Relies on a previous step to have validated the compatibility of the new {@link Schema}. + */ + @VisibleForTesting + @Nullable + static Row copyWithNewSchema(@Nullable Row row, Schema newSchema) { + if (row == null) { + return null; + } + Map values = new HashMap<>(newSchema.getFieldCount()); + + for (Schema.Field field : newSchema.getFields()) { + String name = field.getName(); + Object value = row.getValue(name); + if (field.getType().getTypeName().equals(Schema.TypeName.ROW)) { + Schema nestedRowSchema = Preconditions.checkNotNull(field.getType().getRowSchema()); + value = copyWithNewSchema(row.getRow(name), nestedRowSchema); + } + if (value != null) { + values.put(name, value); + } + } + return Row.withSchema(newSchema).withFieldValues(values).build(); + } + + /** + * Returns a new {@link Schema} with the specified fields removed. + * + *

No guarantee that field ordering will remain the same. + */ + @VisibleForTesting + static Schema dropFields(Schema schema, List fieldsToDrop) { + if (fieldsToDrop.isEmpty()) { + return schema; + } + List newFieldsList = new ArrayList<>(schema.getFields()); + Map> fieldTree = getFieldTree(fieldsToDrop); + + for (Map.Entry> fieldAndDescendents : fieldTree.entrySet()) { + String root = fieldAndDescendents.getKey(); + List nestedFields = fieldAndDescendents.getValue(); + Schema.Field fieldToRemove = schema.getField(root); + Schema.FieldType typeToRemove = fieldToRemove.getType(); + + // Base case: we're at the specified field to remove. + if (nestedFields.isEmpty()) { + newFieldsList.remove(fieldToRemove); + } else { + // Otherwise, we're asked to remove a nested field. Verify current field is ROW type + Preconditions.checkArgument( + typeToRemove.getTypeName().equals(Schema.TypeName.ROW), + "Expected type %s for specified nested field '%s', but instead got type %s.", + Schema.TypeName.ROW, + root, + typeToRemove.getTypeName()); + + Schema nestedSchema = Preconditions.checkNotNull(typeToRemove.getRowSchema()); + Schema newNestedSchema = dropFields(nestedSchema, nestedFields); + Schema.Field modifiedField = + Schema.Field.of(root, Schema.FieldType.row(newNestedSchema)) + .withNullable(typeToRemove.getNullable()); + + // Replace with modified field + newFieldsList.set(newFieldsList.indexOf(fieldToRemove), modifiedField); + } + } + return new Schema(newFieldsList); + } + + /** + * Returns a new {@link Schema} with only the specified fields kept. + * + *

No guarantee that field ordering will remain the same. + */ + @VisibleForTesting + static Schema keepFields(Schema schema, List fieldsToKeep) { + if (fieldsToKeep.isEmpty()) { + return schema; + } + List newFieldsList = new ArrayList<>(fieldsToKeep.size()); + Map> fieldTree = getFieldTree(fieldsToKeep); + + for (Map.Entry> fieldAndDescendents : fieldTree.entrySet()) { + String root = fieldAndDescendents.getKey(); + List nestedFields = fieldAndDescendents.getValue(); + Schema.Field fieldToKeep = schema.getField(root); + Schema.FieldType typeToKeep = fieldToKeep.getType(); + + // Base case: we're at the specified field to keep, and we can skip this conditional. + // Otherwise: we're asked to keep a nested field, so we dig deeper to determine which nested + // fields to keep + if (!nestedFields.isEmpty()) { + Preconditions.checkArgument( + typeToKeep.getTypeName().equals(Schema.TypeName.ROW), + "Expected type %s for specified nested field '%s', but instead got type %s.", + Schema.TypeName.ROW, + root, + typeToKeep.getTypeName()); + + Schema nestedSchema = Preconditions.checkNotNull(typeToKeep.getRowSchema()); + Schema newNestedSchema = keepFields(nestedSchema, nestedFields); + fieldToKeep = + Schema.Field.of(root, Schema.FieldType.row(newNestedSchema)) + .withNullable(typeToKeep.getNullable()); + } + newFieldsList.add(fieldToKeep); + } + + return new Schema(newFieldsList); + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowStringInterpolator.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowStringInterpolator.java new file mode 100644 index 000000000000..1f095522dd8b --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/RowStringInterpolator.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.util; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Instant; + +/** + * A utility that interpolates values in a pre-determined {@link String} using an input Beam {@link + * Row}. 
+ * + *

The {@link RowStringInterpolator} looks for field names specified inside {curly braces}. For + * example, if the interpolator is configured with the String {@code "unified {foo} and streaming"}, + * it will look for a field name {@code "foo"} in the input {@link Row} and substitute in that + * value. If a {@link RowStringInterpolator} is configured with a template String that contains no + * placeholders (i.e. no curly braces), it will simply return that String, untouched. + * + *
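The placeholder scan itself is plain regex work. A standalone illustration of the lazy curly-brace pattern the class compiles (the extracted names are then checked against the Row schema via RowFilter.validateSchemaContainsFields):

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PlaceholderScanSketch {
  public static void main(String[] args) {
    // Same lazy curly-brace pattern the interpolator compiles internally.
    Pattern placeholder = Pattern.compile("\\{(.+?)}");

    Set<String> fields = new HashSet<>();
    Matcher m = placeholder.matcher("unified {foo} and {bar.baz}!");
    while (m.find()) {
      fields.add(m.group(1));
    }
    System.out.println(fields); // [bar.baz, foo] (set order not guaranteed)
  }
}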

Nested fields can be specified using dot-notation (e.g. {@code "top.middle.nested"}). + * + *

Configure a {@link RowStringInterpolator} like so: + * + *

{@code
+ * String template = "unified {foo} and {bar.baz}!";
+ * Row inputRow = {foo: "batch", bar: {baz: "streaming"}, ...};
+ *
+ * RowStringInterpolator interpolator = new RowStringInterpolator(template, beamSchema);
+ * String output = interpolator.interpolate(inputRow, window, paneInfo, timestamp);
+ * // output --> "unified batch and streaming!"
+ * }
+ * + *

Additionally, {@link #interpolate(Row, BoundedWindow, PaneInfo, Instant)} can be used in + * streaming scenarios to substitute windowing metadata into the template String. To make use of + * this, use the relevant placeholder: + * + *

    + *
  • $WINDOW: the window's string representation + *
  • $PANE_INDEX: the pane's index + *
  • $YYYY: the element timestamp's year + *
  • $MM: the element timestamp's month + *
  • $DD: the element timestamp's day + *
+ * + *

For example, your String template can look like: + * + *

{@code "unified {foo} and {bar} since {$YYYY}-{$MM}!"}
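Putting field and windowing placeholders together, usage looks roughly like this (a sketch with made-up names; the fixed UTC timestamp is only there to make the printed output deterministic):

import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.util.RowStringInterpolator;
import org.apache.beam.sdk.values.Row;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Instant;

public class InterpolatorSketch {
  public static void main(String[] args) {
    Schema schema = Schema.builder().addStringField("foo").build();
    Row row = Row.withSchema(schema).addValues("batch").build();

    // Mixes a Row-field placeholder with windowing-metadata placeholders.
    String template = "unified {foo} and streaming since {$YYYY}-{$MM}";
    RowStringInterpolator interpolator = new RowStringInterpolator(template, schema);

    Instant timestamp = new DateTime(2024, 8, 28, 12, 0, DateTimeZone.UTC).toInstant();
    String out =
        interpolator.interpolate(row, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING, timestamp);
    System.out.println(out); // unified batch and streaming since 2024-8
  }
}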
+ */ +public class RowStringInterpolator implements Serializable { + private final String template; + private final Set fieldsToReplace; + // Represents the string representation of the element's window + public static final String WINDOW = "$WINDOW"; + public static final String PANE_INDEX = "$PANE_INDEX"; + // Represents the element's pane index + public static final String YYYY = "$YYYY"; + public static final String MM = "$MM"; + public static final String DD = "$DD"; + private static final Set WINDOWING_METADATA = + Sets.newHashSet(WINDOW, PANE_INDEX, YYYY, MM, DD); + private static final Pattern TEMPLATE_PATTERN = Pattern.compile("\\{(.+?)}"); + + /** + * @param template a String template, potentially with placeholders in the form of curly braces, + * e.g. {@code "my {foo} template"}. During interpolation, these placeholders are replaced + * with values in the Beam Row. For more details and examples, refer to the top-level + * documentation. + * @param rowSchema {@link Row}s used for interpolation are expected to be compatible with this + * {@link Schema}. + */ + public RowStringInterpolator(String template, Schema rowSchema) { + this.template = template; + + Matcher m = TEMPLATE_PATTERN.matcher(template); + fieldsToReplace = new HashSet<>(); + while (m.find()) { + fieldsToReplace.add(checkStateNotNull(m.group(1))); + } + + List rowFields = + fieldsToReplace.stream() + .filter(f -> !WINDOWING_METADATA.contains(f)) + .collect(Collectors.toList()); + + RowFilter.validateSchemaContainsFields(rowSchema, rowFields, "string interpolation"); + } + + /** + * Performs string interpolation on the template using values from the input {@link Row} and its + * windowing metadata. + */ + public String interpolate(Row row, BoundedWindow window, PaneInfo paneInfo, Instant timestamp) { + String interpolated = this.template; + for (String field : fieldsToReplace) { + Object val; + switch (field) { + case WINDOW: + val = window.toString(); + break; + case PANE_INDEX: + val = paneInfo.getIndex(); + break; + case YYYY: + val = timestamp.getChronology().year().get(timestamp.getMillis()); + break; + case MM: + val = timestamp.getChronology().monthOfYear().get(timestamp.getMillis()); + break; + case DD: + val = timestamp.getChronology().dayOfMonth().get(timestamp.getMillis()); + break; + default: + val = MoreObjects.firstNonNull(getValue(row, field), ""); + break; + } + + interpolated = interpolated.replace("{" + field + "}", String.valueOf(val)); + } + return interpolated; + } + + private @Nullable Object getValue(@Nullable Row row, String fieldPath) { + if (row == null) { + return null; + } + int dotIndex = fieldPath.indexOf('.'); + String field = dotIndex == -1 ? 
fieldPath : fieldPath.substring(0, dotIndex); + Preconditions.checkArgument( + row.getSchema().hasField(field), "Invalid row does not contain field '%s'.", field); + + if (dotIndex == -1) { + return row.getValue(field); + } + return getValue(row.getRow(field), fieldPath.substring(dotIndex + 1)); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/RowCoderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/RowCoderTest.java index f62a2611a1cf..885ff8f1491a 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/RowCoderTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/RowCoderTest.java @@ -22,10 +22,12 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.UUID; import org.apache.beam.sdk.coders.Coder.NonDeterministicException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; @@ -37,6 +39,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.checkerframework.checker.nullness.qual.NonNull; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.junit.Assume; @@ -62,7 +65,7 @@ public void testPrimitiveTypes() throws Exception { .build(); DateTime dateTime = - new DateTime().withDate(1979, 03, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); + new DateTime().withDate(1979, 3, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); Row row = Row.withSchema(schema) .addValues( @@ -219,12 +222,14 @@ public FieldType getBaseType() { } @Override - public Value toBaseType(String input) { + @NonNull + public Value toBaseType(@NonNull String input) { return enumeration.valueOf(input); } @Override - public String toInputType(Value base) { + @NonNull + public String toInputType(@NonNull Value base) { return enumeration.toString(base); } } @@ -401,6 +406,129 @@ public void testEncodingPositionReorderFields() throws Exception { assertEquals(expected, decoded); } + @Test + public void testEncodingPositionReorderFieldsWithNulls() throws Exception { + Schema schema1 = + Schema.builder() + .addNullableField("f_int32", FieldType.INT32) + .addNullableField("f_string", FieldType.STRING) + .build(); + Schema schema2 = + Schema.builder() + .addNullableField("f_string", FieldType.STRING) + .addNullableField("f_int32", FieldType.INT32) + .build(); + schema2.setEncodingPositions(ImmutableMap.of("f_int32", 0, "f_string", 1)); + Row schema1row = + Row.withSchema(schema1) + .withFieldValue("f_int32", null) + .withFieldValue("f_string", "hello world!") + .build(); + + Row schema2row = + Row.withSchema(schema2) + .withFieldValue("f_int32", null) + .withFieldValue("f_string", "hello world!") + .build(); + + ByteArrayOutputStream os = new ByteArrayOutputStream(); + RowCoder.of(schema1).encode(schema1row, os); + Row schema1to2decoded = RowCoder.of(schema2).decode(new ByteArrayInputStream(os.toByteArray())); + assertEquals(schema2row, schema1to2decoded); + + os.reset(); + RowCoder.of(schema2).encode(schema2row, os); + Row schema2to1decoded = RowCoder.of(schema1).decode(new ByteArrayInputStream(os.toByteArray())); + assertEquals(schema1row, schema2to1decoded); 
+ } + + @Test + public void testEncodingPositionReorderViaStaticOverride() throws Exception { + Schema schema1 = + Schema.builder() + .addNullableField("failsafeTableRowPayload", FieldType.STRING) + .addByteArrayField("payload") + .addNullableField("timestamp", FieldType.INT32) + .addNullableField("unknownFieldsPayload", FieldType.STRING) + .build(); + UUID uuid = UUID.randomUUID(); + schema1.setUUID(uuid); + + Row row = + Row.withSchema(schema1) + .addValues("", "hello world!".getBytes(StandardCharsets.UTF_8), 1, "") + .build(); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + RowCoder.of(schema1).encode(row, os); + // Pretend that we are restarting and want to recover from persisted state with a compatible + // schema using the + // overridden encoding positions. + RowCoder.clearGeneratedRowCoders(); + RowCoder.overrideEncodingPositions( + uuid, + ImmutableMap.of( + "failsafeTableRowPayload", 0, "payload", 1, "timestamp", 2, "unknownFieldsPayload", 3)); + + Schema schema2 = + Schema.builder() + .addByteArrayField("payload") + .addNullableField("timestamp", FieldType.INT32) + .addNullableField("unknownFieldsPayload", FieldType.STRING) + .addNullableField("failsafeTableRowPayload", FieldType.STRING) + .build(); + schema2.setUUID(uuid); + + Row expected = + Row.withSchema(schema2) + .addValues("hello world!".getBytes(StandardCharsets.UTF_8), 1, "", "") + .build(); + Row decoded = RowCoder.of(schema2).decode(new ByteArrayInputStream(os.toByteArray())); + assertEquals(expected, decoded); + } + + @Test + public void testEncodingPositionReorderViaStaticOverrideWithNulls() throws Exception { + Schema schema1 = + Schema.builder() + .addNullableField("failsafeTableRowPayload", FieldType.BYTES) + .addByteArrayField("payload") + .addNullableField("timestamp", FieldType.INT32) + .addNullableField("unknownFieldsPayload", FieldType.BYTES) + .build(); + UUID uuid = UUID.randomUUID(); + schema1.setUUID(uuid); + + Row row = + Row.withSchema(schema1) + .addValues(null, "hello world!".getBytes(StandardCharsets.UTF_8), 1, null) + .build(); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + RowCoder.of(schema1).encode(row, os); + // Pretend that we are restarting and want to recover from persisted state with a compatible + // schema using the overridden encoding positions. 
+ RowCoder.clearGeneratedRowCoders(); + RowCoder.overrideEncodingPositions( + uuid, + ImmutableMap.of( + "failsafeTableRowPayload", 0, "payload", 1, "timestamp", 2, "unknownFieldsPayload", 3)); + + Schema schema2 = + Schema.builder() + .addByteArrayField("payload") + .addNullableField("timestamp", FieldType.INT32) + .addNullableField("unknownFieldsPayload", FieldType.BYTES) + .addNullableField("failsafeTableRowPayload", FieldType.BYTES) + .build(); + schema2.setUUID(uuid); + + Row expected = + Row.withSchema(schema2) + .addValues("hello world!".getBytes(StandardCharsets.UTF_8), 1, null, null) + .build(); + Row decoded = RowCoder.of(schema2).decode(new ByteArrayInputStream(os.toByteArray())); + assertEquals(expected, decoded); + } + @Test public void testEncodingPositionAddNewFields() throws Exception { Schema schema1 = diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java index 3b22addbf545..b082e2bb68ee 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java @@ -214,6 +214,7 @@ public void toAndFromProto() throws Exception { public static class FromProtoToProtoTest { @Parameters(name = "{index}: {0}") public static Iterable data() { + ImmutableList.Builder listBuilder = ImmutableList.builder(); SchemaApi.Schema.Builder builder = SchemaApi.Schema.newBuilder(); // A go 'int' builder.addFields( @@ -232,6 +233,9 @@ public static Iterable data() { .setId(0) .setEncodingPosition(0) .build()); + SchemaApi.Schema singleFieldSchema = builder.build(); + listBuilder.add(singleFieldSchema); + // A pickled python object builder.addFields( SchemaApi.Field.newBuilder() @@ -294,21 +298,51 @@ public static Iterable data() { .setId(2) .setEncodingPosition(2) .build()); - SchemaApi.Schema unknownLogicalTypeSchema = builder.build(); + SchemaApi.Schema multipleFieldSchema = builder.build(); + listBuilder.add(multipleFieldSchema); - return ImmutableList.builder().add(unknownLogicalTypeSchema).build(); + builder.clear(); + builder.addFields( + SchemaApi.Field.newBuilder() + .setName("nested") + .setType( + SchemaApi.FieldType.newBuilder() + .setRowType( + SchemaApi.RowType.newBuilder().setSchema(singleFieldSchema).build()) + .build()) + .build()); + SchemaApi.Schema nestedSchema = builder.build(); + listBuilder.add(nestedSchema); + + return listBuilder.build(); } @Parameter(0) public SchemaApi.Schema schemaProto; + private void clearIds(SchemaApi.Schema.Builder builder) { + builder.clearId(); + for (SchemaApi.Field.Builder field : builder.getFieldsBuilderList()) { + if (field.hasType() + && field.getType().hasRowType() + && field.getType().getRowType().hasSchema()) { + clearIds(field.getTypeBuilder().getRowTypeBuilder().getSchemaBuilder()); + } + } + } + @Test public void fromProtoAndToProto() throws Exception { Schema decodedSchema = SchemaTranslation.schemaFromProto(schemaProto); SchemaApi.Schema reencodedSchemaProto = SchemaTranslation.schemaToProto(decodedSchema, true); + SchemaApi.Schema.Builder builder = reencodedSchemaProto.toBuilder(); + clearIds(builder); + assertThat(builder.build(), equalTo(schemaProto)); - assertThat(reencodedSchemaProto, equalTo(schemaProto)); + SchemaApi.Schema reencodedSchemaProtoWithoutUUID = + SchemaTranslation.schemaToProto(decodedSchema, true, false); + assertThat(reencodedSchemaProtoWithoutUUID, equalTo(schemaProto)); } 
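The effect of the new serializeUUID flag can be seen in isolation with a sketch like the following (schema fields are illustrative): with the flag off, the proto's id field stays empty, which is what OneOfType now relies on for a byte-stable snapshot of its schema.

import java.util.UUID;
import org.apache.beam.model.pipeline.v1.SchemaApi;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.SchemaTranslation;

public class SchemaProtoUuidSketch {
  public static void main(String[] args) {
    Schema schema = Schema.builder().addStringField("name").addInt32Field("age").build();
    schema.setUUID(UUID.randomUUID());

    // Old behavior (serializeUUID = true): the UUID travels in the proto's id field.
    SchemaApi.Schema withUuid = SchemaTranslation.schemaToProto(schema, false, true);
    // New option (serializeUUID = false): the id field is left empty, giving a
    // UUID-independent representation.
    SchemaApi.Schema withoutUuid = SchemaTranslation.schemaToProto(schema, false, false);

    System.out.println(withUuid.getId().isEmpty()); // false
    System.out.println(withoutUuid.getId().isEmpty()); // true
  }
}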
} @@ -432,8 +466,8 @@ public static Iterable data() { public Schema.FieldType fieldType; @Test - public void testLogicalTypeSerializeDeserilizeCorrectly() { - SchemaApi.FieldType proto = SchemaTranslation.fieldTypeToProto(fieldType, true); + public void testLogicalTypeSerializeDeserializeCorrectly() { + SchemaApi.FieldType proto = SchemaTranslation.fieldTypeToProto(fieldType, true, false); Schema.FieldType translated = SchemaTranslation.fieldTypeFromProto(proto); assertThat( @@ -451,7 +485,7 @@ public void testLogicalTypeSerializeDeserilizeCorrectly() { @Test public void testLogicalTypeFromToProtoCorrectly() { - SchemaApi.FieldType proto = SchemaTranslation.fieldTypeToProto(fieldType, false); + SchemaApi.FieldType proto = SchemaTranslation.fieldTypeToProto(fieldType, false, false); Schema.FieldType translated = SchemaTranslation.fieldTypeFromProto(proto); if (STANDARD_LOGICAL_TYPES.containsKey(translated.getLogicalType().getIdentifier())) { diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java index fc264c8104c4..e1590408021a 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java @@ -76,6 +76,16 @@ public void testOneOf() { union = intOneOf.getLogicalTypeValue(0, OneOfType.Value.class); assertEquals("int32", oneOf.getCaseEnumType().toString(union.getCaseType())); assertEquals(42, (int) union.getValue()); + + // Validate schema equality. + OneOfType oneOf2 = + OneOfType.create(Field.of("string", FieldType.STRING), Field.of("int32", FieldType.INT32)); + assertEquals(oneOf.getOneOfSchema(), oneOf2.getOneOfSchema()); + Schema schema2 = Schema.builder().addLogicalTypeField("union", oneOf2).build(); + assertEquals(schema, schema2); + Row stringOneOf2 = + Row.withSchema(schema2).addValue(oneOf.createValue("string", "stringValue")).build(); + assertEquals(stringOneOf, stringOneOf2); } @Test diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/PeriodicSequenceTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/PeriodicSequenceTest.java index 3ace145eba88..541a70933870 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/PeriodicSequenceTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/PeriodicSequenceTest.java @@ -24,6 +24,7 @@ import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; +import org.apache.beam.sdk.io.range.OffsetRange; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -136,4 +137,25 @@ public void testOutputsProperElements() { p.run().waitUntilFinish(); } + + @Test + public void testBacklogBytes() { + assertEquals( + 0, PeriodicSequence.sequenceBacklogBytes(10, 100, new OffsetRange(100, Long.MAX_VALUE))); + assertEquals( + 8, PeriodicSequence.sequenceBacklogBytes(10, 100, new OffsetRange(90, Long.MAX_VALUE))); + assertEquals( + 0, PeriodicSequence.sequenceBacklogBytes(10, 100, new OffsetRange(91, Long.MAX_VALUE))); + assertEquals( + 8, PeriodicSequence.sequenceBacklogBytes(10, 100, new OffsetRange(89, Long.MAX_VALUE))); + assertEquals( + 16, PeriodicSequence.sequenceBacklogBytes(10, 101, new OffsetRange(81, Long.MAX_VALUE))); + assertEquals( + 8 * 10000 / 100, + 
PeriodicSequence.sequenceBacklogBytes(100, 10000, new OffsetRange(0, Long.MAX_VALUE))); + assertEquals( + 0, PeriodicSequence.sequenceBacklogBytes(10, 10000, new OffsetRange(10011, 10025))); + assertEquals( + 8, PeriodicSequence.sequenceBacklogBytes(10, 10100, new OffsetRange(10011, 10025))); + } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowFilterTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowFilterTest.java new file mode 100644 index 000000000000..22c17f6d07c9 --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowFilterTest.java @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.util; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +/** Tests for {@link RowFilter}. 
*/ +public class RowFilterTest { + @Rule public ExpectedException thrown = ExpectedException.none(); + + private static final Schema DOUBLY_NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("doubly_nested_str") + .addInt32Field("doubly_nested_int") + .build(); + + private static final Schema NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("nested_str") + .addInt32Field("nested_int") + .addFloatField("nested_float") + .addRowField("nested_row", DOUBLY_NESTED_ROW_SCHEMA) + .build(); + private static final Schema ROW_SCHEMA = + Schema.builder() + .addStringField("str") + .addBooleanField("bool") + .addNullableInt32Field("nullable_int") + .addArrayField("arr_int", Schema.FieldType.INT32) + .addRowField("row", NESTED_ROW_SCHEMA) + .addNullableRowField("nullable_row", NESTED_ROW_SCHEMA) + .build(); + + @Test + public void testSchemaValidation() { + List> goodFields = + Arrays.asList( + Arrays.asList("str", "bool", "nullable_row"), + Arrays.asList("nullable_int", "arr_int"), + Arrays.asList("row.nested_str", "row.nested_row.doubly_nested_str"), + Arrays.asList("nullable_row.nested_row.doubly_nested_int")); + + for (List fields : goodFields) { + RowFilter.validateSchemaContainsFields(ROW_SCHEMA, fields, "test-operation"); + } + } + + @Test + public void testSchemaValidationFailsWithHelpfulErrorForMissingFields() { + List, List>> nonExistentFields = + Arrays.asList( + KV.of( + Arrays.asList("nonexistent_1", "nonexistent_2", "nonexistent_3"), + Arrays.asList("nonexistent_1", "nonexistent_2", "nonexistent_3")), + KV.of( + Arrays.asList("nullable_int", "arr_int", "nonexistent"), + Collections.singletonList("nonexistent")), + KV.of( + Arrays.asList( + "nullable_row.nested_row.nonexistent", "row.nonexistent", "row.nested_float"), + Arrays.asList("nullable_row.nested_row.nonexistent", "row.nonexistent"))); + + for (KV, List> fields : nonExistentFields) { + List allFields = fields.getKey(); + List badFields = fields.getValue(); + + IllegalArgumentException e = + assertThrows( + IllegalArgumentException.class, + () -> + RowFilter.validateSchemaContainsFields(ROW_SCHEMA, allFields, "test-operation")); + + assertThat(e.getMessage(), containsString("Validation failed for 'test-operation'")); + assertThat( + e.getMessage(), + containsString("Row Schema does not contain the following specified fields")); + for (String badField : badFields) { + assertThat(e.getMessage(), containsString(badField)); + } + } + } + + @Test + public void testSchemaValidationFailsWithHelpfulErrorForInvalidNestedFields() { + List, List>> nonNestedFields = + Arrays.asList( + KV.of( + Arrays.asList( + "row.nested_row", "row.nested_int", "row.nested_str.unexpected_nested"), + Collections.singletonList("row.nested_str")), + KV.of( + Arrays.asList( + "nullable_row.nested_str", + "nullable_row.nested_str.unexpected", + "row.nested_int.unexpected_2"), + Arrays.asList("nullable_row.nested_str", "row.nested_int"))); + + for (KV, List> fields : nonNestedFields) { + List allFields = fields.getKey(); + List badFields = fields.getValue(); + + IllegalArgumentException e = + assertThrows( + IllegalArgumentException.class, + () -> + RowFilter.validateSchemaContainsFields(ROW_SCHEMA, allFields, "test-operation")); + + assertThat(e.getMessage(), containsString("Validation failed for 'test-operation'")); + assertThat( + e.getMessage(), + containsString( + "The following specified fields are not of type Row. 
Their nested fields could not be reached")); + for (String badField : badFields) { + assertThat(e.getMessage(), containsString(badField)); + } + } + } + + @Test + public void testGetFieldTree() { + List fields = + Arrays.asList( + "top-level", + "top-level-2", + "top-level.nested-level", + "top-level.nested-level-2", + "top-level.nested-level.doubly-nested-level", + "top-level.nested-level.doubly-nested-level-2"); + List nestedLayer = + Arrays.asList( + "nested-level", + "nested-level-2", + "nested-level.doubly-nested-level", + "nested-level.doubly-nested-level-2"); + + Map> expectedTree = + ImmutableMap.>builder() + .put("top-level-2", Collections.emptyList()) + .put("top-level", nestedLayer) + .build(); + + assertEquals(expectedTree, RowFilter.getFieldTree(fields)); + + List doublyNestedLayer = Arrays.asList("doubly-nested-level", "doubly-nested-level-2"); + + Map> expectedNestedTree = + ImmutableMap.>builder() + .put("nested-level-2", Collections.emptyList()) + .put("nested-level", doublyNestedLayer) + .build(); + + assertEquals(expectedNestedTree, RowFilter.getFieldTree(nestedLayer)); + } + + @Test + public void testDropSchemaFields() { + List fieldsToDrop = + Arrays.asList( + "str", + "arr_int", + "nullable_int", + "row.nested_int", + "row.nested_float", + "row.nested_row.doubly_nested_int", + "nullable_row.nested_str", + "nullable_row.nested_row"); + + Schema expectedDroppedSchema = + Schema.builder() + .addBooleanField("bool") + .addRowField( + "row", + Schema.builder() + .addStringField("nested_str") + .addRowField( + "nested_row", Schema.builder().addStringField("doubly_nested_str").build()) + .build()) + .addNullableRowField( + "nullable_row", + Schema.builder().addInt32Field("nested_int").addFloatField("nested_float").build()) + .build(); + + assertTrue(expectedDroppedSchema.equivalent(RowFilter.dropFields(ROW_SCHEMA, fieldsToDrop))); + } + + @Test + public void testKeepSchemaFields() { + List fieldsToKeep = + Arrays.asList( + "str", + "arr_int", + "nullable_int", + "row.nested_int", + "row.nested_float", + "row.nested_row.doubly_nested_int", + "nullable_row.nested_str", + "nullable_row.nested_row"); + + Schema expectedKeptSchema = + Schema.builder() + .addStringField("str") + .addArrayField("arr_int", Schema.FieldType.INT32) + .addNullableInt32Field("nullable_int") + .addRowField( + "row", + Schema.builder() + .addInt32Field("nested_int") + .addFloatField("nested_float") + .addRowField( + "nested_row", Schema.builder().addInt32Field("doubly_nested_int").build()) + .build()) + .addNullableRowField( + "nullable_row", + Schema.builder() + .addStringField("nested_str") + .addRowField("nested_row", DOUBLY_NESTED_ROW_SCHEMA) + .build()) + .build(); + + assertTrue(expectedKeptSchema.equivalent(RowFilter.keepFields(ROW_SCHEMA, fieldsToKeep))); + } + + @Test + public void testDropNestedFieldsFails() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("RowFilter does not support specifying nested fields to drop"); + + new RowFilter(ROW_SCHEMA) + .dropping( + Arrays.asList( + "bool", + "nullable_int", + "row.nested_int", + "row.nested_float", + "row.nested_row.doubly_nested_int", + "nullable_row")); + } + + @Test + public void testKeepNestedFieldsFails() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("RowFilter does not support specifying nested fields to keep"); + + new RowFilter(ROW_SCHEMA) + .keeping( + Arrays.asList("str", "arr_int", "row.nested_str", "row.nested_row.doubly_nested_str")); + } + + @Test + public void 
testOnlyFailsWhenSpecifyingNonRowField() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage( + "Expected type 'ROW' for field 'nullable_int', but instead got type 'INT32'"); + + new RowFilter(ROW_SCHEMA).only("nullable_int"); + } + + private static final Row ORIGINAL_ROW = + Row.withSchema(ROW_SCHEMA) + .addValue("str_value") + .addValue(true) + .addValue(123) + .addValue(Arrays.asList(1, 2, 3, 4, 5)) + .addValue( + Row.withSchema(NESTED_ROW_SCHEMA) + .addValue("nested_str_value") + .addValue(456) + .addValue(1.234f) + .addValue( + Row.withSchema(DOUBLY_NESTED_ROW_SCHEMA) + .addValue("doubly_nested_str_value") + .addValue(789) + .build()) + .build()) + .addValue(null) + .build(); + + private static final Schema FILTERED_DOUBLY_NESTED_SCHEMA = + Schema.builder().addStringField("doubly_nested_str").build(); + private static final Schema FILTERED_NESTED_SCHEMA = + Schema.builder() + .addStringField("nested_str") + .addRowField("nested_row", FILTERED_DOUBLY_NESTED_SCHEMA) + .build(); + private static final Schema FILTERED_SCHEMA = + Schema.builder() + .addStringField("str") + .addArrayField("arr_int", Schema.FieldType.INT32) + .addRowField("row", FILTERED_NESTED_SCHEMA) + .build(); + + private static final Row FILTERED_ROW = + Row.withSchema(FILTERED_SCHEMA) + .addValue("str_value") + .addValue(Arrays.asList(1, 2, 3, 4, 5)) + .addValue( + Row.withSchema(FILTERED_NESTED_SCHEMA) + .addValue("nested_str_value") + .addValue( + Row.withSchema(FILTERED_DOUBLY_NESTED_SCHEMA) + .addValue("doubly_nested_str_value") + .build()) + .build()) + .build(); + + @Test + public void testCopyRowWithNewSchema() { + assertEquals(FILTERED_ROW, RowFilter.copyWithNewSchema(ORIGINAL_ROW, FILTERED_SCHEMA)); + } + + @Test + public void testOnlyRowField() { + RowFilter rowFilter = new RowFilter(ROW_SCHEMA).only("row"); + + Row expecedRow = + Row.withSchema(rowFilter.outputSchema()) + .addValues(ORIGINAL_ROW.getRow("row").getValues()) + .build(); + + assertEquals(expecedRow, rowFilter.filter(ORIGINAL_ROW)); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowStringInterpolatorTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowStringInterpolatorTest.java new file mode 100644 index 000000000000..0b1295c38533 --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/RowStringInterpolatorTest.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.util; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.windowing.GlobalWindow; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; +import org.apache.beam.sdk.values.Row; +import org.joda.time.DateTime; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +/** Test class for {@link RowStringInterpolator}. */ +public class RowStringInterpolatorTest { + @Rule public ExpectedException thrown = ExpectedException.none(); + + private static final Schema DOUBLY_NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("doubly_nested_str") + .addInt32Field("doubly_nested_int") + .build(); + + private static final Schema NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("nested_str") + .addInt32Field("nested_int") + .addFloatField("nested_float") + .addRowField("nested_row", DOUBLY_NESTED_ROW_SCHEMA) + .build(); + private static final Schema ROW_SCHEMA = + Schema.builder() + .addStringField("str") + .addBooleanField("bool") + .addInt32Field("int") + .addNullableInt32Field("nullable_int") + .addArrayField("arr_int", Schema.FieldType.INT32) + .addRowField("row", NESTED_ROW_SCHEMA) + .addNullableRowField("nullable_row", NESTED_ROW_SCHEMA) + .build(); + + @Test + public void testInvalidRowThrowsHelpfulError() { + String template = "foo {str}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + Row invalidRow = Row.nullRow(Schema.builder().addNullableStringField("xyz").build()); + + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Invalid row does not contain field 'str'."); + + interpolator.interpolate(invalidRow, null, null, null); + } + + @Test + public void testInvalidRowThrowsHelpfulErrorForNestedFields() { + String template = "foo {row.nested_int}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + Schema nestedSchema = Schema.builder().addNullableStringField("xyz").build(); + Row invalidRow = + Row.withSchema(Schema.builder().addNullableRowField("row", nestedSchema).build()) + .addValue(Row.nullRow(nestedSchema)) + .build(); + + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Invalid row does not contain field 'nested_int'."); + + interpolator.interpolate(invalidRow, null, null, null); + } + + @Test + public void testInvalidRowThrowsHelpfulErrorForDoublyNestedFields() { + String template = "foo {row.nested_row.doubly_nested_int}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + Schema doublyNestedSchema = Schema.builder().addNullableStringField("xyz").build(); + Schema nestedSchema = + Schema.builder().addNullableRowField("nested_row", doublyNestedSchema).build(); + Row invalidRow = + Row.withSchema(Schema.builder().addNullableRowField("row", doublyNestedSchema).build()) + .addValue( + Row.withSchema(nestedSchema).addValue(Row.nullRow(doublyNestedSchema)).build()) + .build(); + + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Invalid row does not contain field 'doubly_nested_int'."); + + interpolator.interpolate(invalidRow, null, null, null); + } + + private static final Row ROW = + Row.withSchema(ROW_SCHEMA) + .addValue("str_value") + .addValue(true) + .addValue(123) + .addValue(null) + .addValue(Arrays.asList(1, 2, 3, 4, 5)) + .addValue( + Row.withSchema(NESTED_ROW_SCHEMA) + 
.addValue("nested_str_value") + .addValue(456) + .addValue(1.234f) + .addValue( + Row.withSchema(DOUBLY_NESTED_ROW_SCHEMA) + .addValue("doubly_nested_str_value") + .addValue(789) + .build()) + .build()) + .addValue(null) + .build(); + + @Test + public void testTopLevelInterpolation() { + String template = "foo {str}, bar {bool}, baz {int}, xyz {nullable_int}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + String output = interpolator.interpolate(ROW, null, null, null); + + assertEquals("foo str_value, bar true, baz 123, xyz ", output); + } + + @Test + public void testNestedLevelInterpolation() { + String template = "foo {str}, bar {row.nested_str}, baz {row.nested_float}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + String output = interpolator.interpolate(ROW, null, null, null); + + assertEquals("foo str_value, bar nested_str_value, baz 1.234", output); + } + + @Test + public void testDoublyNestedInterpolation() { + String template = + "foo {str}, bar {row.nested_row.doubly_nested_str}, baz {row.nested_row.doubly_nested_int}"; + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + String output = interpolator.interpolate(ROW, null, null, null); + + assertEquals("foo str_value, bar doubly_nested_str_value, baz 789", output); + } + + @Test + public void testInterpolateWindowingInformation() { + String template = + String.format( + "str: {str}, window: {%s}, pane: {%s}, year: {%s}, month: {%s}, day: {%s}", + RowStringInterpolator.WINDOW, + RowStringInterpolator.PANE_INDEX, + RowStringInterpolator.YYYY, + RowStringInterpolator.MM, + RowStringInterpolator.DD); + + RowStringInterpolator interpolator = new RowStringInterpolator(template, ROW_SCHEMA); + + Instant instant = new DateTime(2024, 8, 28, 12, 0).toInstant(); + + String output = + interpolator.interpolate( + ROW, + GlobalWindow.INSTANCE, + PaneInfo.createPane(false, false, PaneInfo.Timing.ON_TIME, 2, 0), + instant); + String expected = + String.format( + "str: str_value, window: %s, pane: 2, year: 2024, month: 8, day: 28", + GlobalWindow.INSTANCE); + + assertEquals(expected, output); + } +} diff --git a/sdks/java/expansion-service/build.gradle b/sdks/java/expansion-service/build.gradle index 2926bfad633f..4dd8c8968ed9 100644 --- a/sdks/java/expansion-service/build.gradle +++ b/sdks/java/expansion-service/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") implementation project(path: ":runners:java-fn-execution") implementation project(path: ":sdks:java:harness") + runtimeOnly library.java.jamm implementation library.java.snake_yaml permitUnusedDeclared project(path: ":model:fn-execution") implementation library.java.vendored_grpc_1_60_1 diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java index c23a771f3cc8..d5f1745a9a2c 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java @@ -311,7 +311,6 @@ Row buildOrGetKwargsRow() { Schema schema = generateSchemaFromFieldValues( kwargsMap.values().toArray(), kwargsMap.keySet().toArray(new String[] {})); - schema.setUUID(UUID.randomUUID()); return 
Row.withSchema(schema) .addValues(convertComplexTypesToRows(kwargsMap.values().toArray())) .build(); @@ -367,7 +366,6 @@ private Object[] convertComplexTypesToRows(@Nullable Object @NonNull [] values) @VisibleForTesting Row buildOrGetArgsRow() { Schema schema = generateSchemaFromFieldValues(argsArray, null); - schema.setUUID(UUID.randomUUID()); Object[] convertedValues = convertComplexTypesToRows(argsArray); return Row.withSchema(schema).addValues(convertedValues).build(); } @@ -421,7 +419,6 @@ ExternalTransforms.ExternalConfigurationPayload generatePayload() { schemaBuilder.addRowField("kwargs", kwargsRow.getSchema()); } Schema payloadSchema = schemaBuilder.build(); - payloadSchema.setUUID(UUID.randomUUID()); Row.Builder payloadRowBuilder = Row.withSchema(payloadSchema); payloadRowBuilder.addValue(fullyQualifiedName); if (argsRow.getValues().size() > 0) { diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index 498950b3dc47..6097e5f5a5a5 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -47,8 +47,7 @@ dependencies { // **** IcebergIO runtime dependencies **** runtimeOnly library.java.hadoop_client // Needed when using GCS as the warehouse location. - implementation library.java.bigdataoss_gcs_connector - permitUnusedDeclared library.java.bigdataoss_gcs_connector + runtimeOnly library.java.bigdataoss_gcs_connector // Needed for HiveCatalog runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index 9452d8db5bb3..3e322d976c1a 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -115,7 +115,6 @@ dependencies { implementation library.java.http_client implementation library.java.hamcrest implementation library.java.http_core - implementation library.java.jackson_annotations implementation library.java.jackson_core implementation library.java.jackson_databind implementation library.java.jackson_datatype_joda diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryAvroUtils.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryAvroUtils.java index bdee2eef570d..911d5b4d0aa3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryAvroUtils.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryAvroUtils.java @@ -39,8 +39,6 @@ import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.avro.Conversions; import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; @@ -176,41 +174,6 @@ private static String formatTime(long timeMicros) { return LocalTime.ofNanoOfDay(timeMicros * 1000).format(formatter); } - static TableSchema trimBigQueryTableSchema(TableSchema inputSchema, Schema avroSchema) { - List subSchemas = - inputSchema.getFields().stream() - .flatMap(fieldSchema -> mapTableFieldSchema(fieldSchema, avroSchema)) - .collect(Collectors.toList()); - - return new TableSchema().setFields(subSchemas); - } - - private static Stream mapTableFieldSchema( - TableFieldSchema fieldSchema, Schema avroSchema) { - Field 
avroFieldSchema = avroSchema.getField(fieldSchema.getName()); - if (avroFieldSchema == null) { - return Stream.empty(); - } else if (avroFieldSchema.schema().getType() != Type.RECORD) { - return Stream.of(fieldSchema); - } - - List subSchemas = - fieldSchema.getFields().stream() - .flatMap(subSchema -> mapTableFieldSchema(subSchema, avroFieldSchema.schema())) - .collect(Collectors.toList()); - - TableFieldSchema output = - new TableFieldSchema() - .setCategories(fieldSchema.getCategories()) - .setDescription(fieldSchema.getDescription()) - .setFields(subSchemas) - .setMode(fieldSchema.getMode()) - .setName(fieldSchema.getName()) - .setType(fieldSchema.getType()); - - return Stream.of(output); - } - /** * Utility function to convert from an Avro {@link GenericRecord} to a BigQuery {@link TableRow}. * diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java index 79a3249d6bc9..f3ade1948986 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java @@ -1275,8 +1275,12 @@ public PCollection expand(PBegin input) { Schema beamSchema = null; if (getTypeDescriptor() != null && getToBeamRowFn() != null && getFromBeamRowFn() != null) { - beamSchema = sourceDef.getBeamSchema(bqOptions); - beamSchema = getFinalSchema(beamSchema, getSelectedFields()); + TableSchema tableSchema = sourceDef.getTableSchema(bqOptions); + ValueProvider> selectedFields = getSelectedFields(); + if (selectedFields != null && selectedFields.isAccessible()) { + tableSchema = BigQueryUtils.trimSchema(tableSchema, selectedFields.get()); + } + beamSchema = BigQueryUtils.fromTableSchema(tableSchema); } final Coder coder = inferCoder(p.getCoderRegistry()); @@ -1441,24 +1445,6 @@ void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception { return rows; } - private static Schema getFinalSchema( - Schema beamSchema, ValueProvider> selectedFields) { - List flds = - beamSchema.getFields().stream() - .filter( - field -> { - if (selectedFields != null - && selectedFields.isAccessible() - && selectedFields.get() != null) { - return selectedFields.get().contains(field.getName()); - } else { - return true; - } - }) - .collect(Collectors.toList()); - return Schema.builder().addFields(flds).build(); - } - private PCollection expandForDirectRead( PBegin input, Coder outputCoder, Schema beamSchema, BigQueryOptions bqOptions) { ValueProvider tableProvider = getTableProvider(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java index a2b506408ee0..5ddede38aa78 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java @@ -17,7 +17,6 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.util.Map; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.options.ApplicationNameOptions; @@ -225,7 +224,6 @@ public interface BigQueryOptions void setJobLabelsMap(Map value); /** BQ endpoint to 
use. If unspecified, uses the default endpoint. */ - @JsonIgnore @Hidden @Description("The URL for the BigQuery API.") String getBigQueryEndpoint(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryQuerySourceDef.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryQuerySourceDef.java index b4035a4e9ac3..25f274d708b5 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryQuerySourceDef.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryQuerySourceDef.java @@ -31,7 +31,6 @@ import org.apache.beam.sdk.extensions.avro.io.AvroSource; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.JobType; import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.SerializableFunction; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; @@ -178,7 +177,7 @@ public BigQuerySourceBase toSource( /** {@inheritDoc} */ @Override - public Schema getBeamSchema(BigQueryOptions bqOptions) { + public TableSchema getTableSchema(BigQueryOptions bqOptions) { try { JobStatistics stats = BigQueryQueryHelper.dryRunQueryIfNeeded( @@ -189,8 +188,7 @@ public Schema getBeamSchema(BigQueryOptions bqOptions) { flattenResults, useLegacySql, location); - TableSchema tableSchema = stats.getQuery().getSchema(); - return BigQueryUtils.fromTableSchema(tableSchema); + return stats.getQuery().getSchema(); } catch (IOException | InterruptedException | NullPointerException e) { throw new BigQuerySchemaRetrievalException( "Exception while trying to retrieve schema of query", e); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceDef.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceDef.java index c9b1d5f73224..a9c4c5af283c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceDef.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceDef.java @@ -21,7 +21,6 @@ import java.io.Serializable; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.extensions.avro.io.AvroSource; -import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.SerializableFunction; /** @@ -46,11 +45,11 @@ BigQuerySourceBase toSource( boolean useAvroLogicalTypes); /** - * Extract the Beam {@link Schema} corresponding to this source. + * Extract the {@link TableSchema} corresponding to this source. 
* * @param bqOptions BigQueryOptions - * @return Beam schema of the source + * @return table schema of the source * @throws BigQuerySchemaRetrievalException if schema retrieval fails */ - Schema getBeamSchema(BigQueryOptions bqOptions); + TableSchema getTableSchema(BigQueryOptions bqOptions); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageSourceBase.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageSourceBase.java index 51a5a8f391a6..d0bc655b311a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageSourceBase.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageSourceBase.java @@ -28,10 +28,7 @@ import com.google.cloud.bigquery.storage.v1.ReadStream; import java.io.IOException; import java.util.List; -import org.apache.avro.Schema; import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.extensions.arrow.ArrowConversion; -import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; import org.apache.beam.sdk.io.BoundedSource; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StorageClient; import org.apache.beam.sdk.metrics.Lineage; @@ -126,17 +123,16 @@ public List> split( } } - if (selectedFieldsProvider != null || rowRestrictionProvider != null) { - ReadSession.TableReadOptions.Builder tableReadOptionsBuilder = - ReadSession.TableReadOptions.newBuilder(); - if (selectedFieldsProvider != null) { - tableReadOptionsBuilder.addAllSelectedFields(selectedFieldsProvider.get()); - } - if (rowRestrictionProvider != null) { - tableReadOptionsBuilder.setRowRestriction(rowRestrictionProvider.get()); - } - readSessionBuilder.setReadOptions(tableReadOptionsBuilder); + ReadSession.TableReadOptions.Builder tableReadOptionsBuilder = + ReadSession.TableReadOptions.newBuilder(); + if (selectedFieldsProvider != null && selectedFieldsProvider.isAccessible()) { + tableReadOptionsBuilder.addAllSelectedFields(selectedFieldsProvider.get()); + } + if (rowRestrictionProvider != null && rowRestrictionProvider.isAccessible()) { + tableReadOptionsBuilder.setRowRestriction(rowRestrictionProvider.get()); } + readSessionBuilder.setReadOptions(tableReadOptionsBuilder); + if (format != null) { readSessionBuilder.setDataFormat(format); } @@ -182,30 +178,18 @@ public List> split( LOG.info("Read session returned {} streams", readSession.getStreamsList().size()); } - Schema sessionSchema; - if (readSession.getDataFormat() == DataFormat.ARROW) { - org.apache.arrow.vector.types.pojo.Schema schema = - ArrowConversion.arrowSchemaFromInput( - readSession.getArrowSchema().getSerializedSchema().newInput()); - org.apache.beam.sdk.schemas.Schema beamSchema = - ArrowConversion.ArrowSchemaTranslator.toBeamSchema(schema); - sessionSchema = AvroUtils.toAvroSchema(beamSchema); - } else if (readSession.getDataFormat() == DataFormat.AVRO) { - sessionSchema = new Schema.Parser().parse(readSession.getAvroSchema().getSchema()); - } else { - throw new IllegalArgumentException( - "data is not in a supported dataFormat: " + readSession.getDataFormat()); + // TODO: this is inconsistent with method above, where it can be null + Preconditions.checkStateNotNull(targetTable); + TableSchema tableSchema = targetTable.getSchema(); + if (selectedFieldsProvider != null && selectedFieldsProvider.isAccessible()) { + tableSchema = BigQueryUtils.trimSchema(tableSchema, 
selectedFieldsProvider.get()); } - Preconditions.checkStateNotNull( - targetTable); // TODO: this is inconsistent with method above, where it can be null - TableSchema trimmedSchema = - BigQueryAvroUtils.trimBigQueryTableSchema(targetTable.getSchema(), sessionSchema); List> sources = Lists.newArrayList(); for (ReadStream readStream : readSession.getStreamsList()) { sources.add( BigQueryStorageStreamSource.create( - readSession, readStream, trimmedSchema, parseFn, outputCoder, bqServices)); + readSession, readStream, tableSchema, parseFn, outputCoder, bqServices)); } return ImmutableList.copyOf(sources); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSourceDef.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSourceDef.java index b399900f9a24..a7299c6992fe 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSourceDef.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSourceDef.java @@ -28,7 +28,6 @@ import org.apache.beam.sdk.extensions.avro.io.AvroSource; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService; import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; @@ -102,13 +101,12 @@ public BigQuerySourceBase toSource( /** {@inheritDoc} */ @Override - public Schema getBeamSchema(BigQueryOptions bqOptions) { + public TableSchema getTableSchema(BigQueryOptions bqOptions) { try { try (DatasetService datasetService = bqServices.getDatasetService(bqOptions)) { TableReference tableRef = getTableReference(bqOptions); Table table = datasetService.getTable(tableRef); - TableSchema tableSchema = Preconditions.checkStateNotNull(table).getSchema(); - return BigQueryUtils.fromTableSchema(tableSchema); + return Preconditions.checkStateNotNull(table).getSchema(); } } catch (Exception e) { throw new BigQuerySchemaRetrievalException("Exception while trying to retrieve schema", e); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java index 305abad5783a..f2f997bdbfa9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java @@ -43,7 +43,9 @@ import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import java.util.stream.IntStream; +import java.util.stream.Stream; import org.apache.avro.Conversions; import org.apache.avro.LogicalTypes; import org.apache.avro.generic.GenericData; @@ -1039,6 +1041,48 @@ private static Object convertAvroNumeric(Object value) { return tableSpec; } + static TableSchema trimSchema(TableSchema schema, @Nullable List selectedFields) { + if (selectedFields == null || selectedFields.isEmpty()) { + return schema; + } + + List trimmedFields = + schema.getFields().stream() + .flatMap(f -> trimField(f, selectedFields)) + .collect(Collectors.toList()); + return new TableSchema().setFields(trimmedFields); 
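To make the intent of the new trimSchema/trimField helpers concrete, here is a small behavioural sketch. The test method and field names are hypothetical; the call is package-private, so it would live next to BigQueryUtilsTest (which exercises the same path below), and TableSchema/TableFieldSchema are the com.google.api.services.bigquery.model classes already used there:

```java
@Test
public void trimsNestedSelectionSketch() {
  TableFieldSchema id = new TableFieldSchema().setName("id").setType("INT64");
  TableFieldSchema name = new TableFieldSchema().setName("name").setType("STRING");
  TableFieldSchema row =
      new TableFieldSchema().setName("row").setType("STRUCT").setFields(Arrays.asList(id, name));
  TableSchema schema = new TableSchema().setFields(Arrays.asList(id, row));

  // Selecting "row.name" keeps the "row" STRUCT but prunes it to its "name" child;
  // the unselected top-level "id" is dropped entirely.
  TableSchema trimmed = BigQueryUtils.trimSchema(schema, Arrays.asList("row.name"));
  assertEquals(1, trimmed.getFields().size());
  assertEquals("name", trimmed.getFields().get(0).getFields().get(0).getName());
}
```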
+ } + + private static Stream trimField( + TableFieldSchema field, List selectedFields) { + String name = field.getName(); + if (selectedFields.contains(name)) { + return Stream.of(field); + } + + if (field.getFields() != null) { + // record + List selectedChildren = + selectedFields.stream() + .filter(sf -> sf.startsWith(name + ".")) + .map(sf -> sf.substring(name.length() + 1)) + .collect(toList()); + + if (!selectedChildren.isEmpty()) { + List trimmedChildren = + field.getFields().stream() + .flatMap(c -> trimField(c, selectedChildren)) + .collect(toList()); + + if (!trimmedChildren.isEmpty()) { + return Stream.of(field.clone().setFields(trimmedChildren)); + } + } + } + + return Stream.empty(); + } + private static @Nullable ServiceCallMetric callMetricForMethod( @Nullable TableReference tableReference, String method) { if (tableReference != null) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java index ecf015e6e782..389d2e43c74e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java @@ -21,16 +21,13 @@ import static org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import static org.apache.beam.sdk.transforms.errorhandling.BadRecordRouter.BAD_RECORD_TAG; import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects.firstNonNull; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import com.google.api.gax.batching.BatchingException; import com.google.api.gax.rpc.ApiException; -import com.google.api.gax.rpc.DeadlineExceededException; import com.google.api.gax.rpc.InvalidArgumentException; import com.google.api.gax.rpc.NotFoundException; -import com.google.api.gax.rpc.ResourceExhaustedException; import com.google.auto.value.AutoValue; import com.google.bigtable.v2.MutateRowResponse; import com.google.bigtable.v2.Mutation; @@ -41,18 +38,22 @@ import com.google.cloud.bigtable.data.v2.models.ChangeStreamRecord; import com.google.cloud.bigtable.data.v2.models.KeyOffset; import com.google.protobuf.ByteString; -import io.grpc.StatusRuntimeException; import java.io.IOException; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import java.util.Set; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.function.BiConsumer; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.BiFunction; import org.apache.beam.sdk.PipelineRunner; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.extensions.protobuf.ProtoCoder; @@ -73,8 +74,6 @@ import org.apache.beam.sdk.io.range.ByteKey; import org.apache.beam.sdk.io.range.ByteKeyRange; import 
org.apache.beam.sdk.io.range.ByteKeyRangeTracker; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.ValueProvider; @@ -1116,52 +1115,27 @@ public Write withMaxOutstandingBytes(long bytes) { * always enabled on batch writes and limits the number of outstanding requests to the Bigtable * server. * - *

<p>When enabled, will also set default {@link #withThrottlingReportTargetMs} to 1 minute. - * This enables runner react with increased latency in flush call due to flow control. - *

<p>Does not modify this object. */ public Write withFlowControl(boolean enableFlowControl) { BigtableWriteOptions options = getBigtableWriteOptions(); - BigtableWriteOptions.Builder builder = options.toBuilder().setFlowControl(enableFlowControl); - if (enableFlowControl) { - builder = builder.setThrottlingReportTargetMs(60_000); - } - return toBuilder().setBigtableWriteOptions(builder.build()).build(); + return toBuilder() + .setBigtableWriteOptions(options.toBuilder().setFlowControl(enableFlowControl).build()) + .build(); } - /** - * Returns a new {@link BigtableIO.Write} with client side latency based throttling enabled. - - *

<p>Will also set {@link #withThrottlingReportTargetMs} to the same value. - */ + /** @deprecated This method has been deprecated in Beam 2.60.0. It does not have an effect. */ + @Deprecated public Write withThrottlingTargetMs(int throttlingTargetMs) { - BigtableWriteOptions options = getBigtableWriteOptions(); - return toBuilder() - .setBigtableWriteOptions( - options - .toBuilder() - .setThrottlingTargetMs(throttlingTargetMs) - .setThrottlingReportTargetMs(throttlingTargetMs) - .build()) - .build(); + LOG.warn("withThrottlingTargetMs has been removed and does not have an effect."); + return this; } - /** - * Returns a new {@link BigtableIO.Write} with throttling time reporting enabled. When write - * request latency exceeded the set value, the amount greater than the target will be considered - * as throttling time and report back to runner. - - *

If not set, defaults to 3 min for completed batch request. Client side flowing control - * configurations (e.g. {@link #withFlowControl}, {@link #withThrottlingTargetMs} will adjust - * the default value accordingly. Set to 0 to disable throttling time reporting. - */ + /** @deprecated This method has been deprecated in Beam 2.60.0. It does not have an effect. */ + @Deprecated public Write withThrottlingReportTargetMs(int throttlingReportTargetMs) { - BigtableWriteOptions options = getBigtableWriteOptions(); - return toBuilder() - .setBigtableWriteOptions( - options.toBuilder().setThrottlingReportTargetMs(throttlingReportTargetMs).build()) - .build(); + LOG.warn("withThrottlingReportTargetMs has been removed and does not have an effect."); + return this; } public Write withErrorHandler(ErrorHandler badRecordErrorHandler) { @@ -1328,20 +1302,15 @@ private static class BigtableWriterFn private final BigtableServiceFactory.ConfigId id; private final Coder>> inputCoder; private final BadRecordRouter badRecordRouter; - - private final Counter throttlingMsecs = - Metrics.counter(Metrics.THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME); - - private final int throttleReportThresMsecs; - - private transient Set> badRecords = null; - // Due to callback thread not supporting Beam metrics, Record pending metrics and report later. - private transient long pendingThrottlingMsecs; + private transient ConcurrentLinkedQueue> badRecords = + null; private transient boolean reportedLineage; // Assign serviceEntry in startBundle and clear it in tearDown. @Nullable private BigtableServiceEntry serviceEntry; + private transient Queue> outstandingWrites; + BigtableWriterFn( BigtableServiceFactory factory, BigtableConfig bigtableConfig, @@ -1355,8 +1324,6 @@ private static class BigtableWriterFn this.badRecordRouter = badRecordRouter; this.failures = new ConcurrentLinkedQueue<>(); this.id = factory.newId(); - // a request completed more than this time will be considered throttled. Disabled if set to 0 - throttleReportThresMsecs = firstNonNull(writeOptions.getThrottlingReportTargetMs(), 180_000); LOG.debug("Created Bigtable Write Fn with writeOptions {} ", writeOptions); } @@ -1376,60 +1343,47 @@ public void startBundle(StartBundleContext c) throws IOException { bigtableWriter = serviceEntry.getService().openForWriting(writeOptions); } - badRecords = new HashSet<>(); + badRecords = new ConcurrentLinkedQueue<>(); + outstandingWrites = new ArrayDeque<>(); } @ProcessElement public void processElement(ProcessContext c, BoundedWindow window) throws Exception { + drainCompletedElementFutures(); checkForFailures(); KV> record = c.element(); - Instant writeStart = Instant.now(); - pendingThrottlingMsecs = 0; - bigtableWriter - .writeRecord(record) - .whenComplete(handleMutationException(record, window, writeStart)); - if (pendingThrottlingMsecs > 0) { - throttlingMsecs.inc(pendingThrottlingMsecs); - } + CompletableFuture f = + bigtableWriter + .writeRecord(record) + // transform the next CompletionStage to have its own status + // this allows us to capture any unexpected errors in the handler + .handle(handleMutationException(record, window)); + outstandingWrites.add(f); ++recordsWritten; seenWindows.compute(window, (key, count) -> (count != null ? 
count : 0) + 1); } - private BiConsumer handleMutationException( - KV> record, BoundedWindow window, Instant writeStart) { + private void drainCompletedElementFutures() throws ExecutionException, InterruptedException { + // burn down the completed futures to avoid unbounded memory growth + for (Future f = outstandingWrites.peek(); + f != null && f.isDone(); + f = outstandingWrites.peek()) { + // Also ensure that errors in the handler get bubbled up + outstandingWrites.remove().get(); + } + } + + private BiFunction handleMutationException( + KV> record, BoundedWindow window) { return (MutateRowResponse result, Throwable exception) -> { if (exception != null) { if (isDataException(exception)) { retryIndividualRecord(record, window); } else { - // Exception due to resource unavailable or rate limited, - // including DEADLINE_EXCEEDED and RESOURCE_EXHAUSTED. - boolean isResourceException = false; - if (exception instanceof StatusRuntimeException) { - StatusRuntimeException se = (StatusRuntimeException) exception; - if (io.grpc.Status.DEADLINE_EXCEEDED.equals(se.getStatus()) - || io.grpc.Status.RESOURCE_EXHAUSTED.equals(se.getStatus())) { - isResourceException = true; - } - } else if (exception instanceof DeadlineExceededException - || exception instanceof ResourceExhaustedException) { - isResourceException = true; - } - if (isResourceException) { - pendingThrottlingMsecs = new Duration(writeStart, Instant.now()).getMillis(); - } failures.add(new BigtableWriteException(record, exception)); } - } else { - // add the excessive amount to throttling metrics if elapsed time > target latency - if (throttleReportThresMsecs > 0) { - long excessTime = - new Duration(writeStart, Instant.now()).getMillis() - throttleReportThresMsecs; - if (excessTime > 0) { - pendingThrottlingMsecs = excessTime; - } - } } + return null; }; } @@ -1437,7 +1391,7 @@ private void retryIndividualRecord( KV> record, BoundedWindow window) { try { bigtableWriter.writeSingleRecord(record); - } catch (ApiException e) { + } catch (Throwable e) { if (isDataException(e)) { // if we get another NotFoundException, we know this is the bad record. badRecords.add(KV.of(new BigtableWriteException(record, e), window)); @@ -1464,7 +1418,6 @@ private static boolean isDataException(Throwable e) { @FinishBundle public void finishBundle(FinishBundleContext c) throws Exception { if (bigtableWriter != null) { - Instant closeStart = Instant.now(); try { bigtableWriter.close(); } catch (IOException e) { @@ -1473,18 +1426,21 @@ public void finishBundle(FinishBundleContext c) throws Exception { // to the error queue. Bigtable will successfully write other failures in the batch, // so this exception should be ignored if (!(e.getCause() instanceof BatchingException)) { - throttlingMsecs.inc(new Duration(closeStart, Instant.now()).getMillis()); throw e; } } - // add the excessive amount to throttling metrics if elapsed time > target latency - if (throttleReportThresMsecs > 0) { - long excessTime = - new Duration(closeStart, Instant.now()).getMillis() - throttleReportThresMsecs; - if (excessTime > 0) { - throttlingMsecs.inc(excessTime); - } + + // Sanity check: ensure that all element futures are resolved. This should be already be the + // case once bigtableWriter.close() finishes. 
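The bookkeeping above boils down to a simple pattern: queue each element's write future, opportunistically drain the completed ones on subsequent elements, and block on whatever is left when the bundle is finalized. A stripped-down sketch of that pattern, independent of the Bigtable client and not part of this change:

```java
import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

class AsyncWriteTracker {
  private final Queue<CompletableFuture<Void>> outstanding = new ArrayDeque<>();

  /** Register one in-flight write, first draining anything that already finished. */
  void track(CompletableFuture<Void> write) throws ExecutionException, InterruptedException {
    // Draining keeps the queue bounded and surfaces errors thrown inside completion handlers.
    for (Future<Void> f = outstanding.peek(); f != null && f.isDone(); f = outstanding.peek()) {
      outstanding.remove().get();
    }
    outstanding.add(write);
  }

  /** Block until every in-flight write has resolved (the finish-bundle step). */
  void awaitAll() throws ExecutionException, InterruptedException {
    CompletableFuture.allOf(outstanding.toArray(new CompletableFuture[0])).get();
    outstanding.clear();
  }
}
```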
+ try { + CompletableFuture.allOf(outstandingWrites.toArray(new CompletableFuture[0])) + .get(1, TimeUnit.MINUTES); + } catch (TimeoutException e) { + throw new IllegalStateException( + "Unexpected timeout waiting for element future to resolve after the writer was closed", + e); } + if (!reportedLineage) { bigtableWriter.reportLineage(); reportedLineage = true; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java index 50d8126999c4..be8f22950adb 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java @@ -27,7 +27,7 @@ import java.io.Serializable; import java.util.List; import java.util.NoSuchElementException; -import java.util.concurrent.CompletionStage; +import java.util.concurrent.CompletableFuture; import org.apache.beam.sdk.io.gcp.bigtable.BigtableIO.BigtableSource; import org.apache.beam.sdk.values.KV; @@ -42,7 +42,7 @@ interface Writer { * * @throws IOException if there is an error submitting the write. */ - CompletionStage writeRecord(KV> record) + CompletableFuture writeRecord(KV> record) throws IOException; /** diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java index 1af9ae4f932d..3451bbf450c7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java @@ -60,7 +60,6 @@ import java.util.Objects; import java.util.Queue; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.CompletionStage; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -552,8 +551,8 @@ public void close() throws IOException { } @Override - public CompletionStage writeRecord(KV> record) - throws IOException { + public CompletableFuture writeRecord( + KV> record) throws IOException { com.google.cloud.bigtable.data.v2.models.Mutation mutation = com.google.cloud.bigtable.data.v2.models.Mutation.fromProtoUnsafe(record.getValue()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java index 5963eb6be3ce..a63cc575809b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java @@ -57,9 +57,6 @@ abstract class BigtableWriteOptions implements Serializable { /** Returns the target latency if latency based throttling is enabled. */ abstract @Nullable Integer getThrottlingTargetMs(); - /** Returns the target latency if latency based throttling report to runner is enabled. */ - abstract @Nullable Integer getThrottlingReportTargetMs(); - /** Returns true if batch write flow control is enabled. Otherwise return false. 
*/ abstract @Nullable Boolean getFlowControl(); @@ -91,8 +88,6 @@ abstract static class Builder { abstract Builder setThrottlingTargetMs(int targetMs); - abstract Builder setThrottlingReportTargetMs(int targetMs); - abstract Builder setFlowControl(boolean enableFlowControl); abstract Builder setCloseWaitTimeout(Duration timeout); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index be573d5be52a..5ae47000b979 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1173,7 +1173,6 @@ private PCollection expandReadContinued( @Override public T apply(PubsubMessage input) { if (!reportedMetrics) { - LOG.info("reportling lineage..."); // report Lineage once if (topicPath != null) { TopicPath topic = topicPath.get(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryTest.java index 497653f9ab8d..4298c367936c 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryTest.java @@ -381,7 +381,8 @@ private void doQuerySourceInitialSplit( .setParent("projects/" + options.getProject()) .setReadSession( ReadSession.newBuilder() - .setTable(BigQueryHelpers.toTableResourceName(tempTableReference))) + .setTable(BigQueryHelpers.toTableResourceName(tempTableReference)) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(requestedStreamCount) .build(); @@ -482,7 +483,8 @@ public void testQuerySourceInitialSplit_NoReferencedTables() throws Exception { .setParent("projects/" + options.getProject()) .setReadSession( ReadSession.newBuilder() - .setTable(BigQueryHelpers.toTableResourceName(tempTableReference))) + .setTable(BigQueryHelpers.toTableResourceName(tempTableReference)) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(1024) .build(); @@ -652,7 +654,8 @@ public void testQuerySourceInitialSplitWithBigQueryProject_EmptyResult() throws .setReadSession( ReadSession.newBuilder() .setTable(BigQueryHelpers.toTableResourceName(tempTableReference)) - .setDataFormat(DataFormat.AVRO)) + .setDataFormat(DataFormat.AVRO) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(10) .build(); @@ -724,7 +727,8 @@ public void testQuerySourceInitialSplit_EmptyResult() throws Exception { .setParent("projects/" + options.getProject()) .setReadSession( ReadSession.newBuilder() - .setTable(BigQueryHelpers.toTableResourceName(tempTableReference))) + .setTable(BigQueryHelpers.toTableResourceName(tempTableReference)) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(10) .build(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTest.java index d7930b595538..5b9e15f22b90 100644 --- 
a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTest.java @@ -458,7 +458,8 @@ private void doTableSourceInitialSplitTest(long bundleSize, int streamCount) thr .setParent("projects/project-id") .setReadSession( ReadSession.newBuilder() - .setTable("projects/foo.com:project/datasets/dataset/tables/table")) + .setTable("projects/foo.com:project/datasets/dataset/tables/table") + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(streamCount) .build(); @@ -551,7 +552,8 @@ public void testTableSourceInitialSplit_WithDefaultProject() throws Exception { .setParent("projects/project-id") .setReadSession( ReadSession.newBuilder() - .setTable("projects/project-id/datasets/dataset/tables/table")) + .setTable("projects/project-id/datasets/dataset/tables/table") + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(1024) .build(); @@ -599,7 +601,8 @@ public void testTableSourceInitialSplit_EmptyTable() throws Exception { .setParent("projects/project-id") .setReadSession( ReadSession.newBuilder() - .setTable("projects/foo.com:project/datasets/dataset/tables/table")) + .setTable("projects/foo.com:project/datasets/dataset/tables/table") + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(1024) .build(); @@ -1482,7 +1485,8 @@ public void testReadFromBigQueryIO() throws Exception { .setReadSession( ReadSession.newBuilder() .setTable("projects/foo.com:project/datasets/dataset/tables/table") - .setDataFormat(DataFormat.AVRO)) + .setDataFormat(DataFormat.AVRO) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(10) .build(); @@ -1693,7 +1697,8 @@ public void testReadFromBigQueryIOArrow() throws Exception { .setReadSession( ReadSession.newBuilder() .setTable("projects/foo.com:project/datasets/dataset/tables/table") - .setDataFormat(DataFormat.ARROW)) + .setDataFormat(DataFormat.ARROW) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(10) .build(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java index e13e4a92a4dc..e26348b7b478 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java @@ -1222,4 +1222,29 @@ public void testToTableReference() { assertNull(BigQueryUtils.toTableReference("projects/")); assertNull(BigQueryUtils.toTableReference("projects")); } + + @Test + public void testTrimSchema() { + assertEquals(BQ_FLAT_TYPE, BigQueryUtils.trimSchema(BQ_FLAT_TYPE, null)); + assertEquals(BQ_FLAT_TYPE, BigQueryUtils.trimSchema(BQ_FLAT_TYPE, Collections.emptyList())); + + { + TableSchema expected = new TableSchema().setFields(Arrays.asList(ID, VALUE, NAME)); + assertEquals( + expected, BigQueryUtils.trimSchema(BQ_FLAT_TYPE, Arrays.asList("id", "value", "name"))); + } + + { + TableFieldSchema filteredRow = + new TableFieldSchema() + .setName("row") + .setType(StandardSQLTypeName.STRUCT.toString()) + .setMode(Mode.NULLABLE.toString()) + .setFields(Arrays.asList(ID, VALUE, NAME)); + TableSchema expected = new 
TableSchema().setFields(Collections.singletonList(filteredRow)); + assertEquals( + expected, + BigQueryUtils.trimSchema(BQ_ROW_TYPE, Arrays.asList("row.id", "row.value", "row.name"))); + } + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProviderTest.java index 2363a870bbd7..a682d413e215 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProviderTest.java @@ -234,7 +234,8 @@ public void testDirectRead() throws Exception { .setReadSession( ReadSession.newBuilder() .setTable("projects/my-project/datasets/dataset/tables/table") - .setDataFormat(DataFormat.AVRO)) + .setDataFormat(DataFormat.AVRO) + .setReadOptions(ReadSession.TableReadOptions.newBuilder())) .setMaxStreamCount(10) .build(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java index e5049b037010..71c648730bd2 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java @@ -436,21 +436,6 @@ public void testWriteValidationFailsMissingOptionsAndInstanceAndProject() { write.expand(null); } - @Test - public void testWriteClientRateLimitingAlsoSetReportMsecs() { - // client side flow control - BigtableIO.Write write = BigtableIO.write().withTableId("table").withFlowControl(true); - assertEquals( - 60_000, (int) checkNotNull(write.getBigtableWriteOptions().getThrottlingReportTargetMs())); - - // client side latency based throttling - int targetMs = 30_000; - write = BigtableIO.write().withTableId("table").withThrottlingTargetMs(targetMs); - assertEquals( - targetMs, - (int) checkNotNull(write.getBigtableWriteOptions().getThrottlingReportTargetMs())); - } - /** Helper function to make a single row mutation to be written. 
*/ private static KV> makeWrite(String key, String value) { ByteString rowKey = ByteString.copyFromUtf8(key); @@ -1921,8 +1906,8 @@ public FakeBigtableWriter(String tableId) { } @Override - public CompletionStage writeRecord(KV> record) - throws IOException { + public CompletableFuture writeRecord( + KV> record) throws IOException { service.verifyTableExists(tableId); Map table = service.getTable(tableId); ByteString key = record.getKey(); @@ -1954,8 +1939,8 @@ public FailureBigtableWriter( } @Override - public CompletionStage writeRecord(KV> record) - throws IOException { + public CompletableFuture writeRecord( + KV> record) throws IOException { if (failureOptions.getFailAtWriteRecord()) { throw new IOException("Fake IOException in writeRecord()"); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java index a46d47324b93..d8b47d70f230 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java @@ -37,6 +37,7 @@ import com.google.bigtable.v2.Cell; import com.google.bigtable.v2.Column; import com.google.bigtable.v2.Family; +import com.google.bigtable.v2.MutateRowResponse; import com.google.bigtable.v2.Mutation; import com.google.bigtable.v2.Row; import com.google.bigtable.v2.RowFilter; @@ -60,6 +61,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -863,7 +865,8 @@ public void testWrite() throws IOException { .build()) .build(); - underTest.writeRecord(KV.of(key, ImmutableList.of(mutation))); + CompletableFuture unusedElementFuture = + underTest.writeRecord(KV.of(key, ImmutableList.of(mutation))); verify(mockBatcher).add(captor.capture()); diff --git a/sdks/java/io/iceberg/hive/exec/build.gradle b/sdks/java/io/iceberg/hive/exec/build.gradle index bb0b147c5a85..f266ab2ef4db 100644 --- a/sdks/java/io/iceberg/hive/exec/build.gradle +++ b/sdks/java/io/iceberg/hive/exec/build.gradle @@ -39,10 +39,17 @@ artifacts { shadowJar { zip64 true - relocate 'com.google.common', getJavaRelocatedPath('iceberg.hive.com.google.common') - relocate 'com.google.protobuf', getJavaRelocatedPath('iceberg.hive.com.google.protobuf') - relocate 'shaded.parquet', getJavaRelocatedPath('iceberg.hive.shaded.parquet') - relocate 'org.apache.parquet', getJavaRelocatedPath('iceberg.hive.org.apache.parquet') + def problematicPackages = [ + 'com.google.protobuf', + 'com.google.common', + 'shaded.parquet', + 'org.apache.parquet', + 'org.joda' + ] + + problematicPackages.forEach { + relocate it, getJavaRelocatedPath("iceberg.hive.${it}") + } version "3.1.3" mergeServiceFiles() diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java index bb42df5a9330..b26ae83f0866 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java @@ -19,6 +19,8 @@ import org.apache.beam.sdk.coders.KvCoder; 
import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.PTransform; @@ -29,14 +31,17 @@ import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; class AppendFilesToTables extends PTransform, PCollection>> { - + private static final Logger LOG = LoggerFactory.getLogger(AppendFilesToTables.class); private final IcebergCatalogConfig catalogConfig; AppendFilesToTables(IcebergCatalogConfig catalogConfig) { @@ -66,6 +71,8 @@ public String apply(FileWriteResult input) { private static class AppendFilesToTablesDoFn extends DoFn>, KV> { + private final Counter snapshotsCreated = + Metrics.counter(AppendFilesToTables.class, "snapshotsCreated"); private final IcebergCatalogConfig catalogConfig; @@ -87,15 +94,21 @@ public void processElement( @Element KV> element, OutputReceiver> out, BoundedWindow window) { + if (!element.getValue().iterator().hasNext()) { + return; + } + Table table = getCatalog().loadTable(TableIdentifier.parse(element.getKey())); AppendFiles update = table.newAppend(); for (FileWriteResult writtenFile : element.getValue()) { update.appendManifest(writtenFile.getManifestFile()); } update.commit(); + Snapshot snapshot = table.currentSnapshot(); + LOG.info("Created new snapshot for table '{}': {}.", element.getKey(), snapshot); + snapshotsCreated.inc(); out.outputWithTimestamp( - KV.of(element.getKey(), SnapshotInfo.fromSnapshot(table.currentSnapshot())), - window.maxTimestamp()); + KV.of(element.getKey(), SnapshotInfo.fromSnapshot(snapshot)), window.maxTimestamp()); } } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java index c3c1da7c7885..0f9612339f48 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import com.google.auto.value.AutoValue; @@ -25,6 +26,12 @@ import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.Read; import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.windowing.AfterFirst; +import org.apache.beam.sdk.transforms.windowing.AfterPane; +import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.Repeatedly; +import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; @@ -33,6 +40,7 @@ import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; import org.checkerframework.checker.nullness.qual.Nullable; +import 
org.joda.time.Duration; /** * The underlying Iceberg connector used by {@link org.apache.beam.sdk.managed.Managed#ICEBERG}. Not @@ -49,6 +57,7 @@ public static WriteRows writeRows(IcebergCatalogConfig catalog) { @AutoValue public abstract static class WriteRows extends PTransform, IcebergWriteResult> { + private static final int TRIGGERING_RECORD_COUNT = 50_000; abstract IcebergCatalogConfig getCatalogConfig(); @@ -56,6 +65,8 @@ public abstract static class WriteRows extends PTransform, Iceb abstract @Nullable DynamicDestinations getDynamicDestinations(); + abstract @Nullable Duration getTriggeringFrequency(); + abstract Builder toBuilder(); @AutoValue.Builder @@ -66,6 +77,8 @@ abstract static class Builder { abstract Builder setDynamicDestinations(DynamicDestinations destinations); + abstract Builder setTriggeringFrequency(Duration triggeringFrequency); + abstract WriteRows build(); } @@ -77,6 +90,21 @@ public WriteRows to(DynamicDestinations destinations) { return toBuilder().setDynamicDestinations(destinations).build(); } + /** + * Sets the frequency at which data is committed and a new {@link org.apache.iceberg.Snapshot} + * is produced. + * + *

Roughly every triggeringFrequency duration, this connector will try to accumulate all + * {@link org.apache.iceberg.ManifestFile}s and commit them to the table as appended files. Each + * commit results in a new table {@link org.apache.iceberg.Snapshot}. + * + *

This is only applicable when writing an unbounded {@link PCollection} (i.e. a streaming + * pipeline). + */ + public WriteRows withTriggeringFrequency(Duration triggeringFrequency) { + return toBuilder().setTriggeringFrequency(triggeringFrequency).build(); + } + @Override public IcebergWriteResult expand(PCollection input) { List allToArgs = Arrays.asList(getTableIdentifier(), getDynamicDestinations()); @@ -89,11 +117,32 @@ public IcebergWriteResult expand(PCollection input) { destinations = DynamicDestinations.singleTable(Preconditions.checkNotNull(getTableIdentifier())); } + + if (input.isBounded().equals(PCollection.IsBounded.UNBOUNDED)) { + Duration triggeringFrequency = getTriggeringFrequency(); + checkArgumentNotNull( + triggeringFrequency, "Streaming pipelines must set a triggering frequency."); + input = + input.apply( + "WindowIntoGlobal", + Window.into(new GlobalWindows()) + .triggering( + Repeatedly.forever( + AfterFirst.of( + AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(triggeringFrequency), + AfterPane.elementCountAtLeast(TRIGGERING_RECORD_COUNT)))) + .discardingFiredPanes()); + } else { + Preconditions.checkArgument( + getTriggeringFrequency() == null, + "Triggering frequency is only applicable for streaming pipelines."); + } return input .apply("Set Destination Metadata", new AssignDestinations(destinations)) .apply( "Write Rows to Destinations", - new WriteToDestinations(getCatalogConfig(), destinations)); + new WriteToDestinations(getCatalogConfig(), destinations, getTriggeringFrequency())); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java index 3f0f88946d9c..9f1b51cf2300 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java @@ -17,13 +17,20 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.Configuration; + import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; import java.util.Collections; import java.util.List; +import java.util.Map; import org.apache.beam.sdk.managed.ManagedTransformConstants; +import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaRegistry; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -35,6 +42,8 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.catalog.TableIdentifier; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; /** * SchemaTransform implementation for {@link IcebergIO#writeRows}. 
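To make the new triggering-frequency behaviour concrete from the caller's side, here is a minimal sketch (not part of this diff) of a streaming write that uses `IcebergIO.writeRows(...).withTriggeringFrequency(...)`. The catalog name, warehouse path, and table identifier are placeholders, and `PeriodicImpulse` merely stands in for a real unbounded source. Roughly every 30 seconds, or once about 50,000 records have accumulated (per `TRIGGERING_RECORD_COUNT`), whichever comes first, the connector commits the buffered manifest files and produces one new snapshot.

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.iceberg.IcebergCatalogConfig;
import org.apache.beam.sdk.io.iceberg.IcebergIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PeriodicImpulse;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.catalog.TableIdentifier;
import org.joda.time.Duration;

public class StreamingIcebergWriteSketch {
  // Tiny single-column schema, used only for illustration.
  private static final Schema SCHEMA = Schema.builder().addInt64Field("value").build();

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Hypothetical Hadoop catalog; substitute your own catalog properties.
    IcebergCatalogConfig catalog =
        IcebergCatalogConfig.builder()
            .setCatalogName("my_catalog")
            .setCatalogProperties(
                ImmutableMap.of("type", "hadoop", "warehouse", "gs://my-bucket/warehouse"))
            .build();

    // PeriodicImpulse yields an UNBOUNDED PCollection, so a triggering frequency is required.
    PCollection<Row> rows =
        p.apply(PeriodicImpulse.create().withInterval(Duration.millis(100)))
            .apply(
                MapElements.into(TypeDescriptors.rows())
                    .via(ts -> Row.withSchema(SCHEMA).addValue(ts.getMillis()).build()))
            .setRowSchema(SCHEMA);

    // Accumulated manifest files are committed roughly every 30 seconds, each commit
    // producing a new Iceberg snapshot on the target table.
    rows.apply(
        IcebergIO.writeRows(catalog)
            .to(TableIdentifier.parse("default.my_table"))
            .withTriggeringFrequency(Duration.standardSeconds(30)));

    p.run();
  }
}
```

The same knob is exposed through the Managed/SchemaTransform path as the `triggering_frequency_seconds` configuration field, which the integration test further down exercises with `config.put("triggering_frequency_seconds", 4)`.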
Writes Beam Rows to Iceberg and @@ -42,7 +51,7 @@ */ @AutoService(SchemaTransformProvider.class) public class IcebergWriteSchemaTransformProvider - extends TypedSchemaTransformProvider { + extends TypedSchemaTransformProvider { static final String INPUT_TAG = "input"; static final String OUTPUT_TAG = "output"; @@ -57,8 +66,55 @@ public String description() { + "{\"table\" (str), \"operation\" (str), \"summary\" (map[str, str]), \"manifestListLocation\" (str)}"; } + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration { + public static Builder builder() { + return new AutoValue_IcebergWriteSchemaTransformProvider_Configuration.Builder(); + } + + @SchemaFieldDescription("Identifier of the Iceberg table.") + public abstract String getTable(); + + @SchemaFieldDescription("Name of the catalog containing the table.") + public abstract @Nullable String getCatalogName(); + + @SchemaFieldDescription("Properties used to set up the Iceberg catalog.") + public abstract @Nullable Map getCatalogProperties(); + + @SchemaFieldDescription("Properties passed to the Hadoop Configuration.") + public abstract @Nullable Map getConfigProperties(); + + @SchemaFieldDescription( + "For a streaming pipeline, sets the frequency at which snapshots are produced.") + public abstract @Nullable Integer getTriggeringFrequencySeconds(); + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setTable(String table); + + public abstract Builder setCatalogName(String catalogName); + + public abstract Builder setCatalogProperties(Map catalogProperties); + + public abstract Builder setConfigProperties(Map confProperties); + + public abstract Builder setTriggeringFrequencySeconds(Integer triggeringFrequencySeconds); + + public abstract Configuration build(); + } + + public IcebergCatalogConfig getIcebergCatalog() { + return IcebergCatalogConfig.builder() + .setCatalogName(getCatalogName()) + .setCatalogProperties(getCatalogProperties()) + .setConfigProperties(getConfigProperties()) + .build(); + } + } + @Override - protected SchemaTransform from(SchemaTransformConfiguration configuration) { + protected SchemaTransform from(Configuration configuration) { return new IcebergWriteSchemaTransform(configuration); } @@ -78,9 +134,9 @@ public String identifier() { } static class IcebergWriteSchemaTransform extends SchemaTransform { - private final SchemaTransformConfiguration configuration; + private final Configuration configuration; - IcebergWriteSchemaTransform(SchemaTransformConfiguration configuration) { + IcebergWriteSchemaTransform(Configuration configuration) { this.configuration = configuration; } @@ -89,7 +145,7 @@ Row getConfigurationRow() { // To stay consistent with our SchemaTransform configuration naming conventions, // we sort lexicographically and convert field names to snake_case return SchemaRegistry.createDefault() - .getToRowFunction(SchemaTransformConfiguration.class) + .getToRowFunction(Configuration.class) .apply(configuration) .sorted() .toSnakeCase(); @@ -102,11 +158,17 @@ Row getConfigurationRow() { public PCollectionRowTuple expand(PCollectionRowTuple input) { PCollection rows = input.get(INPUT_TAG); + IcebergIO.WriteRows writeTransform = + IcebergIO.writeRows(configuration.getIcebergCatalog()) + .to(TableIdentifier.parse(configuration.getTable())); + + Integer trigFreq = configuration.getTriggeringFrequencySeconds(); + if (trigFreq != null) { + writeTransform = 
writeTransform.withTriggeringFrequency(Duration.standardSeconds(trigFreq)); + } + // TODO: support dynamic destinations - IcebergWriteResult result = - rows.apply( - IcebergIO.writeRows(configuration.getIcebergCatalog()) - .to(TableIdentifier.parse(configuration.getTable()))); + IcebergWriteResult result = rows.apply(writeTransform); PCollection snapshots = result diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java index 6bbb103e17cf..1434400563bb 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java @@ -36,7 +36,8 @@ class RecordWriter { private static final Logger LOG = LoggerFactory.getLogger(RecordWriter.class); - private final Counter activeWriters = Metrics.counter(RecordWriterManager.class, "activeWriters"); + private final Counter activeIcebergWriters = + Metrics.counter(RecordWriterManager.class, "activeIcebergWriters"); private final DataWriter icebergDataWriter; private final Table table; private final String absoluteFilename; @@ -92,7 +93,7 @@ class RecordWriter { default: throw new RuntimeException("Unknown File Format: " + fileFormat); } - activeWriters.inc(); + activeIcebergWriters.inc(); LOG.info( "Opened {} writer for table {}, partition {}. Writing to path: {}", fileFormat, @@ -115,7 +116,7 @@ public void close() throws IOException { fileFormat, table.name(), absoluteFilename), e); } - activeWriters.dec(); + activeIcebergWriters.dec(); LOG.info("Closed {} writer for table {}, path: {}", fileFormat, table.name(), absoluteFilename); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java index b16f0caeb81b..5979e2a60131 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriterManager.java @@ -25,6 +25,8 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.TimeUnit; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.sdk.util.WindowedValue; @@ -47,8 +49,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.OutputFile; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * A writer that manages multiple {@link RecordWriter}s to write to multiple tables and partitions. @@ -60,8 +60,9 @@ * *

A {@link DestinationState} maintains its writers in a {@link Cache}. If a {@link RecordWriter} * is inactive for 1 minute, the {@link DestinationState} will automatically close it to free up - * resources. Calling {@link #close()} on this {@link RecordWriterManager} will do the following for - * each {@link DestinationState}: + * resources. When a data writer is closed, its resulting {@link DataFile} gets written. Calling + * {@link #close()} on this {@link RecordWriterManager} will do the following for each {@link + * DestinationState}: * *

    *
  1. Close all underlying {@link RecordWriter}s @@ -73,7 +74,10 @@ * #getManifestFiles()}. */ class RecordWriterManager implements AutoCloseable { - private static final Logger LOG = LoggerFactory.getLogger(RecordWriterManager.class); + private final Counter dataFilesWritten = + Metrics.counter(RecordWriterManager.class, "dataFilesWritten"); + private final Counter manifestFilesWritten = + Metrics.counter(RecordWriterManager.class, "manifestFilesWritten"); /** * Represents the state of one Iceberg table destination. Creates one {@link RecordWriter} per @@ -88,6 +92,7 @@ class DestinationState { private final PartitionKey partitionKey; private final String tableLocation; private final FileIO fileIO; + private final Table table; private final String stateToken = UUID.randomUUID().toString(); private final List dataFiles = Lists.newArrayList(); @VisibleForTesting final Cache writers; @@ -100,6 +105,7 @@ class DestinationState { this.partitionKey = new PartitionKey(spec, schema); this.tableLocation = table.location(); this.fileIO = table.io(); + this.table = table; // build a cache of RecordWriters. // writers will expire after 1 min of idle time. @@ -123,6 +129,7 @@ class DestinationState { } openWriters--; dataFiles.add(recordWriter.getDataFile()); + dataFilesWritten.inc(); }) .build(); } @@ -170,8 +177,8 @@ private RecordWriter createWriter(PartitionKey partitionKey) { try { RecordWriter writer = new RecordWriter( - catalog, - icebergDestination, + table, + icebergDestination.getFileFormat(), filePrefix + "_" + stateToken + "_" + recordIndex, partitionKey); openWriters++; @@ -261,13 +268,7 @@ public void close() throws IOException { manifestWriter = openWriter; } ManifestFile manifestFile = manifestWriter.toManifestFile(); - - LOG.info( - "Successfully wrote manifest file, adding {} data files ({} rows) to table '{}': {}.", - manifestFile.addedFilesCount(), - manifestFile.addedRowsCount(), - windowedDestination.getValue().getTableIdentifier(), - outputFile.location()); + manifestFilesWritten.inc(); totalManifestFiles .computeIfAbsent(windowedDestination, dest -> Lists.newArrayList()) diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/WriteToDestinations.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/WriteToDestinations.java index 65fd551c782a..f71ff24a1a37 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/WriteToDestinations.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/WriteToDestinations.java @@ -28,24 +28,35 @@ import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.transforms.windowing.AfterPane; +import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.Repeatedly; +import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.ShardedKey; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; class WriteToDestinations extends PTransform, IcebergWriteResult> { static final long DEFAULT_MAX_BYTES_PER_FILE = (1L << 40); // 1TB static final int DEFAULT_NUM_FILE_SHARDS = 0; - static final int 
FILE_TRIGGERING_RECORD_COUNT = 50_000; private final IcebergCatalogConfig catalogConfig; private final DynamicDestinations dynamicDestinations; + private final @Nullable Duration triggeringFrequency; - WriteToDestinations(IcebergCatalogConfig catalogConfig, DynamicDestinations dynamicDestinations) { + WriteToDestinations( + IcebergCatalogConfig catalogConfig, + DynamicDestinations dynamicDestinations, + @Nullable Duration triggeringFrequency) { this.dynamicDestinations = dynamicDestinations; this.catalogConfig = catalogConfig; + this.triggeringFrequency = triggeringFrequency; } @Override @@ -108,11 +119,44 @@ public KV, Row> apply(Row elem) { "Write remaining rows to files", new WriteGroupedRowsToFiles(catalogConfig, dynamicDestinations)); + PCollection writeUngroupedResultPColl = writeUngroupedResult.getWrittenFiles(); + + if (input.isBounded().equals(PCollection.IsBounded.UNBOUNDED)) { + // for streaming pipelines, re-window both outputs to keep Flatten happy + writeGroupedResult = + writeGroupedResult.apply( + "RewindowGroupedRecords", + Window.into(new GlobalWindows()) + .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))) + .discardingFiredPanes()); + writeUngroupedResultPColl = + writeUngroupedResultPColl.apply( + "RewindowUnGroupedRecords", + Window.into(new GlobalWindows()) + .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))) + .discardingFiredPanes()); + } + PCollection allWrittenFiles = - PCollectionList.of(writeUngroupedResult.getWrittenFiles()) + PCollectionList.of(writeUngroupedResultPColl) .and(writeGroupedResult) .apply("Flatten Written Files", Flatten.pCollections()); + if (input.isBounded().equals(PCollection.IsBounded.UNBOUNDED)) { + checkArgumentNotNull( + triggeringFrequency, "Streaming pipelines must set a triggering frequency."); + // apply the user's trigger before we start committing and creating snapshots + allWrittenFiles = + allWrittenFiles.apply( + "ApplyUserTrigger", + Window.into(new GlobalWindows()) + .triggering( + Repeatedly.forever( + AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(checkArgumentNotNull(triggeringFrequency)))) + .discardingFiredPanes()); + } + // Apply any sharded writes and flatten everything for catalog updates PCollection> snapshots = allWrittenFiles.apply(new AppendFilesToTables(catalogConfig)); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java index 2e748e9644e8..8c6d3d99e35e 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java @@ -19,10 +19,12 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.equalTo; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; @@ -33,9 +35,14 @@ import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PeriodicImpulse; import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.Window; import 
org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.AppendFiles; @@ -57,6 +64,8 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; +import org.joda.time.Duration; +import org.joda.time.Instant; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Rule; @@ -307,4 +316,69 @@ public void testWritePartitionedData() { assertThat( returnedRecords, containsInAnyOrder(INPUT_ROWS.stream().map(RECORD_FUNC::apply).toArray())); } + + @Test + public void testStreamingWrite() { + PartitionSpec partitionSpec = + PartitionSpec.builderFor(ICEBERG_SCHEMA).identity("bool").identity("modulo_5").build(); + Table table = catalog.createTable(tableId, ICEBERG_SCHEMA, partitionSpec); + + Map config = new HashMap<>(managedIcebergConfig()); + config.put("triggering_frequency_seconds", 4); + + // over a span of 10 seconds, create elements from longs in range [0, 1000) + PCollection input = + pipeline + .apply( + PeriodicImpulse.create() + .stopAfter(Duration.millis(9_990)) + .withInterval(Duration.millis(10))) + .apply( + MapElements.into(TypeDescriptors.rows()) + .via(instant -> ROW_FUNC.apply((instant.getMillis() / 10) % 1000))) + .setRowSchema(BEAM_SCHEMA); + + assertThat(input.isBounded(), equalTo(PCollection.IsBounded.UNBOUNDED)); + + input.apply(Managed.write(Managed.ICEBERG).withConfig(config)); + pipeline.run().waitUntilFinish(); + + List returnedRecords = readRecords(table); + assertThat( + returnedRecords, containsInAnyOrder(INPUT_ROWS.stream().map(RECORD_FUNC::apply).toArray())); + } + + @Test + public void testStreamingWriteWithPriorWindowing() { + PartitionSpec partitionSpec = + PartitionSpec.builderFor(ICEBERG_SCHEMA).identity("bool").identity("modulo_5").build(); + Table table = catalog.createTable(tableId, ICEBERG_SCHEMA, partitionSpec); + + Map config = new HashMap<>(managedIcebergConfig()); + config.put("triggering_frequency_seconds", 4); + + // over a span of 10 seconds, create elements from longs in range [0, 1000) + PCollection input = + pipeline + .apply( + PeriodicImpulse.create() + .stopAfter(Duration.millis(9_990)) + .withInterval(Duration.millis(10))) + .apply( + Window.into(FixedWindows.of(Duration.standardSeconds(1))) + .accumulatingFiredPanes()) + .apply( + MapElements.into(TypeDescriptors.rows()) + .via(instant -> ROW_FUNC.apply((instant.getMillis() / 10) % 1000))) + .setRowSchema(BEAM_SCHEMA); + + assertThat(input.isBounded(), equalTo(PCollection.IsBounded.UNBOUNDED)); + + input.apply(Managed.write(Managed.ICEBERG).withConfig(config)); + pipeline.run().waitUntilFinish(); + + List returnedRecords = readRecords(table); + assertThat( + returnedRecords, containsInAnyOrder(INPUT_ROWS.stream().map(RECORD_FUNC::apply).toArray())); + } } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java index 2abe6b093481..d3bf13a16787 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java @@ -25,8 +25,13 @@ import java.util.Map; import java.util.UUID; import org.apache.beam.sdk.schemas.Schema; 
+import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.Count; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -47,6 +52,8 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.hamcrest.Matchers; +import org.joda.time.Duration; +import org.joda.time.Instant; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -303,4 +310,59 @@ public void testIdempotentCommit() throws Exception { secondUpdate.appendFile(dataFile); secondUpdate.commit(); } + + @Test + public void testStreamingWrite() { + TableIdentifier tableId = + TableIdentifier.of( + "default", "streaming_" + Long.toString(UUID.randomUUID().hashCode(), 16)); + + // Create a table and add records to it. + Table table = warehouse.createTable(tableId, TestFixtures.SCHEMA); + + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); + + IcebergCatalogConfig catalog = + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); + + List inputRows = TestFixtures.asRows(TestFixtures.FILE1SNAPSHOT1); + TestStream stream = + TestStream.create(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .advanceWatermarkTo(new Instant(0)) + // the first two rows are written within the same triggering interval, + // so they should both be in the first snapshot + .addElements(inputRows.get(0)) + .advanceProcessingTime(Duration.standardSeconds(1)) + .addElements(inputRows.get(1)) + .advanceProcessingTime(Duration.standardSeconds(5)) + // the third row is written in a new triggering interval, + // so we create a new snapshot for it. 
+ .addElements(inputRows.get(2)) + .advanceProcessingTime(Duration.standardSeconds(5)) + .advanceWatermarkToInfinity(); + + PCollection> output = + testPipeline + .apply("Stream Records", stream) + .apply( + "Append To Table", + IcebergIO.writeRows(catalog) + .to(tableId) + .withTriggeringFrequency(Duration.standardSeconds(3))) + .getSnapshots(); + // verify that 2 snapshots are created (one per triggering interval) + PCollection snapshots = output.apply(Count.globally()); + PAssert.that(snapshots).containsInAnyOrder(1L, 1L); + testPipeline.run().waitUntilFinish(); + + List writtenRecords = ImmutableList.copyOf(IcebergGenerics.read(table).build()); + assertThat(writtenRecords, Matchers.containsInAnyOrder(TestFixtures.FILE1SNAPSHOT1.toArray())); + } } diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java index 6b555e7e14d0..779687c97768 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; +import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.Configuration; import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.INPUT_TAG; import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.OUTPUT_TAG; import static org.hamcrest.MatcherAssert.assertThat; @@ -88,8 +89,8 @@ public void testSimpleAppend() { properties.put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); properties.put("warehouse", warehouse.location); - SchemaTransformConfiguration config = - SchemaTransformConfiguration.builder() + Configuration config = + Configuration.builder() .setTable(identifier) .setCatalogName("name") .setCatalogProperties(properties) diff --git a/sdks/java/io/jms/src/main/java/org/apache/beam/sdk/io/jms/JmsIO.java b/sdks/java/io/jms/src/main/java/org/apache/beam/sdk/io/jms/JmsIO.java index 060e660b0847..610e76c78416 100644 --- a/sdks/java/io/jms/src/main/java/org/apache/beam/sdk/io/jms/JmsIO.java +++ b/sdks/java/io/jms/src/main/java/org/apache/beam/sdk/io/jms/JmsIO.java @@ -31,6 +31,7 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.UUID; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; @@ -586,7 +587,7 @@ public boolean requiresDeduping() { } static class UnboundedJmsReader extends UnboundedReader { - + private static final byte[] EMPTY = new byte[0]; private UnboundedJmsSource source; @VisibleForTesting JmsCheckpointMark.Preparer checkpointMarkPreparer; private Connection connection; @@ -604,7 +605,7 @@ public UnboundedJmsReader(UnboundedJmsSource source, PipelineOptions options) this.source = source; this.checkpointMarkPreparer = JmsCheckpointMark.newPreparer(); this.currentMessage = null; - this.currentID = new byte[0]; + this.currentID = EMPTY; this.options = options; } @@ -684,18 +685,22 @@ public boolean advance() throws IOException { currentTimestamp = new Instant(message.getJMSTimestamp()); String messageID = message.getJMSMessageID(); - if (this.source.spec.isRequiresDeduping()) { - // per JMS specification, message ID has prefix "id:". 
The runner use it to dedup message. - // Empty or non-exist message id (possible for optimization configuration set) will induce - // data loss. - if (messageID.length() <= 3) { - throw new RuntimeException( - String.format( - "Invalid JMSMessageID %s while requiresDeduping is set. Data loss possible.", - messageID)); + if (messageID != null) { + if (this.source.spec.isRequiresDeduping()) { + // per JMS specification, message ID has prefix "id:". The runner use it to dedup + // message. Empty or non-exist message id (possible for optimization configuration set) + // will cause data loss. + if (messageID.length() <= 3) { + throw new RuntimeException( + String.format( + "Invalid JMSMessageID %s while requiresDeduping is set. Data loss possible.", + messageID)); + } } + currentID = messageID.getBytes(StandardCharsets.UTF_8); + } else { + currentID = EMPTY; } - currentID = messageID.getBytes(StandardCharsets.UTF_8); return true; } catch (Exception e) { @@ -728,6 +733,12 @@ public Instant getCurrentTimestamp() { public byte[] getCurrentRecordId() { if (currentMessage == null) { throw new NoSuchElementException(); + } else if (currentID == EMPTY && this.source.spec.isRequiresDeduping()) { + LOG.warn( + "Empty JMSRecordID received when requiresDeduping enabled, runner deduplication will" + + " not be effective"); + // Return a random UUID to ensure it won't get dedup + currentID = UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8); } return currentID; } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java index 1fd3e3e044ef..0f28edf19dd8 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java @@ -890,7 +890,6 @@ static void setupExternalBuilder( builder.setRedistributeNumKeys(0); builder.setAllowDuplicates(false); } - System.out.println("xxx builder service" + builder.toString()); } private static Coder resolveCoder(Class> deserializer) { @@ -1697,11 +1696,12 @@ public PCollection> expand(PBegin input) { } if (kafkaRead.isRedistributed()) { - // fail here instead. 
- checkArgument( - kafkaRead.isCommitOffsetsInFinalizeEnabled(), - "commitOffsetsInFinalize() can't be enabled with isRedistributed"); + if (kafkaRead.isCommitOffsetsInFinalizeEnabled() && kafkaRead.isAllowDuplicates()) { + LOG.warn( + "Offsets committed due to usage of commitOffsetsInFinalize() and may not capture all work processed due to use of withRedistribute() with duplicates enabled"); + } PCollection> output = input.getPipeline().apply(transform); + if (kafkaRead.getRedistributeNumKeys() == 0) { return output.apply( "Insert Redistribute", @@ -1797,7 +1797,7 @@ public PCollection> expand(PBegin input) { return pcol.apply( "Insert Redistribute with Shards", Redistribute.>arbitrarily() - .withAllowDuplicates(true) + .withAllowDuplicates(kafkaRead.isAllowDuplicates()) .withNumBuckets((int) kafkaRead.getRedistributeNumKeys())); } } @@ -2654,6 +2654,12 @@ public PCollection> expand(PCollection if (getRedistributeNumKeys() == 0) { LOG.warn("This will create a key per record, which is sub-optimal for most use cases."); } + if ((isCommitOffsetEnabled() || configuredKafkaCommit()) && isAllowDuplicates()) { + LOG.warn( + "Either auto_commit is set, or commitOffsetEnabled is enabled (or both), but since " + + "withRestribute() is enabled with allow duplicates, the runner may have additional work processed that " + + "is ahead of the current checkpoint"); + } } if (getConsumerConfig().get(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG) == null) { @@ -2687,8 +2693,7 @@ public PCollection> expand(PCollection .getSchemaCoder(KafkaSourceDescriptor.class), recordCoder)); - boolean applyCommitOffsets = - isCommitOffsetEnabled() && !configuredKafkaCommit() && !isRedistribute(); + boolean applyCommitOffsets = isCommitOffsetEnabled() && !configuredKafkaCommit(); if (!applyCommitOffsets) { return outputWithDescriptor .apply(MapElements.into(new TypeDescriptor>() {}).via(KV::getValue)) @@ -2710,6 +2715,15 @@ public PCollection> expand(PCollection if (Comparators.lexicographical(Comparator.naturalOrder()) .compare(requestedVersion, targetVersion) < 0) { + // Redistribute is not allowed with commits prior to 2.59.0, since there is a Reshuffle + // prior to the redistribute. The reshuffle will occur before commits are offsetted and + // before outputting KafkaRecords. Adding a redistribute then afterwards doesn't provide + // additional performance benefit. 
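The checks adjusted here govern a user-facing combination, so a minimal sketch (not part of this diff) may help: a `KafkaIO.read()` that uses `withRedistribute()` together with `commitOffsetsInFinalize()`. With this change the combination no longer fails via `checkArgument`; it only logs the "may not capture all work processed" warning when duplicates are allowed, so keeping `withAllowDuplicates(false)` keeps committed offsets aligned with processed work. The bootstrap server, topic, and deserializer classes below are placeholders.

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.kafka.common.serialization.LongDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;

public class KafkaRedistributeReadSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    PCollection<KV<Long, String>> records =
        p.apply(
            "ReadFromKafka",
            KafkaIO.<Long, String>read()
                .withBootstrapServers("broker-1:9092") // placeholder broker
                .withTopic("my-topic") // placeholder topic
                .withKeyDeserializer(LongDeserializer.class)
                .withValueDeserializer(StringDeserializer.class)
                // Spread fetched records over a fixed number of keys; leaving the key count
                // at zero triggers the existing "key per record" warning.
                .withRedistribute()
                .withRedistributeNumKeys(32)
                // Duplicates stay disallowed, so committing offsets on finalize does not
                // trigger the warning introduced in this change.
                .withAllowDuplicates(false)
                .commitOffsetsInFinalize()
                .withoutMetadata());

    p.run();
  }
}
```

Downgrading the hard `checkArgument` to a conditional warning appears intended to keep existing redistribute-plus-commit pipelines working while still flagging the weaker delivery guarantees that `withAllowDuplicates(true)` implies.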
+ checkArgument( + !isRedistribute(), + "Can not enable isRedistribute() while committing offsets prior to " + + String.join(".", targetVersion)); + return expand259Commits( outputWithDescriptor, recordCoder, input.getPipeline().getSchemaRegistry()); } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java index b2eeb1a54d1d..e87669ab2b0a 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java @@ -149,10 +149,10 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { Map consumerConfigs = new HashMap<>( MoreObjects.firstNonNull(configuration.getConsumerConfigUpdates(), new HashMap<>())); - consumerConfigs.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka-read-provider-" + groupId); - consumerConfigs.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true); - consumerConfigs.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 100); - consumerConfigs.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, autoOffsetReset); + consumerConfigs.putIfAbsent(ConsumerConfig.GROUP_ID_CONFIG, "kafka-read-provider-" + groupId); + consumerConfigs.putIfAbsent(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true); + consumerConfigs.putIfAbsent(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 100); + consumerConfigs.putIfAbsent(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, autoOffsetReset); String format = configuration.getFormat(); boolean handleErrors = ErrorHandling.hasOutput(configuration.getErrorHandling()); diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java index 9bb950bb8e6c..952e29f75104 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java @@ -441,10 +441,11 @@ public ProcessContinuation processElement( ConsumerSpEL.evaluateAssign( consumer, ImmutableList.of(kafkaSourceDescriptor.getTopicPartition())); long startOffset = tracker.currentRestriction().getFrom(); - long expectedOffset = startOffset; consumer.seek(kafkaSourceDescriptor.getTopicPartition(), startOffset); ConsumerRecords rawRecords = ConsumerRecords.empty(); + long skippedRecords = 0L; + final Stopwatch sw = Stopwatch.createStarted(); while (true) { rawRecords = poll(consumer, kafkaSourceDescriptor.getTopicPartition()); @@ -461,6 +462,36 @@ public ProcessContinuation processElement( return ProcessContinuation.resume(); } for (ConsumerRecord rawRecord : rawRecords) { + // If the Kafka consumer returns a record with an offset that is already processed + // the record can be safely skipped. This is needed because there is a possibility + // that the seek() above fails to move the offset to the desired position. In which + // case poll() would return records that are already cnsumed. + if (rawRecord.offset() < startOffset) { + // If the start offset is not reached even after skipping the records for 10 seconds + // then the processing is stopped with a backoff to give the Kakfa server some time + // catch up. + if (sw.elapsed().getSeconds() > 10L) { + LOG.error( + "The expected offset ({}) was not reached even after" + + " skipping consumed records for 10 seconds. 
The offset we could" + + " reach was {}. The processing of this bundle will be attempted" + + " at a later time.", + expectedOffset, + rawRecord.offset()); + return ProcessContinuation.resume() + .withResumeDelay(org.joda.time.Duration.standardSeconds(10L)); + } + skippedRecords++; + continue; + } + if (skippedRecords > 0L) { + LOG.warn( + "{} records were skipped due to seek returning an" + + " earlier position than requested position of {}", + skippedRecords, + expectedOffset); + skippedRecords = 0L; + } if (!tracker.tryClaim(rawRecord.offset())) { return ProcessContinuation.stop(); } diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibilityTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibilityTest.java index 74f1e83fd86b..29c920bf9a6f 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibilityTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibilityTest.java @@ -108,7 +108,13 @@ private PipelineResult testReadTransformCreationWithImplementationBoundPropertie Function, KafkaIO.Read> kafkaReadDecorator) { p.apply( kafkaReadDecorator.apply( - mkKafkaReadTransform(1000, null, new ValueAsTimestampFn(), false, 0))); + mkKafkaReadTransform( + 1000, + null, + new ValueAsTimestampFn(), + false, /*redistribute*/ + false, /*allowDuplicates*/ + 0))); return p.run(); } diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java index 1fe1147a7390..25ff6dad1244 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java @@ -88,6 +88,7 @@ import org.apache.beam.sdk.metrics.SinkMetrics; import org.apache.beam.sdk.metrics.SourceMetrics; import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.StreamingOptions; import org.apache.beam.sdk.testing.ExpectedLogs; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -381,7 +382,13 @@ public Consumer apply(Map config) { static KafkaIO.Read mkKafkaReadTransform( int numElements, @Nullable SerializableFunction, Instant> timestampFn) { - return mkKafkaReadTransform(numElements, numElements, timestampFn, false, 0); + return mkKafkaReadTransform( + numElements, + numElements, + timestampFn, + false, /*redistribute*/ + false, /*allowDuplicates*/ + 0); } /** @@ -393,6 +400,7 @@ static KafkaIO.Read mkKafkaReadTransform( @Nullable Integer maxNumRecords, @Nullable SerializableFunction, Instant> timestampFn, @Nullable Boolean redistribute, + @Nullable Boolean withAllowDuplicates, @Nullable Integer numKeys) { KafkaIO.Read reader = @@ -408,13 +416,21 @@ static KafkaIO.Read mkKafkaReadTransform( reader = reader.withMaxNumRecords(maxNumRecords); } + if (withAllowDuplicates == null) { + withAllowDuplicates = false; + } + if (timestampFn != null) { reader = reader.withTimestampFn(timestampFn); } if (redistribute) { if (numKeys != null) { - reader = reader.withRedistribute().withRedistributeNumKeys(numKeys); + reader = + reader + .withRedistribute() + .withAllowDuplicates(withAllowDuplicates) + .withRedistributeNumKeys(numKeys); } reader = reader.withRedistribute(); } @@ -628,17 +644,47 @@ public void testRiskyConfigurationWarnsProperly() { } @Test - 
public void testCommitOffsetsInFinalizeAndRedistributeErrors() { - thrown.expect(Exception.class); - thrown.expectMessage("commitOffsetsInFinalize() can't be enabled with isRedistributed"); + public void warningsWithAllowDuplicatesEnabledAndCommitOffsets() { + int numElements = 1000; + PCollection input = + p.apply( + mkKafkaReadTransform( + numElements, + numElements, + new ValueAsTimestampFn(), + true, /*redistribute*/ + true, /*allowDuplicates*/ + 0) + .commitOffsetsInFinalize() + .withConsumerConfigUpdates( + ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id")) + .withoutMetadata()) + .apply(Values.create()); + + addCountingAsserts(input, numElements); + p.run(); + + kafkaIOExpectedLogs.verifyWarn( + "Offsets committed due to usage of commitOffsetsInFinalize() and may not capture all work processed due to use of withRedistribute() with duplicates enabled"); + } + + @Test + public void noWarningsWithNoAllowDuplicatesAndCommitOffsets() { int numElements = 1000; PCollection input = p.apply( - mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), true, 0) + mkKafkaReadTransform( + numElements, + numElements, + new ValueAsTimestampFn(), + true, /*redistribute*/ + false, /*allowDuplicates*/ + 0) + .commitOffsetsInFinalize() .withConsumerConfigUpdates( - ImmutableMap.of(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true)) + ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id")) .withoutMetadata()) .apply(Values.create()); @@ -648,13 +694,25 @@ public void testCommitOffsetsInFinalizeAndRedistributeErrors() { @Test public void testNumKeysIgnoredWithRedistributeNotEnabled() { + thrown.expect(Exception.class); + thrown.expectMessage( + "withRedistributeNumKeys is ignored if withRedistribute() is not enabled on the transform"); + int numElements = 1000; PCollection input = p.apply( - mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), false, 0) + mkKafkaReadTransform( + numElements, + numElements, + new ValueAsTimestampFn(), + false, /*redistribute*/ + false, /*allowDuplicates*/ + 0) + .withRedistributeNumKeys(100) + .commitOffsetsInFinalize() .withConsumerConfigUpdates( - ImmutableMap.of(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true)) + ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id")) .withoutMetadata()) .apply(Values.create()); @@ -663,6 +721,32 @@ public void testNumKeysIgnoredWithRedistributeNotEnabled() { p.run(); } + @Test + public void testDisableRedistributeKafkaOffsetLegacy() { + thrown.expect(Exception.class); + thrown.expectMessage( + "Can not enable isRedistribute() while committing offsets prior to 2.60.0"); + p.getOptions().as(StreamingOptions.class).setUpdateCompatibilityVersion("2.59.0"); + + p.apply( + Create.of( + KafkaSourceDescriptor.of( + new TopicPartition("topic", 1), + null, + null, + null, + null, + ImmutableList.of("8.8.8.8:9092")))) + .apply( + KafkaIO.readSourceDescriptors() + .withKeyDeserializer(LongDeserializer.class) + .withValueDeserializer(LongDeserializer.class) + .withRedistribute() + .withProcessingTime() + .commitOffsets()); + p.run(); + } + @Test public void testUnreachableKafkaBrokers() { // Expect an exception when the Kafka brokers are not reachable on the workers. 
@@ -1982,7 +2066,13 @@ public void testUnboundedSourceStartReadTime() { PCollection input = p.apply( - mkKafkaReadTransform(numElements, maxNumRecords, new ValueAsTimestampFn(), false, 0) + mkKafkaReadTransform( + numElements, + maxNumRecords, + new ValueAsTimestampFn(), + false, /*redistribute*/ + false, /*allowDuplicates*/ + 0) .withStartReadTime(new Instant(startTime)) .withoutMetadata()) .apply(Values.create()); @@ -2006,7 +2096,13 @@ public void testUnboundedSourceStartReadTimeException() { int startTime = numElements / 20; p.apply( - mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), false, 0) + mkKafkaReadTransform( + numElements, + numElements, + new ValueAsTimestampFn(), + false, /*redistribute*/ + false, /*allowDuplicates*/ + 0) .withStartReadTime(new Instant(startTime)) .withoutMetadata()) .apply(Values.create()); diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java index 6ee3d9d96ef6..a9e4a4eddb61 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java @@ -32,6 +32,7 @@ import org.apache.beam.runners.core.metrics.DistributionCell; import org.apache.beam.runners.core.metrics.DistributionData; import org.apache.beam.runners.core.metrics.MetricsContainerImpl; +import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.Pipeline.PipelineVisitor; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.SerializableCoder; @@ -40,6 +41,9 @@ import org.apache.beam.sdk.io.range.OffsetRange; import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsEnvironment; +import org.apache.beam.sdk.options.ExperimentalOptions; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.runners.TransformHierarchy.Node; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; @@ -62,6 +66,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.MockConsumer; @@ -105,6 +110,12 @@ public class ReadFromKafkaDoFnTest { private final ReadFromKafkaDoFn exceptionDofnInstance = ReadFromKafkaDoFn.create(makeReadSourceDescriptor(exceptionConsumer), RECORDS); + private final SimpleMockKafkaConsumerWithBrokenSeek consumerWithBrokenSeek = + new SimpleMockKafkaConsumerWithBrokenSeek(OffsetResetStrategy.NONE, topicPartition); + + private final ReadFromKafkaDoFn dofnInstanceWithBrokenSeek = + ReadFromKafkaDoFn.create(makeReadSourceDescriptor(consumerWithBrokenSeek), RECORDS); + private ReadSourceDescriptors makeReadSourceDescriptor( Consumer kafkaMockConsumer) { return ReadSourceDescriptors.read() @@ -290,6 +301,17 @@ public synchronized long position(TopicPartition partition) { } } + private static class SimpleMockKafkaConsumerWithBrokenSeek extends SimpleMockKafkaConsumer { + + public SimpleMockKafkaConsumerWithBrokenSeek( + OffsetResetStrategy offsetResetStrategy, 
TopicPartition topicPartition) { + super(offsetResetStrategy, topicPartition); + } + + @Override + public synchronized void seek(TopicPartition partition, long offset) {} + } + private static class MockMultiOutputReceiver implements MultiOutputReceiver { MockOutputReceiver>> mockOutputReceiver = @@ -372,6 +394,7 @@ private List>> createExpec public void setUp() throws Exception { dofnInstance.setup(); exceptionDofnInstance.setup(); + dofnInstanceWithBrokenSeek.setup(); consumer.reset(); } @@ -470,6 +493,24 @@ public void testProcessElement() throws Exception { receiver.getGoodRecords()); } + @Test + public void testProcessElementWithEarlierOffset() throws Exception { + MockMultiOutputReceiver receiver = new MockMultiOutputReceiver(); + consumerWithBrokenSeek.setNumOfRecordsPerPoll(6L); + consumerWithBrokenSeek.setCurrentPos(0L); + long startOffset = 3L; + OffsetRangeTracker tracker = + new OffsetRangeTracker(new OffsetRange(startOffset, startOffset + 3)); + KafkaSourceDescriptor descriptor = + KafkaSourceDescriptor.of(topicPartition, null, null, null, null, null); + ProcessContinuation result = + dofnInstanceWithBrokenSeek.processElement(descriptor, tracker, null, receiver); + assertEquals(ProcessContinuation.stop(), result); + assertEquals( + createExpectedRecords(descriptor, startOffset, 3, "key", "value"), + receiver.getGoodRecords()); + } + @Test public void testRawSizeMetric() throws Exception { final int numElements = 1000; @@ -526,6 +567,18 @@ public void testProcessElementWhenTopicPartitionIsRemoved() throws Exception { assertEquals(ProcessContinuation.stop(), result); } + @Test + public void testSDFCommitOffsetEnabled() { + OffSetsVisitor visitor = testCommittingOffsets(true); + Assert.assertEquals(true, visitor.foundOffsetTransform); + } + + @Test + public void testSDFCommitOffsetNotEnabled() { + OffSetsVisitor visitor = testCommittingOffsets(false); + Assert.assertNotEquals(true, visitor.foundOffsetTransform); + } + @Test public void testProcessElementWhenTopicPartitionIsStopped() throws Exception { MockMultiOutputReceiver receiver = new MockMultiOutputReceiver(); @@ -688,4 +741,47 @@ public void visitValue(PValue value, Node producer) { } } } + + private OffSetsVisitor testCommittingOffsets(boolean enableOffsets) { + + // Force Kafka read to use SDF implementation + PipelineOptions pipelineOptions = PipelineOptionsFactory.create(); + ExperimentalOptions.addExperiment( + pipelineOptions.as(ExperimentalOptions.class), "use_sdf_read"); + + Pipeline p = Pipeline.create(pipelineOptions); + KafkaIO.Read read = + KafkaIO.read() + .withKeyDeserializer(StringDeserializer.class) + .withValueDeserializer(StringDeserializer.class) + .withConsumerConfigUpdates( + new ImmutableMap.Builder() + .put(ConsumerConfig.GROUP_ID_CONFIG, "group_id_1") + .build()) + .withBootstrapServers("bootstrap_server") + .withTopic("test-topic"); + + if (enableOffsets) { + read = read.commitOffsetsInFinalize(); + } + + p.apply(read.withoutMetadata()); + OffSetsVisitor visitor = new OffSetsVisitor(); + p.traverseTopologically(visitor); + return visitor; + } + + static class OffSetsVisitor extends PipelineVisitor.Defaults { + boolean foundOffsetTransform = false; + + @Override + public void visitValue(PValue value, Node producer) { + if (value instanceof PCollection) { + PCollection pc = (PCollection) value; + if (pc.getName().contains("KafkaCommitOffset")) { + foundOffsetTransform = true; + } + } + } + } } diff --git a/sdks/java/io/mqtt/src/main/java/org/apache/beam/sdk/io/mqtt/MqttIO.java 
b/sdks/java/io/mqtt/src/main/java/org/apache/beam/sdk/io/mqtt/MqttIO.java index 8b7f0991c2dd..0e584d564b5c 100644 --- a/sdks/java/io/mqtt/src/main/java/org/apache/beam/sdk/io/mqtt/MqttIO.java +++ b/sdks/java/io/mqtt/src/main/java/org/apache/beam/sdk/io/mqtt/MqttIO.java @@ -30,6 +30,7 @@ import java.util.Objects; import java.util.UUID; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import org.apache.beam.sdk.coders.ByteArrayCoder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.SerializableCoder; @@ -45,6 +46,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; import org.fusesource.mqtt.client.BlockingConnection; +import org.fusesource.mqtt.client.FutureConnection; import org.fusesource.mqtt.client.MQTT; import org.fusesource.mqtt.client.Message; import org.fusesource.mqtt.client.QoS; @@ -431,8 +433,7 @@ public boolean start() throws IOException { client = spec.connectionConfiguration().createClient(); LOG.debug("Reader client ID is {}", client.getClientId()); checkpointMark.clientId = client.getClientId().toString(); - connection = client.blockingConnection(); - connection.connect(); + connection = createConnection(client); connection.subscribe( new Topic[] {new Topic(spec.connectionConfiguration().getTopic(), QoS.AT_LEAST_ONCE)}); return advance(); @@ -569,8 +570,7 @@ public void createMqttClient() throws Exception { LOG.debug("Starting MQTT writer"); client = spec.connectionConfiguration().createClient(); LOG.debug("MQTT writer client ID is {}", client.getClientId()); - connection = client.blockingConnection(); - connection.connect(); + connection = createConnection(client); } @ProcessElement @@ -590,4 +590,20 @@ public void closeMqttClient() throws Exception { } } } + + /** Create a connected MQTT BlockingConnection from given client, aware of connection timeout. 
*/ + static BlockingConnection createConnection(MQTT client) throws Exception { + FutureConnection futureConnection = client.futureConnection(); + org.fusesource.mqtt.client.Future connecting = futureConnection.connect(); + while (true) { + try { + connecting.await(1, TimeUnit.MINUTES); + } catch (TimeoutException e) { + LOG.warn("Connection to {} pending after waiting for 1 minute", client.getHost()); + continue; + } + break; + } + return new BlockingConnection(futureConnection); + } } diff --git a/sdks/java/io/mqtt/src/test/java/org/apache/beam/sdk/io/mqtt/MqttIOTest.java b/sdks/java/io/mqtt/src/test/java/org/apache/beam/sdk/io/mqtt/MqttIOTest.java index 30adad708f8d..7d60d6d65780 100644 --- a/sdks/java/io/mqtt/src/test/java/org/apache/beam/sdk/io/mqtt/MqttIOTest.java +++ b/sdks/java/io/mqtt/src/test/java/org/apache/beam/sdk/io/mqtt/MqttIOTest.java @@ -142,8 +142,7 @@ public void testReadNoClientId() throws Exception { publisherThread.join(); } - @Test(timeout = 30 * 1000) - @Ignore("https://github.com/apache/beam/issues/19092 Flake Non-deterministic output.") + @Test(timeout = 40 * 1000) public void testRead() throws Exception { PCollection output = pipeline.apply( @@ -151,7 +150,7 @@ public void testRead() throws Exception { .withConnectionConfiguration( MqttIO.ConnectionConfiguration.create("tcp://localhost:" + port, "READ_TOPIC") .withClientId("READ_PIPELINE")) - .withMaxReadTime(Duration.standardSeconds(3))); + .withMaxReadTime(Duration.standardSeconds(5))); PAssert.that(output) .containsInAnyOrder( "This is test 0".getBytes(StandardCharsets.UTF_8), @@ -180,12 +179,12 @@ public void testRead() throws Exception { + "messages ..."); boolean pipelineConnected = false; while (!pipelineConnected) { - Thread.sleep(1000); for (Connection connection : brokerService.getBroker().getClients()) { if (connection.getConnectionId().startsWith("READ_PIPELINE")) { pipelineConnected = true; } } + Thread.sleep(1000); } for (int i = 0; i < 10; i++) { publishConnection.publish( diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java index 35ee7595352d..1a2a056efd45 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/it/SolaceIOIT.java @@ -87,6 +87,7 @@ public void testRead() { "Read from Solace", SolaceIO.read() .from(Queue.fromName(queueName)) + .withDeduplicateRecords(true) .withMaxNumConnections(1) .withSempClientFactory( BasicAuthSempClientFactory.builder() diff --git a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformTranslationTest.java b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformTranslationTest.java index 0d122646d899..c0f324c25606 100644 --- a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformTranslationTest.java +++ b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformTranslationTest.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.stream.Collectors; import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.model.pipeline.v1.SchemaApi; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.RowCoder; import org.apache.beam.sdk.managed.testing.TestSchemaTransformProvider; @@ -54,6 +55,7 @@ import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import 
org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.InvalidProtocolBufferException; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.junit.Test; public class ManagedSchemaTransformTranslationTest { @@ -169,26 +171,33 @@ public void testProtoTranslation() throws Exception { .withFieldValue("transform_identifier", TestSchemaTransformProvider.IDENTIFIER) .withFieldValue("config", yamlStringConfig) .build(); - Map expectedAnnotations = - ImmutableMap.builder() - .put( - BeamUrns.getConstant(SCHEMATRANSFORM_URN_KEY), - ByteString.copyFromUtf8(MANAGED_TRANSFORM_URN)) - .put( - BeamUrns.getConstant(MANAGED_UNDERLYING_TRANSFORM_URN_KEY), - ByteString.copyFromUtf8(TestSchemaTransformProvider.IDENTIFIER)) - .put( - BeamUrns.getConstant(CONFIG_ROW_KEY), - ByteString.copyFrom( - CoderUtils.encodeToByteArray( - RowCoder.of(PROVIDER.configurationSchema()), managedConfigRow))) - .put( - BeamUrns.getConstant(CONFIG_ROW_SCHEMA_KEY), - ByteString.copyFrom( - SchemaTranslation.schemaToProto(PROVIDER.configurationSchema(), true) - .toByteArray())) - .build(); - assertEquals(expectedAnnotations, convertedTransform.getAnnotationsMap()); + assertEquals( + ImmutableSet.of( + BeamUrns.getConstant(SCHEMATRANSFORM_URN_KEY), + BeamUrns.getConstant(MANAGED_UNDERLYING_TRANSFORM_URN_KEY), + BeamUrns.getConstant(CONFIG_ROW_KEY), + BeamUrns.getConstant(CONFIG_ROW_SCHEMA_KEY)), + convertedTransform.getAnnotationsMap().keySet()); + assertEquals( + ByteString.copyFromUtf8(MANAGED_TRANSFORM_URN), + convertedTransform.getAnnotationsMap().get(BeamUrns.getConstant(SCHEMATRANSFORM_URN_KEY))); + assertEquals( + ByteString.copyFromUtf8(TestSchemaTransformProvider.IDENTIFIER), + convertedTransform + .getAnnotationsMap() + .get(BeamUrns.getConstant(MANAGED_UNDERLYING_TRANSFORM_URN_KEY))); + Schema annotationSchema = + SchemaTranslation.schemaFromProto( + SchemaApi.Schema.parseFrom( + convertedTransform + .getAnnotationsMap() + .get(BeamUrns.getConstant(CONFIG_ROW_SCHEMA_KEY)))); + assertEquals(PROVIDER.configurationSchema(), annotationSchema); + assertEquals( + managedConfigRow, + CoderUtils.decodeFromByteString( + RowCoder.of(annotationSchema), + convertedTransform.getAnnotationsMap().get(BeamUrns.getConstant(CONFIG_ROW_KEY)))); // Check that the spec proto contains correct values RunnerApi.FunctionSpec spec = convertedTransform.getSpec(); diff --git a/sdks/python/apache_beam/__init__.py b/sdks/python/apache_beam/__init__.py index 27c2b293fbd0..6e08083bc0de 100644 --- a/sdks/python/apache_beam/__init__.py +++ b/sdks/python/apache_beam/__init__.py @@ -75,6 +75,12 @@ 'This version of Apache Beam has not been sufficiently tested on ' 'Python %s.%s. You may encounter bugs or missing features.' % (sys.version_info.major, sys.version_info.minor)) + elif sys.version_info.minor == 8: + warnings.warn( + 'Python 3.8 reaches EOL in October 2024 and support will ' + 'be removed from Apache Beam in version 2.61.0. 
See ' + 'https://github.com/apache/beam/issues/31192 for more ' + 'information.') pass else: raise RuntimeError( diff --git a/sdks/python/apache_beam/dataframe/doctests.py b/sdks/python/apache_beam/dataframe/doctests.py index 61e904c4f9da..33faa6b58599 100644 --- a/sdks/python/apache_beam/dataframe/doctests.py +++ b/sdks/python/apache_beam/dataframe/doctests.py @@ -665,7 +665,10 @@ def set_pandas_options(): # See # https://github.com/pandas-dev/pandas/blob/a00202d12d399662b8045a8dd3fdac04f18e1e55/doc/source/conf.py#L319 np.random.seed(123456) - np.set_printoptions(precision=4, suppress=True) + legacy = None + if np.version.version.startswith('2'): + legacy = '1.25' + np.set_printoptions(precision=4, suppress=True, legacy=legacy) pd.options.display.max_rows = 15 diff --git a/sdks/python/apache_beam/dataframe/io.py b/sdks/python/apache_beam/dataframe/io.py index b795add1b44e..5fcb7326a026 100644 --- a/sdks/python/apache_beam/dataframe/io.py +++ b/sdks/python/apache_beam/dataframe/io.py @@ -280,7 +280,8 @@ def expand(self, root): first_path = match.metadata_list[0].path with io.filesystems.FileSystems.open(first_path) as handle: if not self.binary: - handle = TextIOWrapper(handle) + handle = TextIOWrapper( + handle, encoding=self.kwargs.get("encoding", None)) if self.incremental: with self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)) as stream: @@ -493,6 +494,10 @@ def __init__(self, underlying, tracker, splitter): self._buffer, self._underlying) self._buffer_start_pos += len(skip) + @property + def mode(self): + return getattr(self._underlying, "mode", "r") + def readable(self): return True @@ -572,6 +577,9 @@ def _read(self, size=-1): self._done = True return res + def flush(self): + self._underlying.flush() + class _ReadFromPandasDoFn(beam.DoFn, beam.RestrictionProvider): def __init__(self, reader, args, kwargs, binary, incremental, splitter): @@ -627,7 +635,8 @@ def process( splitter=self.splitter or _DelimSplitter(b'\n', _DEFAULT_BYTES_CHUNKSIZE)) if not self.binary: - handle = TextIOWrapper(handle) + handle = TextIOWrapper( + handle, encoding=self.kwargs.get("encoding", None)) if self.incremental: if 'chunksize' not in self.kwargs: self.kwargs['chunksize'] = _DEFAULT_LINES_CHUNKSIZE @@ -688,7 +697,8 @@ def open(self, file_handle): self.buffer = [] self.empty = self.header = self.footer = None if not self.binary: - file_handle = TextIOWrapper(file_handle) + file_handle = TextIOWrapper( + file_handle, encoding=self.kwargs.get("encoding", None)) self.file_handle = file_handle def write_to(self, df, file_handle=None): diff --git a/sdks/python/apache_beam/examples/inference/README.md b/sdks/python/apache_beam/examples/inference/README.md index 3bb68440ed60..f9c5af436965 100644 --- a/sdks/python/apache_beam/examples/inference/README.md +++ b/sdks/python/apache_beam/examples/inference/README.md @@ -853,6 +853,7 @@ path/to/my/image2: dandelions (78) Each line represents a prediction of the flower type along with the confidence in that prediction. --- + ## Text classifcation with a Vertex AI LLM [`vertex_ai_llm_text_classification.py`](./vertex_ai_llm_text_classification.py) contains an implementation for a RunInference pipeline that performs image classification using a model hosted on Vertex AI (based on https://cloud.google.com/vertex-ai/docs/tutorials/image-recognition-custom). 
@@ -882,4 +883,83 @@ This writes the output to the output file with contents like: ``` Each line represents a tuple containing the example, a [PredictionResult](https://beam.apache.org/releases/pydoc/2.40.0/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.PredictionResult) object with the response from the model in the inference field, and the endpoint id representing the model id. +--- + +## Text completion with vLLM + +[`vllm_text_completion.py`](./vllm_text_completion.py) contains an implementation for a RunInference pipeline that performs text completion using a local [vLLM](https://docs.vllm.ai/en/latest/) server. + +The pipeline reads in a set of text prompts or past messages, uses RunInference to spin up a local inference server and perform inference, and then writes the predictions to a text file. + +### Model for text completion + +To use this transform, you can use any [LLM supported by vLLM](https://docs.vllm.ai/en/latest/models/supported_models.html). + +### Running `vllm_text_completion.py` + +To run the text completion pipeline locally using the Facebook opt 125M model, use the following command. +```sh +python -m apache_beam.examples.inference.vllm_text_completion \ + --model "facebook/opt-125m" \ + --output 'path/to/output/file.txt' \ + <... aditional pipeline arguments to configure runner if not running in GPU environment ...> +``` + +You will either need to run this locally with a GPU accelerator or remotely on a runner that supports acceleration. +For example, you could run this on Dataflow with a GPU with the following command: + +```sh +python -m apache_beam.examples.inference.vllm_text_completion \ + --model "facebook/opt-125m" \ + --output 'gs://path/to/output/file.txt' \ + --runner dataflow \ + --project \ + --region us-central1 \ + --temp_location \ + --worker_harness_container_image "gcr.io/apache-beam-testing/beam-ml/vllm:latest" \ + --machine_type "n1-standard-4" \ + --dataflow_service_options "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx" \ + --staging_location +``` + +Make sure to enable the 5xx driver since vLLM only works with 5xx drivers, not 4xx. + +This writes the output to the output file location with contents like: + +``` +'Hello, my name is', PredictionResult(example={'prompt': 'Hello, my name is'}, inference=Completion(id='cmpl-5f5113a317c949309582b1966511ffc4', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' Joel, my dad is Anton Harriman and my wife is Lydia. ', stop_reason=None)], created=1714064548, model='facebook/opt-125m', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=16, prompt_tokens=6, total_tokens=22))}) +``` +Each line represents a tuple containing the example, a [PredictionResult](https://beam.apache.org/releases/pydoc/2.40.0/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.PredictionResult) object with the response from the model in the inference field. + +You can also choose to run with chat examples. Doing this requires 2 steps: + +1) Upload a [chat_template](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-template) to a filestore which is accessible from your job's environment (e.g. a public Google Cloud Storage bucket). You can copy [this sample template](https://storage.googleapis.com/apache-beam-ml/additional_files/sample_chat_template.jinja) to get started. 
You can skip this step if using a model other than `facebook/opt-125m` and you know your model provides a chat template. +2) Add the `--chat true` and `--chat_template ` parameters: + +```sh +python -m apache_beam.examples.inference.vllm_text_completion \ + --model "facebook/opt-125m" \ + --output 'gs://path/to/output/file.txt' \ + --chat true \ + --chat_template gs://path/to/your/file \ + <... aditional pipeline arguments to configure runner if not running in GPU environment ...> +``` + +This will configure the pipeline to run against a sequence of previous messages instead of a single text completion prompt. +For example, it might run against: + +``` +[ + OpenAIChatMessage(role='user', content='What is an example of a type of penguin?'), + OpenAIChatMessage(role='system', content='An emperor penguin is a type of penguin.'), + OpenAIChatMessage(role='user', content='Tell me about them') +], +``` + +and produce the following result in your output file location: + +``` +An emperor penguin is an adorable creature that lives in Antarctica. +``` + --- \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py new file mode 100644 index 000000000000..3cf7d04cb03e --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py @@ -0,0 +1,162 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" A sample pipeline using the RunInference API to interface with an LLM using +vLLM. Takes in a set of prompts or lists of previous messages and produces +responses using a model of choice. + +Requires a GPU runtime with vllm, openai, and apache-beam installed to run +correctly. 
+""" + +import argparse +import logging +from typing import Iterable + +import apache_beam as beam +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.vllm_inference import OpenAIChatMessage +from apache_beam.ml.inference.vllm_inference import VLLMChatModelHandler +from apache_beam.ml.inference.vllm_inference import VLLMCompletionsModelHandler +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.runners.runner import PipelineResult + +COMPLETION_EXAMPLES = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + "John cena is", +] + +CHAT_EXAMPLES = [ + [ + OpenAIChatMessage( + role='user', content='What is an example of a type of penguin?'), + OpenAIChatMessage( + role='assistant', content='Emperor penguin is a type of penguin.'), + OpenAIChatMessage(role='user', content='Tell me about them') + ], + [ + OpenAIChatMessage( + role='user', content='What colors are in the rainbow?'), + OpenAIChatMessage( + role='assistant', + content='Red, orange, yellow, green, blue, indigo, and violet.'), + OpenAIChatMessage(role='user', content='Do other colors ever appear?') + ], + [ + OpenAIChatMessage( + role='user', content='Who is the president of the United States?') + ], + [ + OpenAIChatMessage(role='user', content='What state is Fargo in?'), + OpenAIChatMessage(role='assistant', content='It is in North Dakota.'), + OpenAIChatMessage(role='user', content='How many people live there?'), + OpenAIChatMessage( + role='assistant', + content='Approximately 130,000 people live in Fargo, North Dakota.' + ), + OpenAIChatMessage(role='user', content='What is Fargo known for?'), + ], + [ + OpenAIChatMessage( + role='user', content='How many fish are in the ocean?'), + ], +] + + +def parse_known_args(argv): + """Parses args for the workflow.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '--model', + dest='model', + type=str, + required=False, + default='facebook/opt-125m', + help='LLM to use for task') + parser.add_argument( + '--output', + dest='output', + type=str, + required=True, + help='Path to save output predictions.') + parser.add_argument( + '--chat', + dest='chat', + type=bool, + required=False, + default=False, + help='Whether to use chat model handler and examples') + parser.add_argument( + '--chat_template', + dest='chat_template', + type=str, + required=False, + default=None, + help='Chat template to use for chat example.') + return parser.parse_known_args(argv) + + +class PostProcessor(beam.DoFn): + def process(self, element: PredictionResult) -> Iterable[str]: + yield str(element.example) + ": " + str(element.inference) + + +def run( + argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult: + """ + Args: + argv: Command line arguments defined for this example. + save_main_session: Used for internal testing. + test_pipeline: Used for internal testing. 
+ """ + known_args, pipeline_args = parse_known_args(argv) + pipeline_options = PipelineOptions(pipeline_args) + pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + + model_handler = VLLMCompletionsModelHandler(model_name=known_args.model) + input_examples = COMPLETION_EXAMPLES + + if known_args.chat: + model_handler = VLLMChatModelHandler( + model_name=known_args.model, + chat_template_path=known_args.chat_template) + input_examples = CHAT_EXAMPLES + + pipeline = test_pipeline + if not test_pipeline: + pipeline = beam.Pipeline(options=pipeline_options) + + examples = pipeline | "Create examples" >> beam.Create(input_examples) + predictions = examples | "RunInference" >> RunInference(model_handler) + process_output = predictions | "Process Predictions" >> beam.ParDo( + PostProcessor()) + _ = process_output | "WriteOutput" >> beam.io.WriteToText( + known_args.output, shard_name_template='', append_trailing_newlines=True) + + result = pipeline.run() + result.wait_until_finish() + return result + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + run() diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_expr.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_expr.py index da90bd59da34..1a62af8c4f6d 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_expr.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_expr.py @@ -47,9 +47,8 @@ def groupby_expr(test=None): | beam.GroupBy(lambda s: s[0]) | beam.Map(print)) # [END groupby_expr] - - if test: - test(grouped) + if test: + test(grouped) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_global_aggregate.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_global_aggregate.py index a46b14e01e8b..876644483a51 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_global_aggregate.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_global_aggregate.py @@ -60,9 +60,8 @@ def global_aggregate(test=None): 'unit_price', max, 'max_price') | beam.Map(print)) # [END global_aggregate] - - if test: - test(grouped) + if test: + test(grouped) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_simple_aggregate.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_simple_aggregate.py index d700dc872bbf..528159b4990f 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_simple_aggregate.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_simple_aggregate.py @@ -57,9 +57,8 @@ def simple_aggregate(test=None): 'quantity', sum, 'total_quantity') | beam.Map(print)) # [END simple_aggregate] - - if test: - test(grouped) + if test: + test(grouped) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_test.py b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_test.py index d7a3e2c880b2..3746be407b4b 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/aggregation/groupby_test.py @@ -38,6 +38,11 @@ from .groupby_simple_aggregate import simple_aggregate from .groupby_two_exprs import groupby_two_exprs +# +# TODO: Remove early 
returns in check functions +# https://github.com/apache/beam/issues/30778 +skip_due_to_30778 = True + class UnorderedList(object): def __init__(self, contents): @@ -73,7 +78,10 @@ def normalize_kv(k, v): # For documentation. NamedTuple = beam.Row + def check_groupby_expr_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.MapTuple(normalize_kv), equal_to([ @@ -86,6 +94,8 @@ def check_groupby_expr_result(grouped): def check_groupby_two_exprs_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.MapTuple(normalize_kv), equal_to([ @@ -99,6 +109,8 @@ def check_groupby_two_exprs_result(grouped): def check_groupby_attr_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.MapTuple(normalize_kv), equal_to([ @@ -146,57 +158,61 @@ def check_groupby_attr_result(grouped): def check_groupby_attr_expr_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.MapTuple(normalize_kv), equal_to([ #[START groupby_attr_expr_result] - ( - NamedTuple(recipe='pie', is_berry=True), - [ - beam.Row( - recipe='pie', - fruit='strawberry', - quantity=3, - unit_price=1.50), - beam.Row( - recipe='pie', - fruit='raspberry', - quantity=1, - unit_price=3.50), - beam.Row( - recipe='pie', - fruit='blackberry', - quantity=1, - unit_price=4.00), - beam.Row( - recipe='pie', - fruit='blueberry', - quantity=1, - unit_price=2.00), - ]), - ( - NamedTuple(recipe='muffin', is_berry=True), - [ - beam.Row( - recipe='muffin', - fruit='blueberry', - quantity=2, - unit_price=2.00), - ]), - ( - NamedTuple(recipe='muffin', is_berry=False), - [ - beam.Row( - recipe='muffin', - fruit='banana', - quantity=3, - unit_price=1.00), - ]), + ( + NamedTuple(recipe='pie', is_berry=True), + [ + beam.Row( + recipe='pie', + fruit='strawberry', + quantity=3, + unit_price=1.50), + beam.Row( + recipe='pie', + fruit='raspberry', + quantity=1, + unit_price=3.50), + beam.Row( + recipe='pie', + fruit='blackberry', + quantity=1, + unit_price=4.00), + beam.Row( + recipe='pie', + fruit='blueberry', + quantity=1, + unit_price=2.00), + ]), + ( + NamedTuple(recipe='muffin', is_berry=True), + [ + beam.Row( + recipe='muffin', + fruit='blueberry', + quantity=2, + unit_price=2.00), + ]), + ( + NamedTuple(recipe='muffin', is_berry=False), + [ + beam.Row( + recipe='muffin', + fruit='banana', + quantity=3, + unit_price=1.00), + ]), #[END groupby_attr_expr_result] ])) def check_simple_aggregate_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.MapTuple(normalize_kv), equal_to([ @@ -211,6 +227,8 @@ def check_simple_aggregate_result(grouped): def check_expr_aggregate_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.Map(normalize), equal_to([ @@ -222,6 +240,8 @@ def check_expr_aggregate_result(grouped): def check_global_aggregate_result(grouped): + if skip_due_to_30778: + return assert_that( grouped | beam.Map(normalize), equal_to([ @@ -232,19 +252,26 @@ def check_global_aggregate_result(grouped): @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_expr.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_expr.print', + str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_two_exprs.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_two_exprs.print', + str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_attr.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_attr.print', + 
str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_attr_expr.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_attr_expr.print', + str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_simple_aggregate.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_simple_aggregate.print', + str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_expr_aggregate.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_expr_aggregate.print', + str) @mock.patch( - 'apache_beam.examples.snippets.transforms.aggregation.groupby_global_aggregate.print', str) + 'apache_beam.examples.snippets.transforms.aggregation.groupby_global_aggregate.print', + str) class GroupByTest(unittest.TestCase): def test_groupby_expr(self): groupby_expr(check_groupby_expr_result) diff --git a/sdks/python/apache_beam/io/aws/s3filesystem.py b/sdks/python/apache_beam/io/aws/s3filesystem.py index 636b0a12f3e2..e181beac4a58 100644 --- a/sdks/python/apache_beam/io/aws/s3filesystem.py +++ b/sdks/python/apache_beam/io/aws/s3filesystem.py @@ -314,3 +314,11 @@ def delete(self, paths): } if exceptions: raise BeamIOError("Delete operation failed", exceptions) + + def report_lineage(self, path, lineage): + try: + components = s3io.parse_s3_path(path, get_account=True) + except ValueError: + # report lineage is fail-safe + return + lineage.add('s3', *components) diff --git a/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py b/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py index c446c17247d7..bb56fa09d370 100644 --- a/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py +++ b/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py @@ -316,3 +316,11 @@ def delete(self, paths): if exceptions: raise BeamIOError("Delete operation failed", exceptions) + + def report_lineage(self, path, lineage): + try: + components = blobstorageio.parse_azfs_path(path, get_account=True) + except ValueError: + # report lineage is fail-safe + return + lineage.add('abs', *components) diff --git a/sdks/python/apache_beam/io/filebasedsink.py b/sdks/python/apache_beam/io/filebasedsink.py index 816de9d33a05..c708e117c3a1 100644 --- a/sdks/python/apache_beam/io/filebasedsink.py +++ b/sdks/python/apache_beam/io/filebasedsink.py @@ -280,6 +280,7 @@ def _check_state_for_finalize_write(self, writer_results, num_shards): src_files.append(src) dst_files.append(dst) + FileSystems.report_sink_lineage(dst) return src_files, dst_files, delete_files, num_skipped @check_accessible(['file_path_prefix']) diff --git a/sdks/python/apache_beam/io/filebasedsource.py b/sdks/python/apache_beam/io/filebasedsource.py index 91763ced6e69..efd863810ed7 100644 --- a/sdks/python/apache_beam/io/filebasedsource.py +++ b/sdks/python/apache_beam/io/filebasedsource.py @@ -168,6 +168,7 @@ def _get_concat_source(self) -> concat_source.ConcatSource: min_bundle_size=self._min_bundle_size, splittable=splittable) single_file_sources.append(single_file_source) + FileSystems.report_source_lineage(file_name) self._concat_source = concat_source.ConcatSource(single_file_sources) return self._concat_source @@ -351,6 +352,7 @@ def process(self, element: Union[str, FileMetadata], *args, match_results = FileSystems.match([element]) metadata_list = match_results[0].metadata_list for metadata in metadata_list: + FileSystems.report_source_lineage(metadata.path) splittable = ( self._splittable and _determine_splittability_from_compression_type( 
metadata.path, self._compression_type)) diff --git a/sdks/python/apache_beam/io/filesystem.py b/sdks/python/apache_beam/io/filesystem.py index 550079a482c4..bdc25dcf0fe5 100644 --- a/sdks/python/apache_beam/io/filesystem.py +++ b/sdks/python/apache_beam/io/filesystem.py @@ -933,3 +933,11 @@ def delete(self, paths): ``BeamIOError``: if any of the delete operations fail """ raise NotImplementedError + + def report_lineage(self, path, unused_lineage): + """ + Report Lineage metrics for path. + + Unless override by FileSystem implementations, default to no-op. + """ + pass diff --git a/sdks/python/apache_beam/io/filesystems.py b/sdks/python/apache_beam/io/filesystems.py index e7cdf3844979..ccbeac640765 100644 --- a/sdks/python/apache_beam/io/filesystems.py +++ b/sdks/python/apache_beam/io/filesystems.py @@ -26,6 +26,7 @@ from apache_beam.io.filesystem import BeamIOError from apache_beam.io.filesystem import CompressionTypes from apache_beam.io.filesystem import FileSystem +from apache_beam.metrics.metric import Lineage from apache_beam.options.value_provider import RuntimeValueProvider _LOGGER = logging.getLogger(__name__) @@ -388,3 +389,15 @@ def get_chunk_size(path): """ filesystem = FileSystems.get_filesystem(path) return filesystem.CHUNK_SIZE + + @staticmethod + def report_source_lineage(path): + """Report source :class:`~apache_beam.metrics.metric.Lineage`.""" + filesystem = FileSystems.get_filesystem(path) + filesystem.report_lineage(path, Lineage.sources()) + + @staticmethod + def report_sink_lineage(path): + """Report sink :class:`~apache_beam.metrics.metric.Lineage`.""" + filesystem = FileSystems.get_filesystem(path) + filesystem.report_lineage(path, Lineage.sinks()) diff --git a/sdks/python/apache_beam/io/gcp/bigtableio.py b/sdks/python/apache_beam/io/gcp/bigtableio.py index 3f54e09ee3dd..ffb1852eb0f4 100644 --- a/sdks/python/apache_beam/io/gcp/bigtableio.py +++ b/sdks/python/apache_beam/io/gcp/bigtableio.py @@ -47,6 +47,7 @@ from apache_beam.io.gcp import resource_identifiers from apache_beam.metrics import Metrics from apache_beam.metrics import monitoring_infos +from apache_beam.metrics.metric import Lineage from apache_beam.transforms import PTransform from apache_beam.transforms.display import DisplayDataItem from apache_beam.transforms.external import BeamJarExpansionService @@ -162,6 +163,12 @@ def finish_bundle(self): if self.batcher: self.batcher.close() self.batcher = None + # Report Lineage metrics on write + Lineage.sinks().add( + 'bigtable', + self.beam_options['project_id'], + self.beam_options['instance_id'], + self.beam_options['table_id']) def display_data(self): return { diff --git a/sdks/python/apache_beam/io/gcp/bigtableio_test.py b/sdks/python/apache_beam/io/gcp/bigtableio_test.py index f97c9bcfbd6a..130f9a714129 100644 --- a/sdks/python/apache_beam/io/gcp/bigtableio_test.py +++ b/sdks/python/apache_beam/io/gcp/bigtableio_test.py @@ -35,12 +35,15 @@ from apache_beam.io.gcp import resource_identifiers from apache_beam.metrics import monitoring_infos from apache_beam.metrics.execution import MetricsEnvironment +from apache_beam.metrics.metric import Lineage +from apache_beam.testing.test_pipeline import TestPipeline _LOGGER = logging.getLogger(__name__) # Protect against environments where bigtable library is not available. 
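The lineage hooks added above (`FileSystem.report_lineage`, `FileSystems.report_source_lineage` / `report_sink_lineage`, and the `Lineage.sinks()` call in bigtableio) share one pattern: resolve the filesystem for the path, add a string-set metric, and stay fail-safe on parse errors. A rough usage sketch, assuming the GCP extras are installed and with an illustrative path:

```python
# Rough sketch of how the new hooks compose; the GCS path is illustrative.
from apache_beam.io.filesystems import FileSystems
from apache_beam.metrics.metric import Lineage

# Dispatches to GCSFileSystem.report_lineage(), which parses the path and adds
# it to the source Lineage metric; parse failures are swallowed (fail-safe).
FileSystems.report_source_lineage('gs://example-bucket/input/part-00000.txt')

# After a run, lineage can be read back from the metric results, as the new
# bigtableio and pubsub tests do:
#   Lineage.query(result.metrics(), Lineage.SOURCE)
#   Lineage.query(result.metrics(), Lineage.SINK)
```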
try: from google.cloud.bigtable import client + from google.cloud.bigtable.batcher import MutationsBatcher from google.cloud.bigtable.row_filters import TimestampRange from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row import DirectRow, PartialRowData, Cell @@ -266,6 +269,18 @@ def setUp(self): instance = Instance(self._INSTANCE_ID, client) self.table = Table(self._TABLE_ID, instance) + def test_write(self): + direct_rows = [self.generate_row(i) for i in range(5)] + with patch.object(MutationsBatcher, 'mutate'), \ + patch.object(MutationsBatcher, 'close'), TestPipeline() as p: + _ = p | beam.Create(direct_rows) | bigtableio.WriteToBigTable( + self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set([ + f"bigtable:{self._PROJECT_ID}.{self._INSTANCE_ID}.{self._TABLE_ID}" + ])) + def test_write_metrics(self): MetricsEnvironment.process_wide_container().reset() write_fn = bigtableio._BigTableWriteFn( diff --git a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py index 47d1997ddc7b..053b02d325a5 100644 --- a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py +++ b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py @@ -365,3 +365,11 @@ def delete(self, paths): if exceptions: raise BeamIOError("Delete operation failed", exceptions) + + def report_lineage(self, path, lineage): + try: + bucket, blob = gcsio.parse_gcs_path(path) + except ValueError: + # report lineage is fail-safe + return + lineage.add('gcs', bucket, blob) diff --git a/sdks/python/apache_beam/io/gcp/gcsio.py b/sdks/python/apache_beam/io/gcp/gcsio.py index 6b0470b82361..22a33fa13c63 100644 --- a/sdks/python/apache_beam/io/gcp/gcsio.py +++ b/sdks/python/apache_beam/io/gcp/gcsio.py @@ -43,10 +43,10 @@ from apache_beam import version as beam_version from apache_beam.internal.gcp import auth +from apache_beam.io.gcp import gcsio_retry from apache_beam.metrics.metric import Metrics from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import PipelineOptions -from apache_beam.utils import retry from apache_beam.utils.annotations import deprecated __all__ = ['GcsIO', 'create_storage_client'] @@ -155,6 +155,9 @@ def __init__(self, storage_client=None, pipeline_options=None): self.client = storage_client self._rewrite_cb = None self.bucket_to_project_number = {} + self._storage_client_retry = gcsio_retry.get_retry(pipeline_options) + self._use_blob_generation = getattr( + google_cloud_options, 'enable_gcsio_blob_generation', False) def get_project_number(self, bucket): if bucket not in self.bucket_to_project_number: @@ -167,7 +170,8 @@ def get_project_number(self, bucket): def get_bucket(self, bucket_name, **kwargs): """Returns an object bucket from its name, or None if it does not exist.""" try: - return self.client.lookup_bucket(bucket_name, **kwargs) + return self.client.lookup_bucket( + bucket_name, retry=self._storage_client_retry, **kwargs) except NotFound: return None @@ -188,7 +192,7 @@ def create_bucket( bucket_or_name=bucket, project=project, location=location, - ) + retry=self._storage_client_retry) if kms_key: bucket.default_kms_key_name(kms_key) bucket.patch() @@ -224,18 +228,18 @@ def open( return BeamBlobReader( blob, chunk_size=read_buffer_size, - enable_read_bucket_metric=self.enable_read_bucket_metric) + enable_read_bucket_metric=self.enable_read_bucket_metric, + retry=self._storage_client_retry) elif mode == 'w' or mode 
== 'wb': blob = bucket.blob(blob_name) return BeamBlobWriter( blob, mime_type, - enable_write_bucket_metric=self.enable_write_bucket_metric) + enable_write_bucket_metric=self.enable_write_bucket_metric, + retry=self._storage_client_retry) else: raise ValueError('Invalid file open mode: %s.' % mode) - @retry.with_exponential_backoff( - retry_filter=retry.retry_on_server_errors_and_timeout_filter) def delete(self, path): """Deletes the object at the given GCS path. @@ -243,14 +247,24 @@ def delete(self, path): path: GCS file path pattern in the form gs:///. """ bucket_name, blob_name = parse_gcs_path(path) + bucket = self.client.bucket(bucket_name) + if self._use_blob_generation: + # blob can be None if not found + blob = bucket.get_blob(blob_name, retry=self._storage_client_retry) + generation = getattr(blob, "generation", None) + else: + generation = None try: - bucket = self.client.bucket(bucket_name) - bucket.delete_blob(blob_name) + bucket.delete_blob( + blob_name, + if_generation_match=generation, + retry=self._storage_client_retry) except NotFound: return def delete_batch(self, paths): """Deletes the objects at the given GCS paths. + Warning: any exception during batch delete will NOT be retried. Args: paths: List of GCS file path patterns or Dict with GCS file path patterns @@ -287,8 +301,6 @@ def delete_batch(self, paths): return final_results - @retry.with_exponential_backoff( - retry_filter=retry.retry_on_server_errors_and_timeout_filter) def copy(self, src, dest): """Copies the given GCS object from src to dest. @@ -297,19 +309,32 @@ def copy(self, src, dest): dest: GCS file path pattern in the form gs:///. Raises: - TimeoutError: on timeout. + Any exceptions during copying """ src_bucket_name, src_blob_name = parse_gcs_path(src) dest_bucket_name, dest_blob_name= parse_gcs_path(dest, object_optional=True) src_bucket = self.client.bucket(src_bucket_name) - src_blob = src_bucket.blob(src_blob_name) + if self._use_blob_generation: + src_blob = src_bucket.get_blob(src_blob_name) + if src_blob is None: + raise NotFound("source blob %s not found during copying" % src) + src_generation = src_blob.generation + else: + src_blob = src_bucket.blob(src_blob_name) + src_generation = None dest_bucket = self.client.bucket(dest_bucket_name) if not dest_blob_name: dest_blob_name = None - src_bucket.copy_blob(src_blob, dest_bucket, new_name=dest_blob_name) + src_bucket.copy_blob( + src_blob, + dest_bucket, + new_name=dest_blob_name, + source_generation=src_generation, + retry=self._storage_client_retry) def copy_batch(self, src_dest_pairs): """Copies the given GCS objects from src to dest. + Warning: any exception during batch copy will NOT be retried. 
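A small sketch of the new `enable_gcsio_blob_generation` behaviour, mirroring the integration tests further down; bucket and object names are placeholders:

```python
# With enable_gcsio_blob_generation set, copy() passes source_generation and
# delete() passes if_generation_match, so each operation is pinned to the
# generation observed just before the call. Paths are placeholders.
from apache_beam.io.gcp import gcsio

gcs = gcsio.GcsIO(pipeline_options={'enable_gcsio_blob_generation': True})
gcs.copy('gs://src-bucket/a.txt', 'gs://dst-bucket/a.txt')
gcs.delete('gs://dst-bucket/a.txt')  # NotFound is swallowed if already gone
```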
Args: src_dest_pairs: list of (src, dest) tuples of gs:/// files @@ -450,8 +475,6 @@ def _status(self, path): file_status['size'] = gcs_object.size return file_status - @retry.with_exponential_backoff( - retry_filter=retry.retry_on_server_errors_and_timeout_filter) def _gcs_object(self, path): """Returns a gcs object for the given path @@ -462,7 +485,7 @@ def _gcs_object(self, path): """ bucket_name, blob_name = parse_gcs_path(path) bucket = self.client.bucket(bucket_name) - blob = bucket.get_blob(blob_name) + blob = bucket.get_blob(blob_name, retry=self._storage_client_retry) if blob: return blob else: @@ -510,7 +533,8 @@ def list_files(self, path, with_metadata=False): else: _LOGGER.debug("Starting the size estimation of the input") bucket = self.client.bucket(bucket_name) - response = self.client.list_blobs(bucket, prefix=prefix) + response = self.client.list_blobs( + bucket, prefix=prefix, retry=self._storage_client_retry) for item in response: file_name = 'gs://%s/%s' % (item.bucket.name, item.name) if file_name not in file_info: @@ -546,8 +570,7 @@ def _updated_to_seconds(updated): def is_soft_delete_enabled(self, gcs_path): try: bucket_name, _ = parse_gcs_path(gcs_path) - # set retry timeout to 5 seconds when checking soft delete policy - bucket = self.get_bucket(bucket_name, retry=DEFAULT_RETRY.with_timeout(5)) + bucket = self.get_bucket(bucket_name) if (bucket.soft_delete_policy is not None and bucket.soft_delete_policy.retention_duration_seconds > 0): return True @@ -563,8 +586,9 @@ def __init__( self, blob, chunk_size=DEFAULT_READ_BUFFER_SIZE, - enable_read_bucket_metric=False): - super().__init__(blob, chunk_size=chunk_size) + enable_read_bucket_metric=False, + retry=DEFAULT_RETRY): + super().__init__(blob, chunk_size=chunk_size, retry=retry) self.enable_read_bucket_metric = enable_read_bucket_metric self.mode = "r" @@ -585,13 +609,14 @@ def __init__( content_type, chunk_size=16 * 1024 * 1024, ignore_flush=True, - enable_write_bucket_metric=False): + enable_write_bucket_metric=False, + retry=DEFAULT_RETRY): super().__init__( blob, content_type=content_type, chunk_size=chunk_size, ignore_flush=ignore_flush, - retry=DEFAULT_RETRY) + retry=retry) self.mode = "w" self.enable_write_bucket_metric = enable_write_bucket_metric diff --git a/sdks/python/apache_beam/io/gcp/gcsio_integration_test.py b/sdks/python/apache_beam/io/gcp/gcsio_integration_test.py index fad638136804..07a5fb5df553 100644 --- a/sdks/python/apache_beam/io/gcp/gcsio_integration_test.py +++ b/sdks/python/apache_beam/io/gcp/gcsio_integration_test.py @@ -34,6 +34,7 @@ import mock import pytest +from parameterized import parameterized_class from apache_beam.io.filesystems import FileSystems from apache_beam.options.pipeline_options import GoogleCloudOptions @@ -51,6 +52,9 @@ @unittest.skipIf(gcsio is None, 'GCP dependencies are not installed') +@parameterized_class( + ('no_gcsio_throttling_counter', 'enable_gcsio_blob_generation'), + [(False, False), (False, True), (True, False), (True, True)]) class GcsIOIntegrationTest(unittest.TestCase): INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt' @@ -67,7 +71,6 @@ def setUp(self): self.gcs_tempdir = ( self.test_pipeline.get_option('temp_location') + '/gcs_it-' + str(uuid.uuid4())) - self.gcsio = gcsio.GcsIO() def tearDown(self): FileSystems.delete([self.gcs_tempdir + '/']) @@ -92,14 +95,47 @@ def _verify_copy(self, src, dest, dest_kms_key_name=None): @pytest.mark.it_postcommit def test_copy(self): + self.gcsio = gcsio.GcsIO( + pipeline_options={ + 
"no_gcsio_throttling_counter": self.no_gcsio_throttling_counter, + "enable_gcsio_blob_generation": self.enable_gcsio_blob_generation + }) + src = self.INPUT_FILE + dest = self.gcs_tempdir + '/test_copy' + + self.gcsio.copy(src, dest) + self._verify_copy(src, dest) + + unknown_src = self.test_pipeline.get_option('temp_location') + \ + '/gcs_it-' + str(uuid.uuid4()) + with self.assertRaises(NotFound): + self.gcsio.copy(unknown_src, dest) + + @pytest.mark.it_postcommit + def test_copy_and_delete(self): + self.gcsio = gcsio.GcsIO( + pipeline_options={ + "no_gcsio_throttling_counter": self.no_gcsio_throttling_counter, + "enable_gcsio_blob_generation": self.enable_gcsio_blob_generation + }) src = self.INPUT_FILE dest = self.gcs_tempdir + '/test_copy' self.gcsio.copy(src, dest) self._verify_copy(src, dest) + self.gcsio.delete(dest) + + # no exception if we delete an nonexistent file. + self.gcsio.delete(dest) + @pytest.mark.it_postcommit def test_batch_copy_and_delete(self): + self.gcsio = gcsio.GcsIO( + pipeline_options={ + "no_gcsio_throttling_counter": self.no_gcsio_throttling_counter, + "enable_gcsio_blob_generation": self.enable_gcsio_blob_generation + }) num_copies = 10 srcs = [self.INPUT_FILE] * num_copies dests = [ @@ -152,6 +188,7 @@ def test_batch_copy_and_delete(self): @mock.patch('apache_beam.io.gcp.gcsio.default_gcs_bucket_name') @unittest.skipIf(NotFound is None, 'GCP dependencies are not installed') def test_create_default_bucket(self, mock_default_gcs_bucket_name): + self.gcsio = gcsio.GcsIO() google_cloud_options = self.test_pipeline.options.view_as( GoogleCloudOptions) # overwrite kms option here, because get_or_create_default_gcs_bucket() diff --git a/sdks/python/apache_beam/io/gcp/gcsio_retry.py b/sdks/python/apache_beam/io/gcp/gcsio_retry.py new file mode 100644 index 000000000000..29fd71c5195b --- /dev/null +++ b/sdks/python/apache_beam/io/gcp/gcsio_retry.py @@ -0,0 +1,71 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Throttling Handler for GCSIO +""" + +import inspect +import logging +import math + +from google.api_core import exceptions as api_exceptions +from google.api_core import retry +from google.cloud.storage.retry import DEFAULT_RETRY +from google.cloud.storage.retry import _should_retry # pylint: disable=protected-access + +from apache_beam.metrics.metric import Metrics +from apache_beam.options.pipeline_options import GoogleCloudOptions + +_LOGGER = logging.getLogger(__name__) + +__all__ = ['DEFAULT_RETRY_WITH_THROTTLING_COUNTER'] + + +class ThrottlingHandler(object): + _THROTTLED_SECS = Metrics.counter('gcsio', "cumulativeThrottlingSeconds") + + def __call__(self, exc): + if isinstance(exc, api_exceptions.TooManyRequests): + _LOGGER.debug('Caught GCS quota error (%s), retrying.', exc.reason) + # TODO: revisit the logic here when gcs client library supports error + # callbacks + frame = inspect.currentframe() + if frame is None: + _LOGGER.warning('cannot inspect the current stack frame') + return + + prev_frame = frame.f_back + if prev_frame is None: + _LOGGER.warning('cannot inspect the caller stack frame') + return + + # next_sleep is one of the arguments in the caller + # i.e. _retry_error_helper() in google/api_core/retry/retry_base.py + sleep_seconds = prev_frame.f_locals.get("next_sleep", 0) + ThrottlingHandler._THROTTLED_SECS.inc(math.ceil(sleep_seconds)) + + +DEFAULT_RETRY_WITH_THROTTLING_COUNTER = retry.Retry( + predicate=_should_retry, on_error=ThrottlingHandler()) + + +def get_retry(pipeline_options): + if pipeline_options.view_as(GoogleCloudOptions).no_gcsio_throttling_counter: + return DEFAULT_RETRY + else: + return DEFAULT_RETRY_WITH_THROTTLING_COUNTER diff --git a/sdks/python/apache_beam/io/gcp/gcsio_retry_test.py b/sdks/python/apache_beam/io/gcp/gcsio_retry_test.py new file mode 100644 index 000000000000..750879ae0284 --- /dev/null +++ b/sdks/python/apache_beam/io/gcp/gcsio_retry_test.py @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
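A quick sketch of what the new retry object does in practice (mirroring `gcsio_retry_test.py` below): 429 responses are retried via the standard storage predicate, and the back-off time chosen by `api_core` feeds the `gcsio` / `cumulativeThrottlingSeconds` counter whenever a metrics container is active.

```python
# Sketch of the throttling-counting retry; values and the flaky callable are
# illustrative. Default options leave no_gcsio_throttling_counter unset, so
# get_retry() returns DEFAULT_RETRY_WITH_THROTTLING_COUNTER.
from unittest.mock import Mock

from google.api_core import exceptions as api_exceptions

from apache_beam.io.gcp import gcsio_retry
from apache_beam.options.pipeline_options import PipelineOptions

flaky = Mock(side_effect=[api_exceptions.TooManyRequests('slow down'), 'ok'])

retry = gcsio_retry.get_retry(PipelineOptions([]))
assert retry(flaky)() == 'ok'  # first call raises 429, second succeeds
```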
+# + +"""Tests for Throttling Handler of GCSIO.""" + +import unittest +from unittest.mock import Mock + +from apache_beam.metrics.execution import MetricsContainer +from apache_beam.metrics.execution import MetricsEnvironment +from apache_beam.metrics.metricbase import MetricName +from apache_beam.runners.worker import statesampler +from apache_beam.utils import counters + +try: + from apache_beam.io.gcp import gcsio_retry + from google.api_core import exceptions as api_exceptions +except ImportError: + gcsio_retry = None + api_exceptions = None + + +@unittest.skipIf((gcsio_retry is None or api_exceptions is None), + 'GCP dependencies are not installed') +class TestGCSIORetry(unittest.TestCase): + def test_retry_on_non_retriable(self): + mock = Mock(side_effect=[ + Exception('Something wrong!'), + ]) + retry = gcsio_retry.DEFAULT_RETRY_WITH_THROTTLING_COUNTER + with self.assertRaises(Exception): + retry(mock)() + + def test_retry_on_throttling(self): + mock = Mock( + side_effect=[ + api_exceptions.TooManyRequests("Slow down!"), + api_exceptions.TooManyRequests("Slow down again!"), + 12345 + ]) + retry = gcsio_retry.DEFAULT_RETRY_WITH_THROTTLING_COUNTER + + sampler = statesampler.StateSampler('', counters.CounterFactory()) + statesampler.set_current_tracker(sampler) + state = sampler.scoped_state( + 'my_step', 'my_state', metrics_container=MetricsContainer('my_step')) + try: + sampler.start() + with state: + container = MetricsEnvironment.current_container() + + self.assertEqual( + container.get_counter( + MetricName('gcsio', + "cumulativeThrottlingSeconds")).get_cumulative(), + 0) + + self.assertEqual(12345, retry(mock)()) + + self.assertGreater( + container.get_counter( + MetricName('gcsio', + "cumulativeThrottlingSeconds")).get_cumulative(), + 1) + finally: + sampler.stop() + + +if __name__ == '__main__': + unittest.main() diff --git a/sdks/python/apache_beam/io/gcp/gcsio_test.py b/sdks/python/apache_beam/io/gcp/gcsio_test.py index 407295f2fb30..19df15dcf7fa 100644 --- a/sdks/python/apache_beam/io/gcp/gcsio_test.py +++ b/sdks/python/apache_beam/io/gcp/gcsio_test.py @@ -20,6 +20,7 @@ import logging import os +import random import unittest from datetime import datetime @@ -36,6 +37,7 @@ try: from apache_beam.io.gcp import gcsio + from apache_beam.io.gcp.gcsio_retry import DEFAULT_RETRY_WITH_THROTTLING_COUNTER from google.cloud.exceptions import BadRequest, NotFound except ImportError: NotFound = None @@ -85,7 +87,7 @@ def get_file(self, bucket, blob): holder = folder.get_blob(blob.name) return holder - def list_blobs(self, bucket_or_path, prefix=None): + def list_blobs(self, bucket_or_path, prefix=None, **unused_kwargs): bucket = self.get_bucket(bucket_or_path.name) if not prefix: return list(bucket.blobs.values()) @@ -120,7 +122,7 @@ def add_blob(self, blob): def blob(self, name): return self._create_blob(name) - def copy_blob(self, blob, dest, new_name=None): + def copy_blob(self, blob, dest, new_name=None, **kwargs): if self.get_blob(blob.name) is None: raise NotFound("source blob not found") if not new_name: @@ -129,7 +131,7 @@ def copy_blob(self, blob, dest, new_name=None): dest.add_blob(new_blob) return new_blob - def get_blob(self, blob_name): + def get_blob(self, blob_name, **unused_kwargs): bucket = self._get_canonical_bucket() if blob_name in bucket.blobs: return bucket.blobs[blob_name] @@ -146,7 +148,7 @@ def lookup_blob(self, name): def set_default_kms_key_name(self, name): self.default_kms_key_name = name - def delete_blob(self, name): + def delete_blob(self, name, **kwargs): 
bucket = self._get_canonical_bucket() if name in bucket.blobs: del bucket.blobs[name] @@ -175,6 +177,7 @@ def __init__( self.updated = updated self._fail_when_getting_metadata = fail_when_getting_metadata self._fail_when_reading = fail_when_reading + self.generation = random.randint(0, (1 << 63) - 1) def delete(self): self.bucket.delete_blob(self.name) @@ -532,7 +535,10 @@ def test_file_buffered_read_call(self): with mock.patch('apache_beam.io.gcp.gcsio.BeamBlobReader') as reader: self.gcs.open(file_name, read_buffer_size=read_buffer_size) reader.assert_called_with( - blob, chunk_size=read_buffer_size, enable_read_bucket_metric=False) + blob, + chunk_size=read_buffer_size, + enable_read_bucket_metric=False, + retry=DEFAULT_RETRY_WITH_THROTTLING_COUNTER) def test_file_write_call(self): file_name = 'gs://gcsio-test/write_file' diff --git a/sdks/python/apache_beam/io/gcp/pubsub.py b/sdks/python/apache_beam/io/gcp/pubsub.py index 32e7fbe5ed58..b6f801c63f79 100644 --- a/sdks/python/apache_beam/io/gcp/pubsub.py +++ b/sdks/python/apache_beam/io/gcp/pubsub.py @@ -43,8 +43,11 @@ from apache_beam.io import iobase from apache_beam.io.iobase import Read from apache_beam.io.iobase import Write +from apache_beam.metrics.metric import Lineage +from apache_beam.transforms import DoFn from apache_beam.transforms import Flatten from apache_beam.transforms import Map +from apache_beam.transforms import ParDo from apache_beam.transforms import PTransform from apache_beam.transforms.display import DisplayDataItem from apache_beam.utils.annotations import deprecated @@ -257,7 +260,16 @@ def __init__( def expand(self, pvalue): # TODO(BEAM-27443): Apply a proper transform rather than Read. pcoll = pvalue.pipeline | Read(self._source) + # explicit element_type required after native read, otherwise coder error pcoll.element_type = bytes + return self.expand_continued(pcoll) + + def expand_continued(self, pcoll): + pcoll = pcoll | ParDo( + _AddMetricsPassThrough( + project=self._source.project, + topic=self._source.topic_name, + sub=self._source.subscription_name)).with_output_types(bytes) if self.with_attributes: pcoll = pcoll | Map(PubsubMessage._from_proto_str) pcoll.element_type = PubsubMessage @@ -269,6 +281,31 @@ def to_runner_api_parameter(self, context): return self.to_runner_api_pickled(context) +class _AddMetricsPassThrough(DoFn): + def __init__(self, project, topic=None, sub=None): + self.project = project + self.topic = topic + self.sub = sub + self.reported_lineage = False + + def setup(self): + self.reported_lineage = False + + def process(self, element: bytes): + self.report_lineage_once() + yield element + + def report_lineage_once(self): + if not self.reported_lineage: + self.reported_lineage = True + if self.topic is not None: + Lineage.sources().add( + 'pubsub', self.project, self.topic, subtype='topic') + elif self.sub is not None: + Lineage.sources().add( + 'pubsub', self.project, self.sub, subtype='subscription') + + @deprecated(since='2.7.0', extra_message='Use ReadFromPubSub instead.') def ReadStringsFromPubSub(topic=None, subscription=None, id_label=None): return _ReadStringsFromPubSub(topic, subscription, id_label) @@ -314,6 +351,26 @@ def expand(self, pcoll): return pcoll | WriteToPubSub(self.topic) +class _AddMetricsAndMap(DoFn): + def __init__(self, fn, project, topic=None): + self.project = project + self.topic = topic + self.fn = fn + self.reported_lineage = False + + def setup(self): + self.reported_lineage = False + + def process(self, element): + self.report_lineage_once() + yield 
self.fn(element) + + def report_lineage_once(self): + if not self.reported_lineage: + self.reported_lineage = True + Lineage.sinks().add('pubsub', self.project, self.topic, subtype='topic') + + class WriteToPubSub(PTransform): """A ``PTransform`` for writing messages to Cloud Pub/Sub.""" @@ -364,9 +421,15 @@ def bytes_to_proto_str(element: Union[bytes, str]) -> bytes: def expand(self, pcoll): if self.with_attributes: - pcoll = pcoll | 'ToProtobufX' >> Map(self.message_to_proto_str) + pcoll = pcoll | 'ToProtobufX' >> ParDo( + _AddMetricsAndMap( + self.message_to_proto_str, self.project, + self.topic_name)).with_input_types(PubsubMessage) else: - pcoll = pcoll | 'ToProtobufY' >> Map(self.bytes_to_proto_str) + pcoll = pcoll | 'ToProtobufY' >> ParDo( + _AddMetricsAndMap( + self.bytes_to_proto_str, self.project, + self.topic_name)).with_input_types(Union[bytes, str]) pcoll.element_type = bytes return pcoll | Write(self._sink) diff --git a/sdks/python/apache_beam/io/gcp/pubsub_test.py b/sdks/python/apache_beam/io/gcp/pubsub_test.py index f704338626ee..2e3e9b301618 100644 --- a/sdks/python/apache_beam/io/gcp/pubsub_test.py +++ b/sdks/python/apache_beam/io/gcp/pubsub_test.py @@ -38,6 +38,7 @@ from apache_beam.io.gcp.pubsub import WriteToPubSub from apache_beam.io.gcp.pubsub import _PubSubSink from apache_beam.io.gcp.pubsub import _PubSubSource +from apache_beam.metrics.metric import Lineage from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import StandardOptions from apache_beam.portability import common_urns @@ -819,6 +820,30 @@ def test_runner_api_transformation_with_subscription( 'projects/fakeprj/subscriptions/a_subscription', transform_from_proto.source.full_subscription) + def test_read_from_pubsub_no_overwrite(self, unused_mock): + expected_elements = [ + TestWindowedValue( + b'apache', + timestamp.Timestamp(1520861826.234567), [window.GlobalWindow()]), + TestWindowedValue( + b'beam', + timestamp.Timestamp(1520861824.234567), [window.GlobalWindow()]) + ] + options = PipelineOptions([]) + options.view_as(StandardOptions).streaming = True + for test_case in ('topic', 'subscription'): + with TestPipeline(options=options) as p: + # Direct runner currently overwrites the whole ReadFromPubSub transform. + # This test part of composite transform without overwrite. 
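The `_AddMetricsPassThrough` and `_AddMetricsAndMap` DoFns above both follow a report-once pattern: record the lineage metric on the first element a DoFn instance sees, then pass elements through unchanged. A stripped-down sketch of that pattern (the class name is made up; the `Lineage` call mirrors the patch):

```python
# Report-once pass-through DoFn, as used for Pub/Sub source lineage above.
import apache_beam as beam
from apache_beam.metrics.metric import Lineage


class ReportTopicLineageOnce(beam.DoFn):
  def __init__(self, project, topic):
    self.project = project
    self.topic = topic
    self.reported_lineage = False

  def setup(self):
    # Reset per DoFn instance so each worker reports at most once.
    self.reported_lineage = False

  def process(self, element):
    if not self.reported_lineage:
      self.reported_lineage = True
      Lineage.sources().add('pubsub', self.project, self.topic, subtype='topic')
    yield element
```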
+ pcoll = p | beam.Create([b'apache', b'beam']) | beam.Map( + lambda x: window.TimestampedValue(x, 1520861820.234567 + len(x))) + args = {test_case: f'projects/fakeprj/{test_case}s/topic_or_sub'} + pcoll = ReadFromPubSub(**args).expand_continued(pcoll) + assert_that(pcoll, equal_to(expected_elements), reify_windows=True) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SOURCE), + set([f"pubsub:{test_case}:fakeprj.topic_or_sub"])) + @unittest.skipIf(pubsub is None, 'GCP dependencies are not installed') @mock.patch('google.cloud.pubsub.PublisherClient') @@ -974,6 +999,38 @@ def test_runner_api_transformation_properties_none(self, unused_mock_pubsub): self.assertIsNone(transform_from_proto.sink.id_label) self.assertIsNone(transform_from_proto.sink.timestamp_attribute) + def test_write_to_pubsub_no_overwrite(self, unused_mock): + data = 'data' + payloads = [data] + + options = PipelineOptions([]) + options.view_as(StandardOptions).streaming = True + with TestPipeline(options=options) as p: + pcoll = p | Create(payloads) + WriteToPubSub( + 'projects/fakeprj/topics/a_topic', + with_attributes=False).expand(pcoll) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set(["pubsub:topic:fakeprj.a_topic"])) + + def test_write_to_pubsub_with_attributes_no_overwrite(self, unused_mock): + data = b'data' + attributes = {'key': 'value'} + payloads = [PubsubMessage(data, attributes)] + + options = PipelineOptions([]) + options.view_as(StandardOptions).streaming = True + with TestPipeline(options=options) as p: + pcoll = p | Create(payloads) + # Avoid direct runner overwrites WriteToPubSub + WriteToPubSub( + 'projects/fakeprj/topics/a_topic', + with_attributes=True).expand(pcoll) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set(["pubsub:topic:fakeprj.a_topic"])) + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/io/textio_test.py b/sdks/python/apache_beam/io/textio_test.py index b134d615e20e..d1bfdf6bfd35 100644 --- a/sdks/python/apache_beam/io/textio_test.py +++ b/sdks/python/apache_beam/io/textio_test.py @@ -1726,6 +1726,48 @@ def test_csv_read_write(self): assert_that(pcoll, equal_to(records)) + def test_non_utf8_csv_read_write(self): + content = b"\xe0,\xe1,\xe2\n0,1,2\n1,2,3\n" + + with tempfile.TemporaryDirectory() as dest: + input_fn = os.path.join(dest, 'input.csv') + with open(input_fn, 'wb') as f: + f.write(content) + + with TestPipeline() as p: + r1 = ( + p + | 'Read' >> beam.io.ReadFromCsv(input_fn, encoding="latin1") + | 'ToDict' >> beam.Map(lambda x: x._asdict())) + assert_that( + r1, + equal_to([{ + "\u00e0": 0, "\u00e1": 1, "\u00e2": 2 + }, { + "\u00e0": 1, "\u00e1": 2, "\u00e2": 3 + }])) + + with TestPipeline() as p: + _ = ( + p + | 'Read' >> beam.io.ReadFromCsv(input_fn, encoding="latin1") + | 'Write' >> beam.io.WriteToCsv( + os.path.join(dest, 'out'), encoding="latin1")) + + with TestPipeline() as p: + r2 = ( + p + | 'Read' >> beam.io.ReadFromCsv( + os.path.join(dest, 'out*'), encoding="latin1") + | 'ToDict' >> beam.Map(lambda x: x._asdict())) + assert_that( + r2, + equal_to([{ + "\u00e0": 0, "\u00e1": 1, "\u00e2": 2 + }, { + "\u00e0": 1, "\u00e1": 2, "\u00e2": 3 + }])) + class JsonTest(unittest.TestCase): def test_json_read_write(self): diff --git a/sdks/python/apache_beam/ml/gcp/visionml_test_it.py b/sdks/python/apache_beam/ml/gcp/visionml_test_it.py index ea3fc9768ff5..00fd38704a02 100644 --- a/sdks/python/apache_beam/ml/gcp/visionml_test_it.py +++ 
b/sdks/python/apache_beam/ml/gcp/visionml_test_it.py @@ -64,17 +64,18 @@ def test_text_detection_with_language_hint(self): context_side_input=beam.pvalue.AsDict(contexts)) | beam.ParDo(extract)) - assert_that( - output, - equal_to([ - 'WAITING?\nPLEASE\nTURN OFF\nYOUR\nENGINE', - 'WAITING?', - 'PLEASE', - 'TURN', - 'OFF', - 'YOUR', - 'ENGINE' - ])) + assert_that( + output, + equal_to([ + 'WAITING?\nPLEASE\nTURN OFF\nYOUR\nENGINE', + 'WAITING', + '?', + 'PLEASE', + 'TURN', + 'OFF', + 'YOUR', + 'ENGINE' + ])) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/ml/inference/base_test.py b/sdks/python/apache_beam/ml/inference/base_test.py index 767721327842..31f02c9c61c5 100644 --- a/sdks/python/apache_beam/ml/inference/base_test.py +++ b/sdks/python/apache_beam/ml/inference/base_test.py @@ -878,21 +878,22 @@ def test_run_inference_timeout_not_hit(self): bad_without_error, equal_to(expected_bad), label='assert:failures') @unittest.skipIf( - sys.version_info < (3, 11), + sys.platform == "win32" or sys.version_info < (3, 11), "This test relies on the __del__ lifecycle method, but __del__ does " + - "not get invoked in the same way on older versions of Python, " + - "breaking this test. See " + + "not get invoked in the same way on older versions of Python or on " + + "windows, breaking this test. See " + "github.com/python/cpython/issues/87950#issuecomment-1807570983 " + "for example.") def test_run_inference_timeout_does_garbage_collection(self): with tempfile.TemporaryDirectory() as tmp_dirname: tmp_path = os.path.join(tmp_dirname, 'tmp_filename') + expected_file_contents = 'Deleted FakeSlowModel' with TestPipeline() as pipeline: # Start with bad example which gets timed out. # Then provide plenty of time for GC to happen. - examples = [20] + [1] * 15 + [20, 20, 20] + examples = [20] + [1] * 15 expected_good = [1] * 15 - expected_bad = [20, 20, 20, 20] + expected_bad = [20] pcoll = pipeline | 'start' >> beam.Create(examples) main, other = pcoll | base.RunInference( FakeSlowModelHandler( @@ -909,7 +910,7 @@ def test_run_inference_timeout_does_garbage_collection(self): with open(tmp_path) as f: s = f.read() - self.assertNotEqual(s, '') + self.assertEqual(s, expected_file_contents) def test_run_inference_impl_inference_args(self): with TestPipeline() as pipeline: diff --git a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile new file mode 100644 index 000000000000..5abbffdc5a2a --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Used for any vLLM integration test + +FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 + +RUN apt update +RUN apt install software-properties-common -y +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt update + +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt install python3.12 -y +RUN apt install python3.12-venv -y +RUN apt install python3.12-dev -y +RUN rm /usr/bin/python3 +RUN ln -s python3.12 /usr/bin/python3 +RUN python3 --version +RUN apt-get install -y curl +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 && pip install --upgrade pip + +RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.58.1 +RUN pip install openai vllm + +RUN apt install libcairo2-dev pkg-config python3-dev -y +RUN pip install pycairo + +# Copy the Apache Beam worker dependencies from the Beam Python 3.8 SDK image. +COPY --from=apache/beam_python3.12_sdk:2.58.1 /opt/apache/beam /opt/apache/beam + +# Set the entrypoint to Apache Beam SDK worker launcher. +ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/sdks/python/apache_beam/ml/inference/vllm_inference.py b/sdks/python/apache_beam/ml/inference/vllm_inference.py new file mode 100644 index 000000000000..28890083d93e --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/vllm_inference.py @@ -0,0 +1,312 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# pytype: skip-file + +import logging +import os +import subprocess +import threading +import time +import uuid +from dataclasses import dataclass +from typing import Any +from typing import Dict +from typing import Iterable +from typing import Optional +from typing import Sequence +from typing import Tuple + +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.utils import subprocess_server +from openai import OpenAI + +try: + import vllm # pylint: disable=unused-import + logging.info('vllm module successfully imported.') +except ModuleNotFoundError: + msg = 'vllm module was not found. This is ok as long as the specified ' \ + 'runner has vllm dependencies installed.' + logging.warning(msg) + +__all__ = [ + 'OpenAIChatMessage', + 'VLLMCompletionsModelHandler', + 'VLLMChatModelHandler', +] + + +@dataclass(frozen=True) +class OpenAIChatMessage(): + """" + Dataclass containing previous chat messages in conversation. + Role is the entity that sent the message (either 'user' or 'system'). + Content is the contents of the message. 
+ """ + role: str + content: str + + +def start_process(cmd) -> Tuple[subprocess.Popen, int]: + port, = subprocess_server.pick_port(None) + cmd = [arg.replace('{{PORT}}', str(port)) for arg in cmd] # pylint: disable=not-an-iterable + logging.info("Starting service with %s", str(cmd).replace("',", "'")) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + # Emit the output of this command as info level logging. + def log_stdout(): + line = process.stdout.readline() + while line: + # The log obtained from stdout is bytes, decode it into string. + # Remove newline via rstrip() to not print an empty line. + logging.info(line.decode(errors='backslashreplace').rstrip()) + line = process.stdout.readline() + + t = threading.Thread(target=log_stdout) + t.daemon = True + t.start() + return process, port + + +def getVLLMClient(port) -> OpenAI: + openai_api_key = "EMPTY" + openai_api_base = f"http://localhost:{port}/v1" + return OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + +class _VLLMModelServer(): + def __init__(self, model_name: str, vllm_server_kwargs: Dict[str, str]): + self._model_name = model_name + self._vllm_server_kwargs = vllm_server_kwargs + self._server_started = False + self._server_process = None + self._server_port: int = -1 + + self.start_server() + + def start_server(self, retries=3): + if not self._server_started: + server_cmd = [ + 'python', + '-m', + 'vllm.entrypoints.openai.api_server', + '--model', + self._model_name, + '--port', + '{{PORT}}', + ] + for k, v in self._vllm_server_kwargs.items(): + server_cmd.append(f'--{k}') + server_cmd.append(v) + self._server_process, self._server_port = start_process(server_cmd) + + self.check_connectivity() + + def get_server_port(self) -> int: + if not self._server_started: + self.start_server() + return self._server_port + + def check_connectivity(self, retries=3): + client = getVLLMClient(self._server_port) + while self._server_process.poll() is None: + try: + models = client.models.list().data + logging.info('models: %s' % models) + if len(models) > 0: + self._server_started = True + return + except: # pylint: disable=bare-except + pass + # Sleep while bringing up the process + time.sleep(5) + + if retries == 0: + self._server_started = False + raise Exception( + "Failed to start vLLM server, polling process exited with code " + + "%s. Next time a request is tried, the server will be restarted" % + self._server_process.poll()) + else: + self.start_server(retries - 1) + + +class VLLMCompletionsModelHandler(ModelHandler[str, + PredictionResult, + _VLLMModelServer]): + def __init__( + self, + model_name: str, + vllm_server_kwargs: Optional[Dict[str, str]] = None): + """Implementation of the ModelHandler interface for vLLM using text as + input. + + Example Usage:: + + pcoll | RunInference(VLLMModelHandler(model_name='facebook/opt-125m')) + + Args: + model_name: The vLLM model. See + https://docs.vllm.ai/en/latest/models/supported_models.html for + supported models. + vllm_server_kwargs: Any additional kwargs to be passed into your vllm + server when it is being created. Will be invoked using + `python -m vllm.entrypoints.openai.api_serverv + `. For example, you could pass + `{'echo': 'true'}` to prepend new messages with the previous message. 
+ For a list of possible kwargs, see + https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api + """ + self._model_name = model_name + self._vllm_server_kwargs: Dict[str, str] = vllm_server_kwargs or {} + self._env_vars = {} + + def load_model(self) -> _VLLMModelServer: + return _VLLMModelServer(self._model_name, self._vllm_server_kwargs) + + def run_inference( + self, + batch: Sequence[str], + model: _VLLMModelServer, + inference_args: Optional[Dict[str, Any]] = None + ) -> Iterable[PredictionResult]: + """Runs inferences on a batch of text strings. + + Args: + batch: A sequence of examples as text strings. + model: A _VLLMModelServer containing info for connecting to the server. + inference_args: Any additional arguments for an inference. + + Returns: + An Iterable of type PredictionResult. + """ + client = getVLLMClient(model.get_server_port()) + inference_args = inference_args or {} + predictions = [] + # TODO(https://github.com/apache/beam/issues/32528): We should add support + # for taking in batches and doing a bunch of async calls. That will end up + # being more efficient when we can do in bundle batching. + for prompt in batch: + try: + completion = client.completions.create( + model=self._model_name, prompt=prompt, **inference_args) + predictions.append(completion) + except Exception as e: + model.check_connectivity() + raise e + + return [PredictionResult(x, y) for x, y in zip(batch, predictions)] + + def share_model_across_processes(self) -> bool: + return True + + +class VLLMChatModelHandler(ModelHandler[Sequence[OpenAIChatMessage], + PredictionResult, + _VLLMModelServer]): + def __init__( + self, + model_name: str, + chat_template_path: Optional[str] = None, + vllm_server_kwargs: Optional[Dict[str, str]] = None): + """ Implementation of the ModelHandler interface for vLLM using previous + messages as input. + + Example Usage:: + + pcoll | RunInference(VLLMModelHandler(model_name='facebook/opt-125m')) + + Args: + model_name: The vLLM model. See + https://docs.vllm.ai/en/latest/models/supported_models.html for + supported models. + chat_template_path: Path to a chat template. This file must be accessible + from your runner's execution environment, so it is recommended to use + a cloud based file storage system (e.g. Google Cloud Storage). + For info on chat templates, see: + https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-template + vllm_server_kwargs: Any additional kwargs to be passed into your vllm + server when it is being created. Will be invoked using + `python -m vllm.entrypoints.openai.api_serverv + `. For example, you could pass + `{'echo': 'true'}` to prepend new messages with the previous message. 
+ For a list of possible kwargs, see + https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api + """ + self._model_name = model_name + self._vllm_server_kwargs: Dict[str, str] = vllm_server_kwargs or {} + self._env_vars = {} + self._chat_template_path = chat_template_path + self._chat_file = f'template-{uuid.uuid4().hex}.jinja' + + def load_model(self) -> _VLLMModelServer: + chat_template_contents = '' + if self._chat_template_path is not None: + local_chat_template_path = os.path.join(os.getcwd(), self._chat_file) + if not os.path.exists(local_chat_template_path): + with FileSystems.open(self._chat_template_path) as fin: + chat_template_contents = fin.read().decode() + with open(local_chat_template_path, 'a') as f: + f.write(chat_template_contents) + self._vllm_server_kwargs['chat_template'] = local_chat_template_path + + return _VLLMModelServer(self._model_name, self._vllm_server_kwargs) + + def run_inference( + self, + batch: Sequence[Sequence[OpenAIChatMessage]], + model: _VLLMModelServer, + inference_args: Optional[Dict[str, Any]] = None + ) -> Iterable[PredictionResult]: + """Runs inferences on a batch of text strings. + + Args: + batch: A sequence of examples as OpenAI messages. + model: A _VLLMModelServer for connecting to the spun up server. + inference_args: Any additional arguments for an inference. + + Returns: + An Iterable of type PredictionResult. + """ + client = getVLLMClient(model.get_server_port()) + inference_args = inference_args or {} + predictions = [] + # TODO(https://github.com/apache/beam/issues/32528): We should add support + # for taking in batches and doing a bunch of async calls. That will end up + # being more efficient when we can do in bundle batching. + for messages in batch: + formatted = [] + for message in messages: + formatted.append({"role": message.role, "content": message.content}) + try: + completion = client.chat.completions.create( + model=self._model_name, messages=formatted, **inference_args) + predictions.append(completion) + except Exception as e: + model.check_connectivity() + raise e + + return [PredictionResult(x, y) for x, y in zip(batch, predictions)] + + def share_model_across_processes(self) -> bool: + return True diff --git a/sdks/python/apache_beam/options/pipeline_options.py b/sdks/python/apache_beam/options/pipeline_options.py index 0f8457a40a7b..4497ab0993a4 100644 --- a/sdks/python/apache_beam/options/pipeline_options.py +++ b/sdks/python/apache_beam/options/pipeline_options.py @@ -28,6 +28,7 @@ from typing import Dict from typing import List from typing import Optional +from typing import Sequence from typing import Type from typing import TypeVar @@ -185,9 +186,7 @@ def _add_argparse_args(cls, parser): By default the options classes will use command line arguments to initialize the options. """ - def __init__(self, flags=None, **kwargs): - # type: (Optional[List[str]], **Any) -> None - + def __init__(self, flags: Optional[Sequence[str]] = None, **kwargs) -> None: """Initialize an options class. The initializer will traverse all subclasses, add all their argparse @@ -216,6 +215,12 @@ def __init__(self, flags=None, **kwargs): # self._flags stores a list of not yet parsed arguments, typically, # command-line flags. This list is shared across different views. # See: view_as(). + if isinstance(flags, str): + # Unfortunately a single str passes the Iterable[str] test, as it is + # an iterable of single characters. This is almost certainly not the + # intent... 
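The vLLM handlers introduced above are ordinary RunInference model handlers. A rough usage sketch follows; it is illustrative only: it assumes a worker environment with vllm installed (for example a custom container like the test Dockerfile above), reuses the 'facebook/opt-125m' model name from the docstrings, and a chat-capable model or chat template may be needed for real chat traffic.

import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.vllm_inference import OpenAIChatMessage
from apache_beam.ml.inference.vllm_inference import VLLMChatModelHandler
from apache_beam.ml.inference.vllm_inference import VLLMCompletionsModelHandler

with beam.Pipeline() as p:
  # Plain-text completions: each element is a prompt string.
  _ = (
      p
      | 'Prompts' >> beam.Create(['Hello, my name is'])
      | 'Complete' >> RunInference(
          VLLMCompletionsModelHandler(model_name='facebook/opt-125m'))
      # Each output is a PredictionResult; .inference holds the completion.
      | 'GetCompletion' >> beam.Map(lambda result: result.inference))

  # Chat-style input: each element is a sequence of OpenAIChatMessage.
  _ = (
      p
      | 'Chats' >> beam.Create(
          [[OpenAIChatMessage(role='user', content='What is Apache Beam?')]])
      | 'Chat' >> RunInference(
          VLLMChatModelHandler(model_name='facebook/opt-125m')))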
+ raise ValueError( + "Flags must be an iterable of of strings, not a single string.") self._flags = flags # Build parser that will parse options recognized by the [sub]class of @@ -355,6 +360,9 @@ def get_all_options( 'used for internal purposes.' % (','.join(unknown_args))) i = 0 while i < len(unknown_args): + # End of argument parsing. + if unknown_args[i] == '--': + break # Treat all unary flags as booleans, and all binary argument values as # strings. if not unknown_args[i].startswith('-'): @@ -576,6 +584,14 @@ def _add_argparse_args(cls, parser): 'updating a pipeline or reloading the job state. ' 'This is not recommended for streaming jobs.') + parser.add_argument( + '--no_wait_until_finish', + default=False, + action='store_true', + help='By default, the "with" statement waits for the job to ' + 'complete. Set this flag to bypass this behavior and continue ' + 'execution immediately') + class StreamingOptions(PipelineOptions): @classmethod @@ -931,6 +947,18 @@ def _add_argparse_args(cls, parser): help= 'Create metrics reporting the approximate number of bytes written per ' 'bucket.') + parser.add_argument( + '--no_gcsio_throttling_counter', + default=False, + action='store_true', + help='Throttling counter in GcsIO is enabled by default. Set ' + '--no_gcsio_throttling_counter to avoid it.') + parser.add_argument( + '--enable_gcsio_blob_generation', + default=False, + action='store_true', + help='Use blob generation when mutating blobs in GCSIO to ' + 'mitigate race conditions at the cost of more HTTP requests.') def _create_default_gcs_bucket(self): try: @@ -947,6 +975,11 @@ def _create_default_gcs_bucket(self): # Log warning if soft delete policy is enabled in a gcs bucket # that is specified in an argument. def _warn_if_soft_delete_policy_enabled(self, arg_name): + # skip the check if it is in dry-run mode because the later step requires + # internet connection to access GCS + if self.view_as(TestOptions).dry_run: + return + gcs_path = getattr(self, arg_name, None) try: from apache_beam.io.gcp import gcsio diff --git a/sdks/python/apache_beam/options/pipeline_options_test.py b/sdks/python/apache_beam/options/pipeline_options_test.py index 61b227d9a246..c0616bc6451c 100644 --- a/sdks/python/apache_beam/options/pipeline_options_test.py +++ b/sdks/python/apache_beam/options/pipeline_options_test.py @@ -44,6 +44,12 @@ _LOGGER = logging.getLogger(__name__) +try: + import apache_beam.io.gcp.gcsio # pylint: disable=unused-import + has_gcsio = True +except ImportError: + has_gcsio = False + # Mock runners to use for validations. 
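A short sketch of how the new PipelineOptions behavior above plays out (flag values here are illustrative): passing a bare string for flags now raises ValueError, and parsing of unknown arguments stops at a literal '--'.

from apache_beam.options.pipeline_options import PipelineOptions

# Flags must be a sequence of strings; a single string now raises ValueError.
options = PipelineOptions(['--runner=DirectRunner', '--no_wait_until_finish'])

# Unknown arguments after a literal '--' are left unparsed by get_all_options().
passthrough = PipelineOptions(['--runner=DirectRunner', '--', '--not_a_beam_flag'])
print(passthrough.get_all_options(retain_unknown_options=True))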
class MockRunners(object): @@ -711,6 +717,16 @@ def test_options_store_false_with_different_dest(self): "the dest and the flag name to the map " "_FLAG_THAT_SETS_FALSE_VALUE in PipelineOptions.py") + def _check_errors(self, options, validator, expected): + if has_gcsio: + with mock.patch('apache_beam.io.gcp.gcsio.GcsIO.is_soft_delete_enabled', + return_value=False): + errors = options._handle_temp_and_staging_locations(validator) + self.assertEqual(errors, expected) + else: + errors = options._handle_temp_and_staging_locations(validator) + self.assertEqual(errors, expected) + def test_validation_good_stg_good_temp(self): runner = MockRunners.DataflowRunner() options = GoogleCloudOptions([ @@ -719,8 +735,7 @@ def test_validation_good_stg_good_temp(self): '--temp_location=gs://beam/tmp' ]) validator = PipelineOptionsValidator(options, runner) - errors = options._handle_temp_and_staging_locations(validator) - self.assertEqual(errors, []) + self._check_errors(options, validator, []) self.assertEqual( options.get_all_options()['staging_location'], "gs://beam/stg") self.assertEqual( @@ -734,8 +749,7 @@ def test_validation_bad_stg_good_temp(self): '--temp_location=gs://beam/tmp' ]) validator = PipelineOptionsValidator(options, runner) - errors = options._handle_temp_and_staging_locations(validator) - self.assertEqual(errors, []) + self._check_errors(options, validator, []) self.assertEqual( options.get_all_options()['staging_location'], "gs://beam/tmp") self.assertEqual( @@ -749,8 +763,7 @@ def test_validation_good_stg_bad_temp(self): '--temp_location=badGSpath' ]) validator = PipelineOptionsValidator(options, runner) - errors = options._handle_temp_and_staging_locations(validator) - self.assertEqual(errors, []) + self._check_errors(options, validator, []) self.assertEqual( options.get_all_options()['staging_location'], "gs://beam/stg") self.assertEqual( @@ -764,8 +777,7 @@ def test_validation_bad_stg_bad_temp_with_default(self): '--temp_location=badGSpath' ]) validator = PipelineOptionsValidator(options, runner) - errors = options._handle_temp_and_staging_locations(validator) - self.assertEqual(errors, []) + self._check_errors(options, validator, []) self.assertEqual( options.get_all_options()['staging_location'], "gs://default/bucket") self.assertEqual( @@ -779,16 +791,15 @@ def test_validation_bad_stg_bad_temp_no_default(self): '--temp_location=badGSpath' ]) validator = PipelineOptionsValidator(options, runner) - errors = options._handle_temp_and_staging_locations(validator) - self.assertEqual(len(errors), 2, errors) - self.assertIn( - 'Invalid GCS path (badGSpath), given for the option: temp_location.', - errors, - errors) - self.assertIn( - 'Invalid GCS path (badGSpath), given for the option: staging_location.', - errors, - errors) + self._check_errors( + options, + validator, + [ + 'Invalid GCS path (badGSpath), given for the option: ' \ + 'temp_location.', + 'Invalid GCS path (badGSpath), given for the option: ' \ + 'staging_location.' + ]) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/pipeline.py b/sdks/python/apache_beam/pipeline.py index f5183b49bc4b..6209ca1ddae8 100644 --- a/sdks/python/apache_beam/pipeline.py +++ b/sdks/python/apache_beam/pipeline.py @@ -618,7 +618,13 @@ def __exit__( try: if not exc_type: self.result = self.run() - self.result.wait_until_finish() + if not self._options.view_as(StandardOptions).no_wait_until_finish: + self.result.wait_until_finish() + else: + logging.info( + 'Job execution continues without waiting for completion.' 
+ ' Use "wait_until_finish" in PipelineResult to block' + ' until finished.') finally: self._extra_context.__exit__(exc_type, exc_val, exc_tb) diff --git a/sdks/python/apache_beam/pipeline_test.py b/sdks/python/apache_beam/pipeline_test.py index 7863352cbefa..61aac350280f 100644 --- a/sdks/python/apache_beam/pipeline_test.py +++ b/sdks/python/apache_beam/pipeline_test.py @@ -274,6 +274,16 @@ def test_reuse_custom_transform_instance(self): 'reloading the job state. This is not recommended for ' 'streaming jobs.') + @mock.patch('logging.info') # Mock the logging.info function + def test_no_wait_until_finish(self, mock_info): + with Pipeline(runner='DirectRunner', + options=PipelineOptions(["--no_wait_until_finish"])) as p: + _ = p | beam.Create(['test']) + mock_info.assert_called_once_with( + 'Job execution continues without waiting for completion. ' + 'Use "wait_until_finish" in PipelineResult to block until finished.') + p.result.wait_until_finish() + def test_auto_unique_labels(self): opts = PipelineOptions(["--auto_unique_labels"]) @@ -773,7 +783,7 @@ def test_windowed_value_param(self): | Map(lambda _, wv=DoFn.WindowedValueParam: (wv.value, wv.windows))) assert_that( pcoll, - equal_to([(1, [IntervalWindow(0, 5)]), (7, [IntervalWindow(5, 10)])])) + equal_to([(1, [IntervalWindow(0, 5)]), (7, [IntervalWindow(5, 10)])])) # pylint: disable=too-many-function-args def test_timestamp_param(self): class TestDoFn(DoFn): diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py index 8331d9cf3919..022136aae9a2 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py @@ -938,7 +938,7 @@ def test_experiment_use_multiple_sdk_containers(self): @mock.patch( 'apache_beam.runners.dataflow.internal.apiclient.sys.version_info', - (3, 8)) + (3, 9)) def test_get_python_sdk_name(self): pipeline_options = PipelineOptions([ '--project', @@ -957,7 +957,7 @@ def test_get_python_sdk_name(self): 1, FAKE_PIPELINE_URL) self.assertEqual( - 'Apache Beam Python 3.8 SDK', environment._get_python_sdk_name()) + 'Apache Beam Python 3.9 SDK', environment._get_python_sdk_name()) @mock.patch( 'apache_beam.runners.dataflow.internal.apiclient.sys.version_info', diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index 7b1afdd66b97..044e144c65a0 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -34,6 +34,6 @@ # Unreleased sdks use container image tag specified below. # Update this tag whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. 
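Mirroring the test above, a minimal sketch of the new --no_wait_until_finish flow: exiting the `with` block submits the job but skips the implicit wait, and the caller can still block explicitly through the stored result. With the DirectRunner this finishes quickly anyway; the flag mainly matters for asynchronous runners such as Dataflow.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(
    'DirectRunner', options=PipelineOptions(['--no_wait_until_finish'])) as p:
  _ = p | beam.Create(['test']) | beam.Map(print)

# Execution was not awaited on exit; block explicitly when needed.
p.result.wait_until_finish()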
-BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20240829' +BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20240918' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' diff --git a/sdks/python/apache_beam/runners/interactive/display/pcoll_visualization_test.py b/sdks/python/apache_beam/runners/interactive/display/pcoll_visualization_test.py index d34b966b0efa..7fc76feb7494 100644 --- a/sdks/python/apache_beam/runners/interactive/display/pcoll_visualization_test.py +++ b/sdks/python/apache_beam/runners/interactive/display/pcoll_visualization_test.py @@ -66,7 +66,7 @@ def setUp(self): ie.current_env().track_user_pipelines() recording_manager = RecordingManager(self._p) - recording = recording_manager.record([self._pcoll], 5, 5) + recording = recording_manager.record([self._pcoll], max_n=5, max_duration=5) self._stream = recording.stream(self._pcoll) def test_pcoll_visualization_generate_unique_display_id(self): diff --git a/sdks/python/apache_beam/runners/interactive/interactive_beam.py b/sdks/python/apache_beam/runners/interactive/interactive_beam.py index 5c76f9c228c8..9554abf3a47a 100644 --- a/sdks/python/apache_beam/runners/interactive/interactive_beam.py +++ b/sdks/python/apache_beam/runners/interactive/interactive_beam.py @@ -36,7 +36,9 @@ import logging from datetime import timedelta +from typing import Any from typing import Dict +from typing import Iterable from typing import List from typing import Optional from typing import Union @@ -46,6 +48,7 @@ import apache_beam as beam from apache_beam.dataframe.frame_base import DeferredBase from apache_beam.options.pipeline_options import FlinkRunnerOptions +from apache_beam.pvalue import PCollection from apache_beam.runners.interactive import interactive_environment as ie from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager from apache_beam.runners.interactive.dataproc.types import ClusterIdentifier @@ -681,13 +684,11 @@ def run_pipeline(self): @progress_indicated def show( - *pcolls, - include_window_info=False, - visualize_data=False, - n='inf', - duration='inf'): - # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None # noqa: F821 - + *pcolls: Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], + include_window_info: bool = False, + visualize_data: bool = False, + n: Union[int, str] = 'inf', + duration: Union[int, str] = 'inf'): """Shows given PCollections in an interactive exploratory way if used within a notebook, or prints a heading sampled data if used within an ipython shell. Noop if used in a non-interactive environment. @@ -880,6 +881,8 @@ def collect( n='inf', duration='inf', include_window_info=False, + runner=None, + options=None, force_compute=False, force_tuple=False): """Materializes the elements from a PCollection into a Dataframe. @@ -896,6 +899,9 @@ def collect( a string duration. Default 'inf'. include_window_info: (optional) if True, appends the windowing information to each row. Default False. 
+ runner: (optional) the runner with which to compute the results + options: (optional) any additional pipeline options to use to compute the + results force_compute: (optional) if True, forces recomputation rather than using cached PCollections force_tuple: (optional) if True, return a 1-tuple or results rather than @@ -969,7 +975,12 @@ def as_pcollection(pcoll_or_df): uncomputed = set(pcolls) - set(computed.keys()) if uncomputed: recording = recording_manager.record( - uncomputed, max_n=n, max_duration=duration, force_compute=force_compute) + uncomputed, + max_n=n, + max_duration=duration, + runner=runner, + options=options, + force_compute=force_compute) try: for pcoll in uncomputed: diff --git a/sdks/python/apache_beam/runners/interactive/non_interactive_runner_test.py b/sdks/python/apache_beam/runners/interactive/non_interactive_runner_test.py index 47adf7b36b33..f7fd052fecc4 100644 --- a/sdks/python/apache_beam/runners/interactive/non_interactive_runner_test.py +++ b/sdks/python/apache_beam/runners/interactive/non_interactive_runner_test.py @@ -257,6 +257,36 @@ def test_dataframes_same_cell_twice(self): df_expected['cube'], ib.collect(df['cube'], n=10).reset_index(drop=True)) + @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]") + def test_new_runner_and_options(self): + class MyRunner(beam.runners.PipelineRunner): + run_count = 0 + + @classmethod + def run_pipeline(cls, pipeline, options): + assert options._all_options['my_option'] == 123 + cls.run_count += 1 + return direct_runner.DirectRunner().run_pipeline(pipeline, options) + + clear_side_effect() + p = beam.Pipeline(direct_runner.DirectRunner()) + + # Initial collection runs the pipeline. + pcoll1 = p | beam.Create(['a', 'b', 'c']) | beam.Map(cause_side_effect) + collected1 = ib.collect(pcoll1) + self.assertEqual(set(collected1[0]), set(['a', 'b', 'c'])) + self.assertEqual(count_side_effects('a'), 1) + + # Using the PCollection uses the cache with a different runner and options. + pcoll2 = pcoll1 | beam.Map(str.upper) + collected2 = ib.collect( + pcoll2, + runner=MyRunner(), + options=beam.options.pipeline_options.PipelineOptions(my_option=123)) + self.assertEqual(set(collected2[0]), set(['A', 'B', 'C'])) + self.assertEqual(count_side_effects('a'), 1) + self.assertEqual(MyRunner.run_count, 1) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/runners/interactive/pipeline_fragment.py b/sdks/python/apache_beam/runners/interactive/pipeline_fragment.py index 5b385d3f8a0f..20dee2b71163 100644 --- a/sdks/python/apache_beam/runners/interactive/pipeline_fragment.py +++ b/sdks/python/apache_beam/runners/interactive/pipeline_fragment.py @@ -34,7 +34,7 @@ class PipelineFragment(object): A pipeline fragment is built from the original pipeline definition to include only PTransforms that are necessary to produce the given PCollections. """ - def __init__(self, pcolls, options=None): + def __init__(self, pcolls, options=None, runner=None): """Constructor of PipelineFragment. Args: @@ -42,6 +42,8 @@ def __init__(self, pcolls, options=None): fragment for. options: (PipelineOptions) the pipeline options for the implicit pipeline run. + runner: (Runner) the pipeline runner for the implicit + pipeline run. 
""" assert len(pcolls) > 0, ( 'Need at least 1 PCollection as the target data to build a pipeline ' @@ -61,6 +63,7 @@ def __init__(self, pcolls, options=None): 'given and cannot be used to build a pipeline fragment that produces ' 'the given PCollections.'.format(pcoll)) self._options = options + self._runner = runner # A copied pipeline instance for modification without changing the user # pipeline instance held by the end user. This instance can be processed @@ -98,7 +101,7 @@ def deduce_fragment(self): """Deduce the pipeline fragment as an apache_beam.Pipeline instance.""" fragment = beam.pipeline.Pipeline.from_runner_api( self._runner_pipeline.to_runner_api(), - self._runner_pipeline.runner, + self._runner or self._runner_pipeline.runner, self._options) ie.current_env().add_derived_pipeline(self._runner_pipeline, fragment) return fragment @@ -129,9 +132,8 @@ def run(self, display_pipeline_graph=False, use_cache=True, blocking=False): 'the Beam pipeline to use this function ' 'on unbouded PCollections.') result = beam.pipeline.Pipeline.from_runner_api( - pipeline_instrument_proto, - self._runner_pipeline.runner, - self._runner_pipeline._options).run() + pipeline_instrument_proto, fragment.runner, + fragment._options).run() result.wait_until_finish() ie.current_env().mark_pcollection_computed( pipeline_instrument.cached_pcolls) diff --git a/sdks/python/apache_beam/runners/interactive/recording_manager.py b/sdks/python/apache_beam/runners/interactive/recording_manager.py index cb28a61a95f1..6811d3e0d345 100644 --- a/sdks/python/apache_beam/runners/interactive/recording_manager.py +++ b/sdks/python/apache_beam/runners/interactive/recording_manager.py @@ -28,7 +28,9 @@ import apache_beam as beam from apache_beam.dataframe.frame_base import DeferredBase +from apache_beam.options import pipeline_options from apache_beam.portability.api import beam_runner_api_pb2 +from apache_beam.runners import runner from apache_beam.runners.interactive import background_caching_job as bcj from apache_beam.runners.interactive import interactive_environment as ie from apache_beam.runners.interactive import interactive_runner as ir @@ -384,8 +386,11 @@ def record_pipeline(self) -> bool: def record( self, pcolls: List[beam.pvalue.PCollection], + *, max_n: int, max_duration: Union[int, str], + runner: runner.PipelineRunner = None, + options: pipeline_options.PipelineOptions = None, force_compute: bool = False) -> Recording: # noqa: F821 @@ -427,12 +432,20 @@ def record( # incomplete. 
self._clear() + merged_options = pipeline_options.PipelineOptions( + **{ + **self.user_pipeline.options.get_all_options( + drop_default=True, retain_unknown_options=True), + **options.get_all_options( + drop_default=True, retain_unknown_options=True) + }) if options else self.user_pipeline.options + cache_path = ie.current_env().options.cache_root is_remote_run = cache_path and ie.current_env( ).options.cache_root.startswith('gs://') pf.PipelineFragment( - list(uncomputed_pcolls), - self.user_pipeline.options).run(blocking=is_remote_run) + list(uncomputed_pcolls), merged_options, + runner=runner).run(blocking=is_remote_run) result = ie.current_env().pipeline_result(self.user_pipeline) else: result = None diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py index eb40ced2da08..bcfa965c0469 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py @@ -939,7 +939,7 @@ def close_all(self): except Exception: _LOGGER.error( "Error closing worker_handler %s" % worker_handler, exc_info=True) - self._cached_handlers = {} # type: ignore[assignment] + self._cached_handlers = collections.defaultdict(list) self._workers_by_id = {} if self._grpc_server is not None: self._grpc_server.close() diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers_test.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers_test.py new file mode 100644 index 000000000000..832e7ecee801 --- /dev/null +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers_test.py @@ -0,0 +1,52 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
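The interactive collect() and record() changes above can be exercised roughly as follows (a sketch; the runner and the extra option shown are only examples). Options passed in are merged over the user pipeline's options, and the deduced fragment runs on the supplied runner instead of the pipeline's own.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import interactive_beam as ib

p = beam.Pipeline(direct_runner.DirectRunner())
pcoll = p | beam.Create(['a', 'b', 'c']) | beam.Map(str.upper)

# Materialize with an explicit runner and additional options rather than
# the pipeline's own configuration; returns a pandas DataFrame.
df = ib.collect(
    pcoll,
    runner=direct_runner.DirectRunner(),
    options=PipelineOptions(direct_num_workers=1))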
+#
+# pytype: skip-file
+import logging
+import unittest
+
+from apache_beam.portability.api import beam_provision_api_pb2
+from apache_beam.runners.portability.fn_api_runner.fn_runner import ExtendedProvisionInfo
+from apache_beam.runners.portability.fn_api_runner.worker_handlers import WorkerHandlerManager
+from apache_beam.transforms import environments
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class WorkerHandlerManagerTest(unittest.TestCase):
+  def test_close_all(self):
+    inprocess_env = environments.EmbeddedPythonEnvironment(
+        capabilities=environments.python_sdk_capabilities(),
+        artifacts=(),
+        resource_hints={}).to_runner_api(None)
+    envs = {
+        'inprocess': inprocess_env,
+    }
+    prov_info = ExtendedProvisionInfo(
+        beam_provision_api_pb2.ProvisionInfo(
+            retrieval_token='unused-retrieval-token'))
+
+    manager = WorkerHandlerManager(envs, job_provision_info=prov_info)
+    first_workers = manager.get_worker_handlers('inprocess', 1)
+    manager.close_all()
+    second_workers = manager.get_worker_handlers('inprocess', 1)
+    assert len(first_workers) == len(second_workers) == 1
+    assert first_workers != second_workers
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  unittest.main()
diff --git a/sdks/python/apache_beam/testing/util.py b/sdks/python/apache_beam/testing/util.py
index cffafa6c0740..f7fabde43d4c 100644
--- a/sdks/python/apache_beam/testing/util.py
+++ b/sdks/python/apache_beam/testing/util.py
@@ -261,6 +261,23 @@ def assert_that(
  """
  assert isinstance(actual, pvalue.PCollection), (
      '%s is not a supported type for Beam assert' % type(actual))
+  pipeline = actual.pipeline
+  if getattr(actual.pipeline, 'result', None):
+    # The pipeline was already run. The user most likely called assert_that
+    # after the pipeline context.
+    raise RuntimeError(
+        'assert_that must be used within a beam.Pipeline context')
+
+  # Usually, the uniqueness of the label is left to the pipeline
+  # writer to guarantee. Since we're in a testing context, we'll
+  # just automatically append a number to the label if it's
+  # already in use, as tests don't typically have to worry about
+  # long-term update compatibility stability of stage names.
+ if label in pipeline.applied_labels: + label_idx = 2 + while f"{label}_{label_idx}" in pipeline.applied_labels: + label_idx += 1 + label = f"{label}_{label_idx}" if isinstance(matcher, _EqualToPerWindowMatcher): reify_windows = True diff --git a/sdks/python/apache_beam/testing/util_test.py b/sdks/python/apache_beam/testing/util_test.py index 98c1349ef36c..ba3c743c03f3 100644 --- a/sdks/python/apache_beam/testing/util_test.py +++ b/sdks/python/apache_beam/testing/util_test.py @@ -183,6 +183,19 @@ def test_equal_to_per_window_fail_unmatched_window(self): equal_to_per_window(expected), reify_windows=True) + def test_runtimeerror_outside_of_context(self): + with beam.Pipeline() as p: + outputs = (p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x + 1)) + with self.assertRaises(RuntimeError): + assert_that(outputs, equal_to([2, 3, 4])) + + def test_multiple_assert_that_labels(self): + with beam.Pipeline() as p: + outputs = (p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x + 1)) + assert_that(outputs, equal_to([2, 3, 4])) + assert_that(outputs, equal_to([2, 3, 4])) + assert_that(outputs, equal_to([2, 3, 4])) + def test_equal_to_per_window_fail_unmatched_element(self): with self.assertRaises(BeamAssertException): start = int(MIN_TIMESTAMP.micros // 1e6) - 5 diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py index d7415e8d8135..a3762adac0cb 100644 --- a/sdks/python/apache_beam/transforms/core.py +++ b/sdks/python/apache_beam/transforms/core.py @@ -1462,8 +1462,14 @@ def partition_for(self, element, num_partitions, *args, **kwargs): def _get_function_body_without_inners(func): source_lines = inspect.getsourcelines(func)[0] source_lines = dropwhile(lambda x: x.startswith("@"), source_lines) - def_line = next(source_lines).strip() - if def_line.startswith("def ") and def_line.endswith(":"): + first_def_line = next(source_lines).strip() + if first_def_line.startswith("def "): + last_def_line_without_comment = first_def_line.split("#")[0] \ + .split("\"\"\"")[0] + while not last_def_line_without_comment.strip().endswith(":"): + last_def_line_without_comment = next(source_lines).split("#")[0] \ + .split("\"\"\"")[0] + first_line = next(source_lines) indentation = len(first_line) - len(first_line.lstrip()) final_lines = [first_line[indentation:]] @@ -1487,7 +1493,7 @@ def _get_function_body_without_inners(func): return "".join(final_lines) else: - return def_line.rsplit(":")[-1].strip() + return first_def_line.rsplit(":")[-1].strip() def _check_fn_use_yield_and_return(fn): @@ -1497,15 +1503,26 @@ def _check_fn_use_yield_and_return(fn): source_code = _get_function_body_without_inners(fn) has_yield = False has_return = False + return_none_warning = ( + "No iterator is returned by the process method in %s.", + fn.__self__.__class__) for line in source_code.split("\n"): - if line.lstrip().startswith("yield ") or line.lstrip().startswith( + lstripped_line = line.lstrip() + if lstripped_line.startswith("yield ") or lstripped_line.startswith( "yield("): has_yield = True - if line.lstrip().startswith("return ") or line.lstrip().startswith( + if lstripped_line.startswith("return ") or lstripped_line.startswith( "return("): has_return = True + if lstripped_line.startswith( + "return None") or lstripped_line.rstrip() == "return": + _LOGGER.warning(return_none_warning) if has_yield and has_return: return True + + if not has_yield and not has_return: + _LOGGER.warning(return_none_warning) + return False except Exception as e: _LOGGER.debug(str(e)) diff --git 
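The new assert_that guards above amount to the following, sketched with a trivial pipeline:

import apache_beam as beam
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

with beam.Pipeline() as p:
  out = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x + 1)
  # Repeated assertions no longer collide on the default label; a numeric
  # suffix (e.g. 'assert_that_2') is appended automatically.
  assert_that(out, equal_to([2, 3, 4]))
  assert_that(out, equal_to([2, 3, 4]))

# Calling assert_that after the pipeline context has exited (and run) now
# raises RuntimeError instead of silently doing nothing:
# assert_that(out, equal_to([2, 3, 4]))  # RuntimeError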
a/sdks/python/apache_beam/transforms/core_test.py b/sdks/python/apache_beam/transforms/core_test.py index b492ab0938cc..54afb365d2d8 100644 --- a/sdks/python/apache_beam/transforms/core_test.py +++ b/sdks/python/apache_beam/transforms/core_test.py @@ -30,6 +30,8 @@ from apache_beam.testing.util import equal_to from apache_beam.transforms.window import FixedWindows +RETURN_NONE_PARTIAL_WARNING = "No iterator is returned" + class TestDoFn1(beam.DoFn): def process(self, element): @@ -96,6 +98,24 @@ def process(self, element): yield element +class TestDoFn10(beam.DoFn): + """test process returning None explicitly""" + def process(self, element): + return None + + +class TestDoFn11(beam.DoFn): + """test process returning None (no return and no yield)""" + def process(self, element): + pass + + +class TestDoFn12(beam.DoFn): + """test process returning None (return statement without a value)""" + def process(self, element): + return + + class CreateTest(unittest.TestCase): @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): @@ -119,6 +139,24 @@ def test_dofn_with_yield_and_return(self): beam.ParDo(TestDoFn3()) assert warning_text in self._caplog.text + def test_dofn_with_explicit_return_none(self): + with self._caplog.at_level(logging.WARNING): + beam.ParDo(TestDoFn10()) + assert RETURN_NONE_PARTIAL_WARNING in self._caplog.text + assert str(TestDoFn10) in self._caplog.text + + def test_dofn_with_implicit_return_none_missing_return_and_yield(self): + with self._caplog.at_level(logging.WARNING): + beam.ParDo(TestDoFn11()) + assert RETURN_NONE_PARTIAL_WARNING in self._caplog.text + assert str(TestDoFn11) in self._caplog.text + + def test_dofn_with_implicit_return_none_return_without_value(self): + with self._caplog.at_level(logging.WARNING): + beam.ParDo(TestDoFn12()) + assert RETURN_NONE_PARTIAL_WARNING in self._caplog.text + assert str(TestDoFn12) in self._caplog.text + class PartitionTest(unittest.TestCase): def test_partition_boundedness(self): diff --git a/sdks/python/apache_beam/transforms/external_transform_provider_it_test.py b/sdks/python/apache_beam/transforms/external_transform_provider_it_test.py index df30611598fe..6b26206908fb 100644 --- a/sdks/python/apache_beam/transforms/external_transform_provider_it_test.py +++ b/sdks/python/apache_beam/transforms/external_transform_provider_it_test.py @@ -193,7 +193,7 @@ def test_pretty_types(self): expected_type_names = [('List[str]', True), ('numpy.int16', False), ('str', False), ('Dict[str, numpy.float64]', False), ('Dict[str, List[numpy.int64]]', True), - ('Dict[int, Union[str, NoneType]]', False)] + ('Dict[int, Optional[str]]', False)] for i in range(len(types)): self.assertEqual( diff --git a/sdks/python/apache_beam/transforms/periodicsequence.py b/sdks/python/apache_beam/transforms/periodicsequence.py index b2d7b375571b..613661b22957 100644 --- a/sdks/python/apache_beam/transforms/periodicsequence.py +++ b/sdks/python/apache_beam/transforms/periodicsequence.py @@ -48,14 +48,31 @@ def initial_restriction(self, element): def create_tracker(self, restriction): return OffsetRestrictionTracker(restriction) - def restriction_size(self, unused_element, restriction): - return restriction.size() + def restriction_size(self, element, restriction): + return _sequence_backlog_bytes(element, time.time(), restriction) # On drain, immediately stop emitting new elements def truncate(self, unused_element, unused_restriction): return None +def _sequence_backlog_bytes(element, now, offset_range): + ''' + Calculates size of the output that the 
sequence should have emitted up to now. + ''' + start, _, interval = element + if isinstance(start, Timestamp): + start = start.micros / 1000000 + assert interval > 0 + + now_index = math.floor((now - start) / interval) + if now_index < offset_range.start: + return 0 + # We attempt to be precise as some runners scale based upon bytes and + # output byte throughput. + return 8 * (min(offset_range.stop, now_index) - offset_range.start) + + class ImpulseSeqGenDoFn(beam.DoFn): ''' ImpulseSeqGenDoFn fn receives tuple elements with three parts: diff --git a/sdks/python/apache_beam/transforms/periodicsequence_test.py b/sdks/python/apache_beam/transforms/periodicsequence_test.py index 932779555341..221520c94622 100644 --- a/sdks/python/apache_beam/transforms/periodicsequence_test.py +++ b/sdks/python/apache_beam/transforms/periodicsequence_test.py @@ -24,11 +24,13 @@ import unittest import apache_beam as beam +from apache_beam.io.restriction_trackers import OffsetRange from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to from apache_beam.transforms.periodicsequence import PeriodicImpulse from apache_beam.transforms.periodicsequence import PeriodicSequence +from apache_beam.transforms.periodicsequence import _sequence_backlog_bytes # Disable frequent lint warning due to pipe operator for chaining transforms. # pylint: disable=expression-not-assigned @@ -112,6 +114,24 @@ def test_periodicsequence_outputs_valid_sequence_in_past(self): self.assertEqual(result.is_bounded, False) assert_that(result, equal_to(k)) + def test_periodicsequence_output_size(self): + element = [0, 1000000000, 10] + self.assertEqual( + _sequence_backlog_bytes(element, 100, OffsetRange(10, 100000000)), 0) + self.assertEqual( + _sequence_backlog_bytes(element, 100, OffsetRange(9, 100000000)), 8) + self.assertEqual( + _sequence_backlog_bytes(element, 100, OffsetRange(8, 100000000)), 16) + self.assertEqual( + _sequence_backlog_bytes(element, 101, OffsetRange(9, 100000000)), 8) + self.assertEqual( + _sequence_backlog_bytes(element, 10000, OffsetRange(0, 100000000)), + 8 * 10000 / 10) + self.assertEqual( + _sequence_backlog_bytes(element, 10000, OffsetRange(1002, 1003)), 0) + self.assertEqual( + _sequence_backlog_bytes(element, 10100, OffsetRange(1002, 1003)), 8) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/transforms/ptransform_test.py b/sdks/python/apache_beam/transforms/ptransform_test.py index a51d5cd83d26..2fdec14651f1 100644 --- a/sdks/python/apache_beam/transforms/ptransform_test.py +++ b/sdks/python/apache_beam/transforms/ptransform_test.py @@ -1495,17 +1495,17 @@ def test_filter_does_not_type_check_using_type_hints_decorator(self): def more_than_half(a): return a > 0.50 - # Func above was hinted to only take a float, yet an int will be passed. + # Func above was hinted to only take a float, yet a str will be passed. 
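For reference, the backlog estimate added above works out as follows, using values from the test: with start=0 and a 10-second interval, at t=10,000s the first 1,000 offsets are already due, and each is costed at 8 bytes.

from apache_beam.io.restriction_trackers import OffsetRange
from apache_beam.transforms.periodicsequence import _sequence_backlog_bytes

# element = (start timestamp, end timestamp, interval in seconds)
element = [0, 1000000000, 10]

# 8 * (min(range.stop, floor((now - start) / interval)) - range.start)
# = 8 * (min(100000000, 1000) - 0) = 8000
print(_sequence_backlog_bytes(element, 10000, OffsetRange(0, 100000000)))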
with self.assertRaises(typehints.TypeCheckError) as e: ( self.p - | 'Ints' >> beam.Create([1, 2, 3, 4]).with_output_types(int) + | 'Ints' >> beam.Create(['1', '2', '3', '4']).with_output_types(str) | 'Half' >> beam.Filter(more_than_half)) self.assertStartswith( e.exception.args[0], "Type hint violation for 'Half': " - "requires {} but got {} for a".format(float, int)) + "requires {} but got {} for a".format(float, str)) def test_filter_type_checks_using_type_hints_decorator(self): @with_input_types(b=int) diff --git a/sdks/python/apache_beam/transforms/trigger_test.py b/sdks/python/apache_beam/transforms/trigger_test.py index 06e205df61ec..962a06e485df 100644 --- a/sdks/python/apache_beam/transforms/trigger_test.py +++ b/sdks/python/apache_beam/transforms/trigger_test.py @@ -583,7 +583,6 @@ def test_after_processing_time(self): accumulation_mode=AccumulationMode.DISCARDING) | beam.GroupByKey() | beam.Map(lambda x: x[1])) - assert_that(results, equal_to([list(range(total_elements_in_trigger))])) def test_repeatedly_after_processing_time(self): @@ -772,11 +771,11 @@ def test_multiple_accumulating_firings(self): | beam.GroupByKey() | beam.FlatMap(lambda x: x[1])) - # The trigger should fire twice. Once after 5 seconds, and once after 10. - # The firings should accumulate the output. - first_firing = [str(i) for i in elements if i <= 5] - second_firing = [str(i) for i in elements] - assert_that(records, equal_to(first_firing + second_firing)) + # The trigger should fire twice. Once after 5 seconds, and once after 10. + # The firings should accumulate the output. + first_firing = [str(i) for i in elements if i <= 5] + second_firing = [str(i) for i in elements] + assert_that(records, equal_to(first_firing + second_firing)) def test_on_pane_watermark_hold_no_pipeline_stall(self): """A regression test added for diff --git a/sdks/python/apache_beam/transforms/util_test.py b/sdks/python/apache_beam/transforms/util_test.py index 74d9f438a5df..9c70be7900da 100644 --- a/sdks/python/apache_beam/transforms/util_test.py +++ b/sdks/python/apache_beam/transforms/util_test.py @@ -1016,13 +1016,13 @@ def test_constant_k(self): with TestPipeline() as p: pc = p | beam.Create(self.l) with_keys = pc | util.WithKeys('k') - assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)], )) + assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)], )) def test_callable_k(self): with TestPipeline() as p: pc = p | beam.Create(self.l) with_keys = pc | util.WithKeys(lambda x: x * x) - assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)])) + assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)])) @staticmethod def _test_args_kwargs_fn(x, multiply, subtract): @@ -1033,7 +1033,7 @@ def test_args_kwargs_k(self): pc = p | beam.Create(self.l) with_keys = pc | util.WithKeys( WithKeysTest._test_args_kwargs_fn, 2, subtract=1) - assert_that(with_keys, equal_to([(1, 1), (3, 2), (5, 3)])) + assert_that(with_keys, equal_to([(1, 1), (3, 2), (5, 3)])) def test_sideinputs(self): with TestPipeline() as p: @@ -1046,7 +1046,7 @@ def test_sideinputs(self): the_singleton: x + sum(the_list) + the_singleton, si1, the_singleton=si2) - assert_that(with_keys, equal_to([(17, 1), (18, 2), (19, 3)])) + assert_that(with_keys, equal_to([(17, 1), (18, 2), (19, 3)])) class GroupIntoBatchesTest(unittest.TestCase): diff --git a/sdks/python/apache_beam/typehints/typed_pipeline_test.py b/sdks/python/apache_beam/typehints/typed_pipeline_test.py index 9cb3fcdbb91d..72aed46f5e78 100644 --- 
a/sdks/python/apache_beam/typehints/typed_pipeline_test.py +++ b/sdks/python/apache_beam/typehints/typed_pipeline_test.py @@ -422,7 +422,7 @@ def test_typed_ptransform_fn_conflicting_hints(self): # In this case, both MyMap and its contained ParDo have separate type # checks (that disagree with each other). @beam.ptransform_fn - @typehints.with_input_types(int) + @typehints.with_input_types(str) def MyMap(pcoll): def fn(element: float): yield element @@ -430,11 +430,11 @@ def fn(element: float): return pcoll | beam.ParDo(fn) with self.assertRaisesRegex(typehints.TypeCheckError, - r'ParDo.*requires.*float.*got.*int'): - _ = [1, 2, 3] | MyMap() + r'ParDo.*requires.*float.*got.*str'): + _ = ['1', '2', '3'] | MyMap() with self.assertRaisesRegex(typehints.TypeCheckError, - r'MyMap.*expected.*int.*got.*str'): - _ = ['a'] | MyMap() + r'MyMap.*expected.*str.*got.*bytes'): + _ = [b'a'] | MyMap() def test_typed_dofn_string_literals(self): class MyDoFn(beam.DoFn): diff --git a/sdks/python/apache_beam/typehints/typehints.py b/sdks/python/apache_beam/typehints/typehints.py index b368f0abdf3d..912cb78dc095 100644 --- a/sdks/python/apache_beam/typehints/typehints.py +++ b/sdks/python/apache_beam/typehints/typehints.py @@ -1309,6 +1309,12 @@ def is_consistent_with(sub, base): return True if isinstance(sub, AnyTypeConstraint) or isinstance(base, AnyTypeConstraint): return True + # Per PEP484, ints are considered floats and complexes and + # floats are considered complexes. + if sub is int and base in (float, complex): + return True + if sub is float and base is complex: + return True sub = normalize(sub, none_as_type=True) base = normalize(base, none_as_type=True) if isinstance(sub, UnionConstraint): diff --git a/sdks/python/apache_beam/typehints/typehints_test.py b/sdks/python/apache_beam/typehints/typehints_test.py index c395893a23ba..843c1498cac5 100644 --- a/sdks/python/apache_beam/typehints/typehints_test.py +++ b/sdks/python/apache_beam/typehints/typehints_test.py @@ -166,6 +166,14 @@ def test_any_compatibility(self): self.assertCompatible(object, typehints.Any) self.assertCompatible(typehints.Any, object) + def test_int_float_complex_compatibility(self): + self.assertCompatible(float, int) + self.assertCompatible(complex, int) + self.assertCompatible(complex, float) + self.assertNotCompatible(int, float) + self.assertNotCompatible(int, complex) + self.assertNotCompatible(float, complex) + def test_repr(self): self.assertEqual('Any', repr(typehints.Any)) @@ -218,7 +226,7 @@ def test_union_hint_compatibility(self): typehints.Union[int, str], typehints.Union[str, typehints.Union[int, str]]) - self.assertNotCompatible( + self.assertCompatible( typehints.Union[float, bool], typehints.Union[int, bool]) self.assertNotCompatible( typehints.Union[bool, str], typehints.Union[float, bool, int]) diff --git a/sdks/python/apache_beam/yaml/examples/io/spanner_read.yaml b/sdks/python/apache_beam/yaml/examples/io/spanner_read.yaml new file mode 100644 index 000000000000..c86d42c1e0c6 --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/io/spanner_read.yaml @@ -0,0 +1,80 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pipeline: + transforms: + + # Reading data from a Spanner database. The table used here has the following columns: + # shipment_id (String), customer_id (String), shipment_date (String), shipment_cost (Float64), customer_name (String), customer_email (String) + # ReadFromSpanner transform is called using project_id, instance_id, database_id and a query + # A table with a list of columns can also be specified instead of a query + - type: ReadFromSpanner + name: ReadShipments + config: + project_id: 'apache-beam-testing' + instance_id: 'shipment-test' + database_id: 'shipment' + query: 'SELECT * FROM shipments' + + # Filtering the data based on a specific condition + # Here, the condition is used to keep only the rows where the customer_id is 'C1' + - type: Filter + name: FilterShipments + input: ReadShipments + config: + language: python + keep: "customer_id == 'C1'" + + # Mapping the data fields and applying transformations + # A new field 'shipment_cost_category' is added with a custom transformation + # A callable is defined to categorize shipment cost + - type: MapToFields + name: MapFieldsForSpanner + input: FilterShipments + config: + language: python + fields: + shipment_id: shipment_id + customer_id: customer_id + shipment_date: shipment_date + shipment_cost: shipment_cost + customer_name: customer_name + customer_email: customer_email + shipment_cost_category: + callable: | + def categorize_cost(row): + cost = float(row[3]) + if cost < 50: + return 'Low Cost' + elif cost < 200: + return 'Medium Cost' + else: + return 'High Cost' + + # Writing the transformed data to a CSV file + - type: WriteToCsv + name: WriteBig + input: MapFieldsForSpanner + config: + path: shipments.csv + + + # On executing the above pipeline, a new CSV file is created with the following records + +# Expected: +# Row(shipment_id='S1', customer_id='C1', shipment_date='2023-05-01', shipment_cost=150.0, customer_name='Alice', customer_email='alice@example.com', shipment_cost_category='Medium Cost') +# Row(shipment_id='S3', customer_id='C1', shipment_date='2023-05-10', shipment_cost=20.0, customer_name='Alice', customer_email='alice@example.com', shipment_cost_category='Low Cost') diff --git a/sdks/python/apache_beam/yaml/examples/io/spanner_write.yaml b/sdks/python/apache_beam/yaml/examples/io/spanner_write.yaml new file mode 100644 index 000000000000..74ac35de260f --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/io/spanner_write.yaml @@ -0,0 +1,53 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pipeline: + transforms: + + # Step 1: Creating rows to be written to Spanner + # The element names correspond to the column names in the Spanner table + - type: Create + name: CreateRows + config: + elements: + - shipment_id: "S5" + customer_id: "C5" + shipment_date: "2023-05-09" + shipment_cost: 300.0 + customer_name: "Erin" + customer_email: "erin@example.com" + + # Step 2: Writing the created rows to a Spanner database + # We require the project ID, instance ID, database ID and table ID to connect to Spanner + # Error handling can be specified optionally to ensure any failed operations aren't lost + # The failed data is passed on in the pipeline and can be handled + - type: WriteToSpanner + name: WriteSpanner + input: CreateRows + config: + project_id: 'apache-beam-testing' + instance_id: 'shipment-test' + database_id: 'shipment' + table_id: 'shipments' + error_handling: + output: my_error_output + + # Step 3: Writing the failed records to a JSON file + - type: WriteToJson + input: WriteSpanner.my_error_output + config: + path: errors.json diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index c89f35e243f1..ec7a6d123d63 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -78,7 +78,7 @@ googleapis-common-protos==1.65.0 greenlet==3.0.3 grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.66.1 +grpcio==1.65.5 grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index fda2e2146184..39d014a699cd 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -76,7 +76,7 @@ googleapis-common-protos==1.65.0 greenlet==3.0.3 grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.66.1 +grpcio==1.65.5 grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index c027191bc6aa..4c445ba0d2ab 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -75,7 +75,7 @@ googleapis-common-protos==1.65.0 greenlet==3.0.3 grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.66.1 +grpcio==1.65.5 grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index 1796e4e83bfa..32d77b7d53fb 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -79,7 +79,7 @@ googleapis-common-protos==1.65.0 greenlet==3.0.3 grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.66.1 +grpcio==1.65.5 grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 diff --git a/sdks/python/container/py39/base_image_requirements.txt 
b/sdks/python/container/py39/base_image_requirements.txt index aa2b4f3bb388..08fec552c2f3 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -78,7 +78,7 @@ googleapis-common-protos==1.65.0 greenlet==3.0.3 grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.66.1 +grpcio==1.65.5 grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 diff --git a/sdks/python/expansion-service-container/Dockerfile b/sdks/python/expansion-service-container/Dockerfile index d3cd4a4afad3..5a5ef0f410bc 100644 --- a/sdks/python/expansion-service-container/Dockerfile +++ b/sdks/python/expansion-service-container/Dockerfile @@ -18,7 +18,7 @@ # We just need to support one Python version supported by Beam. # Picking the current default Beam Python version which is Python 3.8. -FROM python:3.8-bookworm as expansion-service +FROM python:3.9-bookworm as expansion-service LABEL Author "Apache Beam " ARG TARGETOS ARG TARGETARCH diff --git a/sdks/python/mypy.ini b/sdks/python/mypy.ini index 298f249ffbff..562cb8d56dcc 100644 --- a/sdks/python/mypy.ini +++ b/sdks/python/mypy.ini @@ -16,7 +16,7 @@ # [mypy] -python_version = 3.8 +python_version = 3.9 ignore_missing_imports = true follow_imports = normal warn_no_return = true @@ -28,6 +28,7 @@ files = apache_beam color_output = true # uncomment this to see how close we are to being complete # check_untyped_defs = true +disable_error_code = var-annotated [mypy-apache_beam.coders.proto2_coder_test_messages_pb2] ignore_errors = true diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 7bcff2bacfd2..721cb4c1a8dd 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -278,6 +278,11 @@ def get_portability_package_data(): 'This version of Apache Beam has not been sufficiently tested on ' 'Python %s.%s. You may encounter bugs or missing features.' % (sys.version_info.major, sys.version_info.minor)) +elif sys.version_info.major == 3 and sys.version_info.minor == 8: + warnings.warn('Python 3.8 reaches EOL in October 2024 and support will ' + 'be removed from Apache Beam in version 2.61.0. 
See ' + 'https://github.com/apache/beam/issues/31192 for more ' + 'information.') if __name__ == '__main__': # In order to find the tree of proto packages, the directory @@ -353,7 +358,8 @@ def get_portability_package_data(): 'cloudpickle~=2.2.1', 'fastavro>=0.23.6,<2', 'fasteners>=0.3,<1.0', - 'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1', # pylint: disable=line-too-long + # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc + 'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0', # pylint: disable=line-too-long 'hdfs>=2.1.0,<3.0.0', 'httplib2>=0.8,<0.23.0', 'jsonschema>=4.0.0,<5.0.0', @@ -399,6 +405,7 @@ def get_portability_package_data(): # https://github.com/sphinx-doc/sphinx/issues/9727 'docutils==0.17.1', 'pandas<2.2.0', + 'openai' ], 'test': [ 'docstring-parser>=0.15,<1.0', @@ -441,7 +448,7 @@ def get_portability_package_data(): 'google-cloud-bigquery-storage>=2.6.3,<3', 'google-cloud-core>=2.0.0,<3', 'google-cloud-bigtable>=2.19.0,<3', - 'google-cloud-spanner>=3.0.0,<3.48', + 'google-cloud-spanner>=3.0.0,<4', # GCP Packages required by ML functionality 'google-cloud-dlp>=3.0.0,<4', 'google-cloud-language>=2.0,<3', @@ -498,7 +505,10 @@ def get_portability_package_data(): 'tf2onnx', 'torch', 'transformers', - 'xgboost<2.0', # https://github.com/apache/beam/issues/31252 + # Comment out xgboost as it is breaking presubmit python ml + # tests due to tag check introduced since pip 24.2 + # https://github.com/apache/beam/issues/31285 + # 'xgboost<2.0', # https://github.com/apache/beam/issues/31252 ], 'aws': ['boto3>=1.9,<2'], 'azure': [ diff --git a/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile b/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile index 6d99a6393fa9..c1dc4deb6e69 100644 --- a/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile +++ b/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BUILD_IMAGE=nvcr.io/nvidia/tensorrt:22.05-py3 +ARG BUILD_IMAGE=nvcr.io/nvidia/tensorrt:23.05-py3 FROM ${BUILD_IMAGE} @@ -22,7 +22,7 @@ ENV PATH="/usr/src/tensorrt/bin:${PATH}" WORKDIR /workspace -COPY --from=apache/beam_python3.8_sdk:latest /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.10_sdk:latest /opt/apache/beam /opt/apache/beam RUN pip install --upgrade pip \ && pip install torch>=1.7.1 \ @@ -32,4 +32,4 @@ RUN pip install --upgrade pip \ && pip install cuda-python ENTRYPOINT [ "/opt/apache/beam/boot" ] -RUN apt-get update && apt-get install -y python3.8-venv +RUN apt-get update && apt-get install -y python3.10-venv diff --git a/sdks/python/test-suites/dataflow/common.gradle b/sdks/python/test-suites/dataflow/common.gradle index e5d301ecbe14..6bca904c1a64 100644 --- a/sdks/python/test-suites/dataflow/common.gradle +++ b/sdks/python/test-suites/dataflow/common.gradle @@ -424,6 +424,39 @@ def tensorRTTests = tasks.create("tensorRTtests") { } } +def vllmTests = tasks.create("vllmTests") { + dependsOn 'installGcpTest' + dependsOn ':sdks:python:sdist' + doLast { + def testOpts = basicPytestOpts + def argMap = [ + "runner": "DataflowRunner", + "machine_type":"n1-standard-4", + // TODO(https://github.com/apache/beam/issues/22651): Build docker image for VLLM tests during Run time. 
+ // This would also enable to use wheel "--sdk_location" as other tasks, and eliminate distTarBall dependency + // declaration for this project. + // Right now, this is built from https://github.com/apache/beam/blob/master/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile + "sdk_container_image": "us.gcr.io/apache-beam-testing/python-postcommit-it/vllm:latest", + "sdk_location": files(configurations.distTarBall.files).singleFile, + "project": "apache-beam-testing", + "region": "us-central1", + "model": "facebook/opt-125m", + "output": "gs://apache-beam-ml/outputs/vllm_predictions.txt", + "disk_size_gb": 75 + ] + def cmdArgs = mapToArgString(argMap) + // Exec one version with and one version without the chat option + exec { + executable 'sh' + args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'" + } + exec { + executable 'sh' + args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat true --chat_template 'gs://apache-beam-ml/additional_files/sample_chat_template.jinja' --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'" + } + } +} + // Vertex AI RunInference IT tests task vertexAIInferenceTest { dependsOn 'initializeForDataflowJob' @@ -521,6 +554,12 @@ project.tasks.register("inferencePostCommitIT") { ] } +project.tasks.register("inferencePostCommitITPy312") { + dependsOn = [ + 'vllmTests', + ] +} + // Create cross-language tasks for running tests against Java expansion service(s) def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' diff --git a/sdks/python/test-suites/gradle.properties b/sdks/python/test-suites/gradle.properties index 3d16ee63f12a..f8c04e0f5609 100644 --- a/sdks/python/test-suites/gradle.properties +++ b/sdks/python/test-suites/gradle.properties @@ -23,13 +23,13 @@ # dataflow test-suites # (TODO): https://github.com/apache/beam/issues/21971 # Add python 3.10 to dataflow test-suites -dataflow_precommit_it_task_py_versions=3.8,3.12 -dataflow_mongodbio_it_task_py_versions=3.8 -dataflow_chicago_taxi_example_task_py_versions=3.8 +dataflow_precommit_it_task_py_versions=3.9,3.12 +dataflow_mongodbio_it_task_py_versions=3.9 +dataflow_chicago_taxi_example_task_py_versions=3.9 # TODO: Enable following tests after making sure we have enough capacity. -dataflow_validates_runner_batch_tests=3.8,3.12 -dataflow_validates_runner_streaming_tests=3.8,3.12 +dataflow_validates_runner_batch_tests=3.9,3.12 +dataflow_validates_runner_streaming_tests=3.9,3.12 dataflow_examples_postcommit_py_versions=3.12 # TFX_BSL is not yet supported on Python 3.10. 
dataflow_cloudml_benchmark_tests_py_versions=3.9 @@ -38,14 +38,14 @@ direct_mongodbio_it_task_py_versions=3.12 # flink runner test-suites flink_validates_runner_precommit_py_versions=3.12 -flink_validates_runner_postcommit_py_versions=3.8,3.12 -flink_examples_postcommit_py_versions=3.8,3.12 +flink_validates_runner_postcommit_py_versions=3.9,3.12 +flink_examples_postcommit_py_versions=3.9,3.12 # samza runner test-suites -samza_validates_runner_postcommit_py_versions=3.8,3.12 +samza_validates_runner_postcommit_py_versions=3.9,3.12 # spark runner test-suites -spark_examples_postcommit_py_versions=3.8,3.12 +spark_examples_postcommit_py_versions=3.9,3.12 # cross language postcommit python test suites -cross_language_validates_py_versions=3.8,3.12 +cross_language_validates_py_versions=3.9,3.12 diff --git a/sdks/python/test-suites/tox/py39/build.gradle b/sdks/python/test-suites/tox/py39/build.gradle index 5bb73b60a5d2..ea02e9d5b1e8 100644 --- a/sdks/python/test-suites/tox/py39/build.gradle +++ b/sdks/python/test-suites/tox/py39/build.gradle @@ -26,4 +26,199 @@ applyPythonNature() // Required to setup a Python 3 virtualenv and task names. pythonVersion = '3.9' +def posargs = project.findProperty("posargs") ?: "" + apply from: "../common.gradle" + +toxTask "testPy39CloudCoverage", "py39-cloudcoverage", "${posargs}" +test.dependsOn "testPy39CloudCoverage" +project.tasks.register("preCommitPyCoverage") { + dependsOn = ["testPy39CloudCoverage"] +} + +// Dep Postcommit runs test suites that evaluate compatibility of particular +// dependencies. It is exercised on a single Python version. +// +// Should still leave at least one version in PreCommit unless the marked tests +// are also exercised by existing PreCommit +// e.g. pyarrow and pandas also run on PreCommit Dataframe and Coverage +project.tasks.register("postCommitPyDep") {} + +// Create a test task for supported major versions of pyarrow +// We should have a test for the lowest supported version and +// For versions that we would like to prioritize for testing, +// for example versions released in a timeframe of last 1-2 years. 
+ +toxTask "testPy39pyarrow-3", "py39-pyarrow-3", "${posargs}" +test.dependsOn "testPy39pyarrow-3" +postCommitPyDep.dependsOn "testPy39pyarrow-3" + +toxTask "testPy39pyarrow-9", "py39-pyarrow-9", "${posargs}" +test.dependsOn "testPy39pyarrow-9" +postCommitPyDep.dependsOn "testPy39pyarrow-9" + +toxTask "testPy39pyarrow-10", "py39-pyarrow-10", "${posargs}" +test.dependsOn "testPy39pyarrow-10" +postCommitPyDep.dependsOn "testPy39pyarrow-10" + +toxTask "testPy39pyarrow-11", "py39-pyarrow-11", "${posargs}" +test.dependsOn "testPy39pyarrow-11" +postCommitPyDep.dependsOn "testPy39pyarrow-11" + +toxTask "testPy39pyarrow-12", "py39-pyarrow-12", "${posargs}" +test.dependsOn "testPy39pyarrow-12" +postCommitPyDep.dependsOn "testPy39pyarrow-12" + +toxTask "testPy39pyarrow-13", "py39-pyarrow-13", "${posargs}" +test.dependsOn "testPy39pyarrow-13" +postCommitPyDep.dependsOn "testPy39pyarrow-13" + +toxTask "testPy39pyarrow-14", "py39-pyarrow-14", "${posargs}" +test.dependsOn "testPy39pyarrow-14" +postCommitPyDep.dependsOn "testPy39pyarrow-14" + +toxTask "testPy39pyarrow-15", "py39-pyarrow-15", "${posargs}" +test.dependsOn "testPy39pyarrow-15" +postCommitPyDep.dependsOn "testPy39pyarrow-15" + +toxTask "testPy39pyarrow-16", "py39-pyarrow-16", "${posargs}" +test.dependsOn "testPy39pyarrow-16" +postCommitPyDep.dependsOn "testPy39pyarrow-16" + +// Create a test task for each supported minor version of pandas +toxTask "testPy39pandas-14", "py39-pandas-14", "${posargs}" +test.dependsOn "testPy39pandas-14" +postCommitPyDep.dependsOn "testPy39pandas-14" + +toxTask "testPy39pandas-15", "py39-pandas-15", "${posargs}" +test.dependsOn "testPy39pandas-15" +postCommitPyDep.dependsOn "testPy39pandas-15" + +toxTask "testPy39pandas-20", "py39-pandas-20", "${posargs}" +test.dependsOn "testPy39pandas-20" +postCommitPyDep.dependsOn "testPy39pandas-20" + +// TODO(https://github.com/apache/beam/issues/31192): Add below suites +// after dependency compat tests suite switches to Python 3.9 or we add +// Python 2.2 support. 
+ +// toxTask "testPy39pandas-21", "py39-pandas-21", "${posargs}" +// test.dependsOn "testPy39pandas-21" +// postCommitPyDep.dependsOn "testPy39pandas-21" + +// toxTask "testPy39pandas-22", "py39-pandas-22", "${posargs}" +// test.dependsOn "testPy39pandas-22" +// postCommitPyDep.dependsOn "testPy39pandas-22" + +// TODO(https://github.com/apache/beam/issues/30908): Revise what are we testing + +// Create a test task for each minor version of pytorch +toxTask "testPy39pytorch-19", "py39-pytorch-19", "${posargs}" +test.dependsOn "testPy39pytorch-19" +postCommitPyDep.dependsOn "testPy39pytorch-19" + +toxTask "testPy39pytorch-110", "py39-pytorch-110", "${posargs}" +test.dependsOn "testPy39pytorch-110" +postCommitPyDep.dependsOn "testPy39pytorch-110" + +toxTask "testPy39pytorch-111", "py39-pytorch-111", "${posargs}" +test.dependsOn "testPy39pytorch-111" +postCommitPyDep.dependsOn "testPy39pytorch-111" + +toxTask "testPy39pytorch-112", "py39-pytorch-112", "${posargs}" +test.dependsOn "testPy39pytorch-112" +postCommitPyDep.dependsOn "testPy39pytorch-112" + +toxTask "testPy39pytorch-113", "py39-pytorch-113", "${posargs}" +test.dependsOn "testPy39pytorch-113" +postCommitPyDep.dependsOn "testPy39pytorch-113" + +// run on precommit +toxTask "testPy39pytorch-200", "py39-pytorch-200", "${posargs}" +test.dependsOn "testPy39pytorch-200" +postCommitPyDep.dependsOn "testPy39pytorch-200" + +toxTask "testPy39tft-113", "py39-tft-113", "${posargs}" +test.dependsOn "testPy39tft-113" +postCommitPyDep.dependsOn "testPy39tft-113" + +// TODO(https://github.com/apache/beam/issues/25796) - uncomment onnx tox task once onnx supports protobuf 4.x.x +// Create a test task for each minor version of onnx +// toxTask "testPy39onnx-113", "py39-onnx-113", "${posargs}" +// test.dependsOn "testPy39onnx-113" +// postCommitPyDep.dependsOn "testPy39onnx-113" + +// Create a test task for each minor version of tensorflow +toxTask "testPy39tensorflow-212", "py39-tensorflow-212", "${posargs}" +test.dependsOn "testPy39tensorflow-212" +postCommitPyDep.dependsOn "testPy39tensorflow-212" + +// Create a test task for each minor version of transformers +toxTask "testPy39transformers-428", "py39-transformers-428", "${posargs}" +test.dependsOn "testPy39transformers-428" +postCommitPyDep.dependsOn "testPy39transformers-428" + +toxTask "testPy39transformers-429", "py39-transformers-429", "${posargs}" +test.dependsOn "testPy39transformers-429" +postCommitPyDep.dependsOn "testPy39transformers-429" + +toxTask "testPy39transformers-430", "py39-transformers-430", "${posargs}" +test.dependsOn "testPy39transformers-430" +postCommitPyDep.dependsOn "testPy39transformers-430" + +toxTask "testPy39embeddingsMLTransform", "py39-embeddings", "${posargs}" +test.dependsOn "testPy39embeddingsMLTransform" +postCommitPyDep.dependsOn "testPy39embeddingsMLTransform" + +// Part of MLTransform embeddings test suite but requires tensorflow hub, which we need to test on +// mutliple versions so keeping this suite separate. 
+toxTask "testPy39TensorflowHubEmbeddings-014", "py39-TFHubEmbeddings-014", "${posargs}" +test.dependsOn "testPy39TensorflowHubEmbeddings-014" +postCommitPyDep.dependsOn "testPy39TensorflowHubEmbeddings-014" + +toxTask "testPy39TensorflowHubEmbeddings-015", "py39-TFHubEmbeddings-015", "${posargs}" +test.dependsOn "testPy39TensorflowHubEmbeddings-015" +postCommitPyDep.dependsOn "testPy39TensorflowHubEmbeddings-015" + +toxTask "whitespacelint", "whitespacelint", "${posargs}" + +task archiveFilesToLint(type: Zip) { + archiveFileName = "files-to-whitespacelint.zip" + destinationDirectory = file("$buildDir/dist") + + from ("$rootProject.projectDir") { + include "**/*.md" + include "**/build.gradle" + include '**/build.gradle.kts' + exclude '**/build/**' // intermediate build directory + exclude 'website/www/site/themes/docsy/**' // fork to google/docsy + exclude "**/node_modules/*" + exclude "**/.gogradle/*" + } +} + +task unpackFilesToLint(type: Copy) { + from zipTree("$buildDir/dist/files-to-whitespacelint.zip") + into "$buildDir/files-to-whitespacelint" +} + +whitespacelint.dependsOn archiveFilesToLint, unpackFilesToLint +unpackFilesToLint.dependsOn archiveFilesToLint +archiveFilesToLint.dependsOn cleanPython + +toxTask "jest", "jest", "${posargs}" + +toxTask "eslint", "eslint", "${posargs}" + +task copyTsSource(type: Copy) { + from ("$rootProject.projectDir") { + include "sdks/python/apache_beam/runners/interactive/extensions/**/*" + exclude "sdks/python/apache_beam/runners/interactive/extensions/**/lib/*" + exclude "sdks/python/apache_beam/runners/interactive/extensions/**/node_modules/*" + } + into "$buildDir/ts" +} + +jest.dependsOn copyTsSource +eslint.dependsOn copyTsSource +copyTsSource.dependsOn cleanPython diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index aa0200f75005..d733fd17fb6b 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -17,7 +17,7 @@ [tox] # new environments will be excluded by default unless explicitly added to envlist. 
-envlist = py38,py39,py310,py311,py312,py38-{cloud,cloudcoverage,dask},py39-{cloud},py310-{cloud,dask},py311-{cloud,dask},py312-{cloud,dask},docs,lint,mypy,whitespacelint +envlist = py39,py310,py311,py312,py39-{cloud,cloudcoverage,dask},py310-{cloud,dask},py311-{cloud,dask},py312-{cloud,dask},docs,lint,mypy,whitespacelint toxworkdir = {toxinidir}/target/{env:ENV_NAME:.tox} [pycodestyle] @@ -67,38 +67,53 @@ commands_post = commands = false {envname} is misconfigured -[testenv:py{38,39,310,311,312}] +[testenv:py{39,310,311,312}] +commands_pre = + python --version + pip --version + pip check + bash {toxinidir}/scripts/run_tox_cleanup.sh +commands = + python apache_beam/examples/complete/autocomplete_test.py + bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" + +[testenv:py{39,310,311,312}-macos] +commands_pre = + python --version + pip --version + # pip check + bash {toxinidir}/scripts/run_tox_cleanup.sh commands = python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" -[testenv:py{38,39,310,311,312}-win] +[testenv:py{39,310,311,312}-win] commands = python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" install_command = {envbindir}/python.exe {envbindir}/pip.exe install --retries 10 {opts} {packages} list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze -[testenv:py{38,39,310,311,312}-cloud] +[testenv:py{39,310,311,312}-cloud] ; extras = test,gcp,interactive,dataframe,aws,azure extras = test,gcp,interactive,dataframe,aws,azure commands = python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" -[testenv:py{38,39,310,311,312}-ml] +[testenv:py{39,310,311,312}-ml] # Don't set TMPDIR to avoid "AF_UNIX path too long" errors in certain tests. setenv = extras = test,gcp,dataframe,ml_test commands = bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" -[testenv:py{38,39,310,311,312}-dask] +[testenv:py{39,310,311,312}-dask] extras = test,dask commands = bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" -[testenv:py38-cloudcoverage] +[testenv:py39-cloudcoverage] deps = pytest-cov==3.0.0 # Don't set TMPDIR to avoid "AF_UNIX path too long" errors in certain tests. @@ -271,7 +286,7 @@ commands = bash {toxinidir}/scripts/pytest_validates_runner.sh {envname} {toxinidir}/apache_beam/runners/portability/spark_runner_test.py {posargs} -[testenv:py{38,39}-pyarrow-{3,9,10,11,12,13,14,15,16}] +[testenv:py{39,310}-pyarrow-{3,9,10,11,12,13,14,15,16}] deps = # As a courtesy to users, test against the oldest allowed version of Pyarrow. # We'd have to increase the pyarrow lower bound when Python 3.9 is deprecated. 
@@ -296,7 +311,7 @@ commands = /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_pyarrow {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-pandas-{14,15,20}] +[testenv:py{39,310}-pandas-{14,15,20}] deps = 14: pandas>=1.4.3,<1.5.0 # Exclude 1.5.0 and 1.5.1 because of https://github.com/pandas-dev/pandas/issues/45725 @@ -309,7 +324,7 @@ commands = # Run all DataFrame API unit tests bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/dataframe' -[testenv:py{38,39}-tft-{113,114}] +[testenv:py{39,310}-tft-{113,114}] deps = # Help pip resolve conflict with typing-extensions due to an old version of tensorflow https://github.com/apache/beam/issues/30852 113: pydantic<2.0 @@ -317,7 +332,7 @@ deps = commands = bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py' -[testenv:py{38,39}-pytorch-{19,110,111,112,113}] +[testenv:py{39,310}-pytorch-{19,110,111,112,113}] deps = 19: torch>=1.9.0,<1.10.0 110: torch>=1.10.0,<1.11.0 @@ -334,7 +349,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_pytorch {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-pytorch-200] +[testenv:py{39,310}-pytorch-200] deps = 200: torch>=2.0.0,<2.1.0 @@ -350,7 +365,7 @@ commands = /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_pytorch {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' # TODO(https://github.com/apache/beam/issues/25796) - uncomment onnx tox task in tox/py38/build.gradle once onnx supports protobuf 4.x.x -[testenv:py{38,39}-onnx-113] +[testenv:py{39,310}-onnx-113] # TODO(https://github.com/apache/beam/issues/25443) # apparently tox has problem when substitution key has single value. Change back to -onnx-{113,...} # when multiple onnx versions are tested. @@ -369,7 +384,7 @@ commands = # Run all ONNX unit tests pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_onnx {posargs} -[testenv:py{38,39}-tensorflow-212] +[testenv:py39-tensorflow-212] deps = 212: tensorflow>=2.12rc1,<2.13 # Help pip resolve conflict with typing-extensions for old version of TF https://github.com/apache/beam/issues/30852 @@ -382,7 +397,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_tf {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-xgboost-{160,170}] +[testenv:py39-xgboost-{160,170}] deps = 160: xgboost>=1.6.0,<1.7.0 @@ -398,7 +413,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_xgboost {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-transformers-{428,429,430}] +[testenv:py{39,310}-transformers-{428,429,430}] deps = 428: transformers>=4.28.0,<4.29.0 429: transformers>=4.29.0,<4.30.0 @@ -415,7 +430,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
/bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_transformers {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,311}-vertex-ai] +[testenv:py{39,312}-vertex-ai] deps = tensorflow==2.12.0 extras = test,gcp @@ -428,7 +443,7 @@ commands = /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_vertex_ai {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-embeddings] +[testenv:py{39,310}-embeddings] deps = sentence-transformers==2.2.2 passenv = HF_INFERENCE_TOKEN @@ -441,7 +456,7 @@ commands = /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39}-TFHubEmbeddings-{014,015}] +[testenv:py{39,310}-TFHubEmbeddings-{014,015}] deps = 014: tensorflow-hub>=0.14.0,<0.15.0 # Help pip resolve conflict with typing-extensions due to an old version of tensorboard https://github.com/apache/beam/issues/30852 diff --git a/sdks/standard_external_transforms.yaml b/sdks/standard_external_transforms.yaml index 05aa3c9b9de5..3c8c09ceb361 100644 --- a/sdks/standard_external_transforms.yaml +++ b/sdks/standard_external_transforms.yaml @@ -19,7 +19,7 @@ # configuration in /sdks/standard_expansion_services.yaml. # Refer to gen_xlang_wrappers.py for more info. # -# Last updated on: 2024-04-18 +# Last updated on: 2024-08-27 - default_service: sdks:java:io:expansion-service:shadowJar description: 'Outputs a PCollection of Beam Rows, each containing a single INT64 @@ -43,8 +43,7 @@ description: Specifies the rate to generate a given number of elements per a given number of seconds. Applicable only to unbounded sequences. nullable: true - type: Row(elements=, seconds=typing.Union[numpy.int64, - NoneType]) + type: Row(elements=, seconds=typing.Optional[numpy.int64]) start: description: The minimum number to generate (inclusive). nullable: false diff --git a/website/www/site/config.toml b/website/www/site/config.toml index 23d88f752c2e..e937289fbde7 100644 --- a/website/www/site/config.toml +++ b/website/www/site/config.toml @@ -104,7 +104,7 @@ github_project_repo = "https://github.com/apache/beam" [params] description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." -release_latest = "2.58.1" +release_latest = "2.59.0" # The repository and branch where the files live in Github or Colab. This is used # to serve and stage from your local branch, but publish to the master branch. # e.g. 
https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb
diff --git a/website/www/site/content/en/blog/beam-2.59.0.md b/website/www/site/content/en/blog/beam-2.59.0.md
new file mode 100644
index 000000000000..68b712ae8fe1
--- /dev/null
+++ b/website/www/site/content/en/blog/beam-2.59.0.md
@@ -0,0 +1,76 @@
+---
+title: "Apache Beam 2.59.0"
+date: 2024-09-11 13:00:00 -0800
+categories:
+  - blog
+  - release
+authors:
+  - lostluck
+---
+
+
+We are happy to present the new 2.59.0 release of Beam.
+This release includes both improvements and new functionality.
+See the [download page](/get-started/downloads/#2590-2024-09-11) for this release.
+
+
+
+For more information on changes in 2.59.0, check out the [detailed release notes](https://github.com/apache/beam/milestone/23).
+
+## Highlights
+
+* Added support for setting a configurable timeout when loading a model and performing inference in the [RunInference](https://beam.apache.org/documentation/ml/inference-overview/) transform using [with_exception_handling](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference.with_exception_handling) ([#32137](https://github.com/apache/beam/issues/32137))
+* Initial experimental support for using Prism with the Java and Python SDKs
+  * Prism presently targets local testing and other small-scale execution.
+  * For Java, use 'PrismRunner' or 'TestPrismRunner' as an argument to the `--runner` flag.
+  * For Python, use 'PrismRunner' as an argument to the `--runner` flag.
+  * Go already uses Prism as the default local runner.
+
+## I/Os
+
+* Improvements to the performance of BigqueryIO when using the withPropagateSuccessfulStorageApiWrites(true) method (Java) ([#31840](https://github.com/apache/beam/pull/31840)).
+* [Managed Iceberg] Added support for writing to partitioned tables ([#32102](https://github.com/apache/beam/pull/32102))
+* Update ClickHouseIO to use the latest version of the ClickHouse JDBC driver ([#32228](https://github.com/apache/beam/issues/32228)).
+* Add ClickHouseIO dedicated User-Agent ([#32252](https://github.com/apache/beam/issues/32252)).
+
+## New Features / Improvements
+
+* BigQuery endpoint can be overridden via PipelineOptions, which enables BigQuery emulators (Java) ([#28149](https://github.com/apache/beam/issues/28149)).
+* Go SDK Minimum Go Version updated to 1.21 ([#32092](https://github.com/apache/beam/pull/32092)).
+* [BigQueryIO] Added support for withFormatRecordOnFailureFunction() for STORAGE_WRITE_API and STORAGE_API_AT_LEAST_ONCE methods (Java) ([#31354](https://github.com/apache/beam/issues/31354)).
+* Updated Go protobuf package to new version (Go) ([#21515](https://github.com/apache/beam/issues/21515)).
+* Added support for setting a configurable timeout when loading a model and performing inference in the [RunInference](https://beam.apache.org/documentation/ml/inference-overview/) transform using [with_exception_handling](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference.with_exception_handling) ([#32137](https://github.com/apache/beam/issues/32137))
+* Adds OrderedListState support for Java SDK via FnApi.
+* Initial support for using Prism from the Python and Java SDKs.
+
+## Bugfixes
+
+* Fixed incorrect service account impersonation flow for Python pipelines using BigQuery IOs ([#32030](https://github.com/apache/beam/issues/32030)).
+* Auto-disable broken and meaningless `upload_graph` feature when using Dataflow Runner V2 ([#32159](https://github.com/apache/beam/issues/32159)).
+* (Python) Upgraded google-cloud-storage to version 2.18.2 to fix a data corruption issue ([#32135](https://github.com/apache/beam/pull/32135)).
+* (Go) Fix corruption on State API writes. ([#32245](https://github.com/apache/beam/issues/32245)).
+
+## Known Issues
+
+* Prism is under active development and does not yet support all pipelines. See [#29650](https://github.com/apache/beam/issues/29650) for progress.
+  * In the 2.59.0 release, Prism passes most runner validation tests, with the exception of pipelines using the following features:
+  OrderedListState, OnWindowExpiry (e.g. GroupIntoBatches), CustomWindows, MergingWindowFns, Trigger and WindowingStrategy associated features, Bundle Finalization, Looping Timers, and some Coder-related issues such as Python combiner packing, Java Schema transforms, and heterogeneous flatten coders. Processing-time timers do not yet have real-time support.
+  * If your pipeline is having difficulty with the Python or Java direct runners, but runs well on Prism, please let us know.
+
+For the most up-to-date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md
+
+## List of Contributors
+
+According to git shortlog, the following people contributed to the 2.59.0 release. Thank you to all contributors!
+
+Ahmed Abualsaud,Ahmet Altay,Andrew Crites,atask-g,Axel Magnuson,Ayush Pandey,Bartosz Zablocki,Chamikara Jayalath,cutiepie-10,Damon,Danny McCormick,dependabot[bot],Eddie Phillips,Francis O'Hara,Hyeonho Kim,Israel Herraiz,Jack McCluskey,Jaehyeon Kim,Jan Lukavský,Jeff Kinard,Jeffrey Kinard,jonathan-lemos,jrmccluskey,Kirill Berezin,Kiruphasankaran Nataraj,lahariguduru,liferoad,lostluck,Maciej Szwaja,Manit Gupta,Mark Zitnik,martin trieu,Naireen Hussain,Prerit Chandok,Radosław Stankiewicz,Rebecca Szper,Robert Bradshaw,Robert Burke,ron-gal,Sam Whittle,Sergei Lilichenko,Shunping Huang,Svetak Sundhar,Thiago Nunes,Timothy Itodo,tvalentyn,twosom,Vatsal,Vitaly Terentyev,Vlado Djerek,Yifan Ye,Yi Hu
diff --git a/website/www/site/content/en/blog/unit-testing-in-beam.md b/website/www/site/content/en/blog/unit-testing-in-beam.md
new file mode 100644
index 000000000000..b11ced8c6e30
--- /dev/null
+++ b/website/www/site/content/en/blog/unit-testing-in-beam.md
@@ -0,0 +1,198 @@
+---
+title: "Unit Testing in Beam: An opinionated guide"
+date: 2024-09-13 00:00:01 -0800
+categories:
+  - blog
+aliases:
+  - /blog/2024/09/09/unit-testing-in-beam.html
+authors:
+  - svetakvsundhar
+---
+
+
+Testing remains one of the most fundamental components of software engineering. In this blog post, we shed light on some of the constructs that Apache Beam provides for testing.
+We cover an opinionated set of best practices to write unit tests for your data pipeline. This post doesn't cover integration tests, which you need to author separately.
+All snippets in this post are included in [this notebook](https://github.com/apache/beam/blob/master/examples/notebooks/blog/unittests_in_beam.ipynb). Additionally, the [Beam starter projects](https://beam.apache.org/blog/beam-starter-projects/) contain tests that exhibit these best practices.
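+
+To ground the practices below, here is a minimal sketch of the core testing constructs this post relies on (`TestPipeline`, `Create`, `assert_that`, and `equal_to`); the test class, element values, and lambda are illustrative only:
+
+```python
+import unittest
+
+import apache_beam as beam
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that, equal_to
+
+
+class MinimalPipelineTest(unittest.TestCase):
+  def test_doubles_elements(self):
+    # TestPipeline builds and runs the pipeline in memory when the
+    # with-block exits.
+    with TestPipeline() as p:
+      output = (
+          p
+          | beam.Create([1, 2, 3])        # Static, in-memory test input.
+          | beam.Map(lambda x: x * 2))    # Logic under test.
+      # Assert on the contents of the final PCollection (order-independent).
+      assert_that(output, equal_to([2, 4, 6]))
+```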
+ +## Best practices + +When testing Beam pipelines, we recommend the following best practices: + +1) Don't write unit tests for the already supported connectors in the Beam Library, such as `ReadFromBigQuery` and `WriteToText`. These connectors are already tested in Beam’s test suite to ensure correct functionality. They add unnecessary cost and dependencies to a unit test. + +2) Ensure that your function is well tested when using it with `Map`, `FlatMap`, or `Filter`. You can assume your function will work as intended when using `Map(your_function)`. +3) For more complex transforms such as `ParDo`’s, side inputs, timestamp inspection, etc., treat the entire transform as a unit, and test it. +4) If needed, use mocking to mock any API calls that might be present in your DoFn. The purpose of mocking is to test your functionality extensively, even if this testing requires a specific response from an API call. + + 1) Be sure to modularize your API calls in separate functions, rather than making the API call directly in the `DoFn`. This step provides a cleaner experience when mocking the external API calls. + + +## Example 1 + +Use the following pipeline as an example. You don't have to write a separate unit test to test this function in the context of this pipeline, assuming the function `median_house_value_per_bedroom` is unit tested elsewhere in the code. You can trust that the `Map` primitive works as expected (this illustrates point #2 noted previously). + +```python +# The following code computes the median house value per bedroom. + +with beam.Pipeline() as p1: + result = ( + p1 + | ReadFromText("/content/sample_data/california_housing_test.csv",skip_header_lines=1) + | beam.Map(median_house_value_per_bedroom) + | WriteToText("/content/example2") + ) +``` + +## Example 2 + +Use the following function as the example. The functions `median_house_value_per_bedroom` and `multiply_by_factor` are tested elsewhere, but the pipeline as a whole, which consists of composite transforms, is not. + +```python +with beam.Pipeline() as p2: + result = ( + p2 + | ReadFromText("/content/sample_data/california_housing_test.csv",skip_header_lines=1) + | beam.Map(median_house_value_per_bedroom) + | beam.Map(multiply_by_factor) + | beam.CombinePerKey(sum) + | WriteToText("/content/example3") + ) +``` + +The best practice for the previous code is to create a transform with all functions between `ReadFromText` and `WriteToText`. This step separates the transformation logic from the I/Os, allowing you to unit test the transformation logic. The following example is a refactoring of the previous code: + +```python +def transform_data_set(pcoll): + return (pcoll + | beam.Map(median_house_value_per_bedroom) + | beam.Map(multiply_by_factor) + | beam.CombinePerKey(sum)) + +# Define a new class that inherits from beam.PTransform. 
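+# Wrapping the shared transforms in a PTransform subclass lets a unit test
+# apply exactly the same composite that the production pipeline uses, while
+# keeping the ReadFromText/WriteToText I/O out of the unit under test.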
+class MapAndCombineTransform(beam.PTransform):
+  def expand(self, pcoll):
+    return transform_data_set(pcoll)
+
+with beam.Pipeline() as p2:
+  result = (
+      p2
+      | ReadFromText("/content/sample_data/california_housing_test.csv",skip_header_lines=1)
+      | MapAndCombineTransform()
+      | WriteToText("/content/example3")
+  )
+```
+
+This code shows the corresponding unit test for the previous example:
+
+```python
+import unittest
+import apache_beam as beam
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.util import assert_that, equal_to
+
+
+class TestBeam(unittest.TestCase):
+
+# This test corresponds to example 3, and is written to confirm the pipeline works as intended.
+  def test_transform_data_set(self):
+    expected=[(1, 10570.185786231425), (2, 13.375337533753376), (3, 13.315649867374006)]
+    input_elements = [
+      '-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000',
+      '121.05,99.99,23.30,39.5,55.55,41.01,10,34,74.30,91.91',
+      '122.05,100.99,24.30,40.5,56.55,42.01,11,35,75.30,92.91',
+      '-120.05,39.37,29.00,4085.00,681.00,1557.00,626.00,6.8085,364700.00'
+    ]
+    with beam.Pipeline() as p2:
+      result = (
+        p2
+        | beam.Create(input_elements)
+        | MapAndCombineTransform()  # Apply the composite PTransform directly, not via beam.Map.
+      )
+      assert_that(result,equal_to(expected))
+```
+
+## Example 3
+
+Suppose we write a pipeline that reads data from a JSON file, passes it through a custom function that makes external API calls for parsing, and then writes it to a custom destination (for example, if we need to do some custom data formatting to have data prepared for a downstream application).
+
+
+The pipeline has the following structure:
+
+```python
+# The following packages are used to run the example pipelines.
+
+import apache_beam as beam
+from apache_beam.io import ReadFromText, WriteToText
+from apache_beam.options.pipeline_options import PipelineOptions
+
+class MyDoFn(beam.DoFn):
+  def process(self,element):
+    returned_record = MyApiCall.get_data("http://my-api-call.com")
+    if len(returned_record)!=10:
+      raise ValueError("Length of record does not match expected length")
+    yield returned_record
+
+with beam.Pipeline() as p3:
+  result = (
+      p3
+      | ReadFromText("/content/sample_data/anscombe.json")
+      | beam.ParDo(MyDoFn())
+      | WriteToText("/content/example1")
+  )
+```
+
+The following test mocks the external API call to return a record of the wrong length and verifies that the pipeline raises the expected error.
+
+```python
+!pip install mock # Install the 'mock' module.
+```
+```python
+# Import the mock package for mocking functionality.
+from unittest.mock import Mock,patch
+# from MyApiCall import get_data
+import mock
+
+
+# MyApiCall is a function that calls get_data to fetch some data by using an API call.
+@patch('MyApiCall.get_data')
+def test_error_message_wrong_length(self, mock_get_data):
+  response = ['field1','field2']
+  mock_get_data.return_value = Mock()
+  mock_get_data.return_value.json.return_value=response
+
+  input_elements = ['-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000'] #input length 9
+  with self.assertRaisesRegex(ValueError,
+                              "Length of record does not match expected length"):
+    p3 = beam.Pipeline()
+    result = p3 | beam.Create(input_elements) | beam.ParDo(MyDoFn())
+    p3.run()  # Run the pipeline so that MyDoFn executes and the error is raised.
+```
+
+## Other testing best practices:
+
+1) Test all error messages that you raise.
+2) Cover any edge cases that might exist in your data.
+3) Example 1 could have used lambda functions in the `beam.Map` step instead of `beam.Map(median_house_value_per_bedroom)`:
+
+```python
+beam.Map(lambda x: x.strip().split(',')) | beam.Map(lambda x: float(x[8])/float(x[4]))
+```
+
+Moving the lambdas into a named helper function and using `beam.Map(median_house_value_per_bedroom)` is the recommended approach for more testable code, because changes to the logic stay modularized in one place.
+
+4) Use the `assert_that` statement to ensure that `PCollection` values match correctly, as in the previous example.
+
+For more guidance about testing on Beam and Dataflow, see the [Google Cloud documentation](https://cloud.google.com/dataflow/docs/guides/develop-and-test-pipelines). For more examples of unit testing in Beam, see the `base_test.py` [code](https://github.com/apache/beam/blob/736cf50430b375d32093e793e1556567557614e9/sdks/python/apache_beam/ml/inference/base_test.py#L262).
+
+Special thanks to Robert Bradshaw, Danny McCormick, XQ Hu, Surjit Singh, and Rebecca Szper, who helped refine the ideas in this post.
+
diff --git a/website/www/site/content/en/case-studies/schrodinger.md b/website/www/site/content/en/case-studies/schrodinger.md
new file mode 100644
index 000000000000..178f87a768d0
--- /dev/null
+++ b/website/www/site/content/en/case-studies/schrodinger.md
@@ -0,0 +1,19 @@
+---
+title: "Schrodinger"
+icon: /images/logos/powered-by/schrodinger.png
+hasNav: false
+hasLink: "https://www.schrodinger.com/"
+---
+
diff --git a/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md b/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md
index 744b12aa1625..05767fea4846 100644
--- a/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md
+++ b/website/www/site/content/en/documentation/pipelines/test-your-pipeline.md
@@ -65,6 +65,27 @@ with TestPipeline as p:
     ...
 {{< /highlight >}}
+{{< highlight go >}}
+import "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest"
+
+// Override TestMain with ptest.Main,
+// once per package.
+func TestMain(m *testing.M) {
+	ptest.Main(m)
+}
+
+func TestPipeline(t *testing.T) {
+  ...
+  // The Go SDK doesn't use a TestPipeline concept,
+  // and recommends using the ptest harness
+  // to wrap pipeline construction.
+  pr := ptest.BuildAndRun(t, func(s beam.Scope) {
+    ...
+  })
+  ...
+}
+{{< /highlight >}}
+
 > **Note:** Read about testing unbounded pipelines in Beam in [this blog post](/blog/2016/10/20/test-stream.html).
 ### Using the Create Transform
@@ -100,6 +121,16 @@ assert_that(
     equal_to(["elem1", "elem3", "elem2"]))
 {{< /highlight >}}
+{{< highlight go >}}
+import "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
+
+output := ... // beam.PCollection
+
+// Check whether a PCollection contains some elements in any order.
+passert.EqualsList(s, output, []string{"elem1", "elem3", "elem2"})
+{{< /highlight >}}
+
+
 {{< paragraph class="language-java" >}}
 Any Java code that uses `PAssert` must link in `JUnit` and `Hamcrest`. If you're using Maven, you can link in `Hamcrest` by adding the following dependency to your project's `pom.xml` file:
 {{< /paragraph >}}
@@ -194,6 +225,48 @@ class CountTest(unittest.TestCase):
 # The pipeline will run and verify the results.
 {{< /highlight >}}
+{{< highlight go >}}
+import (
+	"fmt"
+	"testing"
+
+	"github.com/apache/beam/sdks/v2/go/pkg/beam"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
+)
+
+// formatFn takes a key value pair and puts them
+// into a single string for comparison.
+func formatFn(w string, c int) string {
+	return fmt.Sprintf("%s: %d", w, c)
+}
+
+// Register the functional DoFn to ensure execution on workers.
+func init() {
+	register.Function2x1(formatFn)
+}
+
+func TestCountWords(t *testing.T) {
+	// The pipeline will run and verify the results.
+	ptest.BuildAndRun(t, func(s beam.Scope) {
+		words := []string{"hi", "there", "hi", "hi", "sue", "bob",
+			"hi", "sue", "", "", "ZOW", "bob", ""}
+
+		wantCounts := []string{"hi: 5", "there: 1", "sue: 2", "bob: 2"}
+
+		// Create a PCollection from the words static input data.
+		input := beam.CreateList(s, words)
+
+		// Apply the Count transform under test.
+		output := stats.Count(s, input)
+		formatted := beam.ParDo(s, formatFn, output)
+
+		// Assert that the output PCollection matches the wantCounts data.
+		passert.EqualsList(s, formatted, wantCounts)
+	})
+}
+{{< /highlight >}}
 ## Testing a Pipeline End-to-End
 You can use the test classes in the Beam SDKs (such as `TestPipeline` and `PAssert` in the Beam SDK for Java) to test an entire pipeline end-to-end. Typically, to test an entire pipeline, you do the following:
@@ -283,3 +356,45 @@ class WordCountTest(unittest.TestCase):
 # The pipeline will run and verify the results.
 {{< /highlight >}}
+
+{{< highlight go >}}
+package wordcount
+
+import (
+	"testing"
+
+	"github.com/apache/beam/sdks/v2/go/pkg/beam"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest"
+)
+
+// CountWords and formatFn are omitted for conciseness.
+// Code for the full transforms can be found here:
+// https://github.com/apache/beam/blob/master/sdks/go/examples/debugging_wordcount/debugging_wordcount.go
+
+func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection { ... }
+
+func formatFn(w string, c int) string { ... }
+
+func TestCountWords(t *testing.T) {
+	// The pipeline will run and verify the results.
+	ptest.BuildAndRun(t, func(s beam.Scope) {
+		words := []string{"hi", "there", "hi", "hi", "sue", "bob",
+			"hi", "sue", "", "", "ZOW", "bob", ""}
+
+		wantCounts := []string{"hi: 5", "there: 1", "sue: 2", "bob: 2"}
+
+		// Create a PCollection from the words static input data.
+		input := beam.CreateList(s, words)
+
+		// Run ALL the pipeline's transforms
+		// (in this case, the CountWords composite transform).
+		output := CountWords(s, input)
+		formatted := beam.ParDo(s, formatFn, output)
+
+		// Assert that the output PCollection matches
+		// the wantCounts data.
+		passert.EqualsList(s, formatted, wantCounts)
+	})
+}
+{{< /highlight >}}
diff --git a/website/www/site/content/en/documentation/programming-guide.md b/website/www/site/content/en/documentation/programming-guide.md
index 4005879a48ac..c716c7554db4 100644
--- a/website/www/site/content/en/documentation/programming-guide.md
+++ b/website/www/site/content/en/documentation/programming-guide.md
@@ -6096,6 +6096,18 @@ func (fn *MyDoFn) ProcessElement(ctx context.Context, ...)
 {
 }
 {{< /highlight >}}
+{{< highlight py>}}
+from apache_beam import metrics
+
+class MyDoFn(beam.DoFn):
+  def __init__(self):
+    self.counter = metrics.Metrics.counter("namespace", "counter1")
+
+  def process(self, element):
+    self.counter.inc()
+    yield element
+{{< /highlight >}}
+
 **Distribution**: A metric that reports information about the distribution of reported values.
 {{< highlight java >}}
@@ -6120,6 +6132,16 @@ func (fn *MyDoFn) ProcessElement(ctx context.Context, v int64, ...) {
 }
 {{< /highlight >}}
+{{< highlight py >}}
+class MyDoFn(beam.DoFn):
+  def __init__(self):
+    self.distribution = metrics.Metrics.distribution("namespace", "distribution1")
+
+  def process(self, element):
+    self.distribution.update(element)
+    yield element
+{{< /highlight >}}
+
 **Gauge**: A metric that reports the latest value out of reported values. Since metrics are collected from many workers the value may not be the absolute last, but one of the latest values.
@@ -6145,6 +6167,16 @@ func (fn *MyDoFn) ProcessElement(ctx context.Context, v int64, ...) {
 }
 {{< /highlight >}}
+{{< highlight py >}}
+class MyDoFn(beam.DoFn):
+  def __init__(self):
+    self.gauge = metrics.Metrics.gauge("namespace", "gauge1")
+
+  def process(self, element):
+    self.gauge.set(element)
+    yield element
+{{< /highlight >}}
+
 ### 10.3. Querying metrics {#querying-metrics}
 {{< paragraph class="language-java language-python">}}
 `PipelineResult` has a method `metrics()` which returns a `MetricResults` object that allows
@@ -6159,6 +6191,17 @@ matching a given filter. It takes in a predicate with a `SingleResult` paramete
 be used for custom filters.
 {{< /paragraph >}}
+
+{{< paragraph class="language-py">}}
+`PipelineResult` has a `metrics` method that returns a `MetricResults` object. The `MetricResults` object lets you
+access metrics. The main method available in the `MetricResults` object, `query`, lets you
+query all metrics that match a given filter. The `query` method takes in a `MetricsFilter` object that you can
+use to filter by several different criteria. Querying a `MetricResults` object returns
+a dictionary of lists of `MetricResult` objects, with the dictionary organizing them by type,
+for example, `Counter`, `Distribution`, and `Gauge`. The `MetricResult` object contains a `result` function
+that gets the value of the metric and has a `key` property. The `key` property contains information about
+the namespace and the name of the metric.
+{{< /paragraph >}}
+
 {{< highlight java >}}
 public interface PipelineResult {
   MetricResults metrics();
@@ -6186,6 +6229,20 @@ public interface MetricResult {
 {{< code_sample "sdks/go/examples/snippets/10metrics.go" metrics_query >}}
 {{< /highlight >}}
+
+{{< highlight py >}}
+class PipelineResult:
+  def metrics(self) -> MetricResults:
+    """Returns the metric results from the pipeline."""
+
+class MetricResults:
+  def query(self, filter: MetricsFilter) -> Dict[str, List[MetricResult]]:
+    """Filters the results against the specified filter."""
+
+class MetricResult:
+  def result(self):
+    """Returns the value of the metric."""
+{{< /highlight >}}
+
 ### 10.4. Using metrics in pipeline {#using-metrics}
 Below, there is a simple example of how to use a `Counter` metric in a user pipeline.
@@ -6228,6 +6285,28 @@ public class MyMetricsDoFn extends DoFn {
 {{< code_sample "sdks/go/examples/snippets/10metrics.go" metrics_pipeline >}}
 {{< /highlight >}}
+
+{{< highlight py >}}
+class MyMetricsDoFn(beam.DoFn):
+  def __init__(self):
+    self.counter = metrics.Metrics.counter("namespace", "counter1")
+
+  def process(self, element):
+    self.counter.inc()
+    yield element
+
+pipeline = beam.Pipeline()
+
+pipeline | beam.Create([1, 2, 3]) | beam.ParDo(MyMetricsDoFn())
+
+result = pipeline.run()
+result.wait_until_finish()
+
+metric_results = result.metrics().query(
+    metrics.MetricsFilter().with_namespace("namespace").with_name("counter1"))
+
+for metric in metric_results["counters"]:
+  print(metric)
+{{< /highlight >}}
+
 ### 10.5. Export metrics {#export-metrics}
 Beam metrics can be exported to external sinks. If a metrics sink is set up in the configuration, the runner will push metrics to it at a default 5s period.
diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md
index fa822d31e6d2..08614b8835c1 100644
--- a/website/www/site/content/en/get-started/downloads.md
+++ b/website/www/site/content/en/get-started/downloads.md
@@ -96,24 +96,31 @@ versions denoted `0.x.y`.
 ## Releases
+### 2.59.0 (2024-09-11)
+Official [source code download](https://downloads.apache.org/beam/2.59.0/apache-beam-2.59.0-source-release.zip).
+[SHA-512](https://downloads.apache.org/beam/2.59.0/apache-beam-2.59.0-source-release.zip.sha512).
+[signature](https://downloads.apache.org/beam/2.59.0/apache-beam-2.59.0-source-release.zip.asc).
+
+[Release notes](https://github.com/apache/beam/releases/tag/v2.59.0)
+
 ### 2.58.1 (2024-08-15)
-Official [source code download](https://downloads.apache.org/beam/2.58.1/apache-beam-2.58.1-source-release.zip).
-[SHA-512](https://downloads.apache.org/beam/2.58.1/apache-beam-2.58.1-source-release.zip.sha512).
-[signature](https://downloads.apache.org/beam/2.58.1/apache-beam-2.58.1-source-release.zip.asc).
+Official [source code download](https://archive.apache.org/dist/beam/2.58.1/apache-beam-2.58.1-source-release.zip).
+[SHA-512](https://archive.apache.org/dist/beam/2.58.1/apache-beam-2.58.1-source-release.zip.sha512).
+[signature](https://archive.apache.org/dist/beam/2.58.1/apache-beam-2.58.1-source-release.zip.asc).
 [Release notes](https://github.com/apache/beam/releases/tag/v2.58.1)
 ### 2.58.0 (2024-08-06)
-Official [source code download](https://archive.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip).
-[SHA-512](https://archive.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.sha512).
-[signature](https://archive.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.asc).
+Official [source code download](https://archive.apache.org/dist/beam/2.58.0/apache-beam-2.58.0-source-release.zip).
+[SHA-512](https://archive.apache.org/dist/beam/2.58.0/apache-beam-2.58.0-source-release.zip.sha512).
+[signature](https://archive.apache.org/dist/beam/2.58.0/apache-beam-2.58.0-source-release.zip.asc).
 [Release notes](https://github.com/apache/beam/releases/tag/v2.58.0)
 ### 2.57.0 (2024-06-26)
-Official [source code download](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip).
-[SHA-512](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512).
-[signature](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc).
+Official [source code download](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip).
+[SHA-512](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512). +[signature](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc). [Release notes](https://github.com/apache/beam/releases/tag/v2.57.0) diff --git a/website/www/site/layouts/partials/header.html b/website/www/site/layouts/partials/header.html index a01ce5de084e..139ae9bc885d 100644 --- a/website/www/site/layouts/partials/header.html +++ b/website/www/site/layouts/partials/header.html @@ -207,12 +207,6 @@