From 1c926d3eaf984caab63114627fedf1e184fe5611 Mon Sep 17 00:00:00 2001 From: Nam Bui Date: Fri, 10 Apr 2020 15:25:24 +0200 Subject: [PATCH] Runners and Contribute pages migration (#32) * Migrated Runners page * Migrated Contribute page * Refactored redirected links --- .../site/content/en/blog/capability-matrix.md | 2 +- .../www/site/content/en/contribute/_index.md | 55 +- .../en/contribute/become-a-committer.md | 3 - .../content/en/contribute/committer-guide.md | 7 +- .../content/en/contribute/dependencies.md | 5 +- .../content/en/contribute/design-documents.md | 8 +- .../content/en/contribute/feature-branches.md | 8 +- .../site/content/en/contribute/get-help.md | 9 +- .../content/en/contribute/jira-priorities.md | 3 - .../en/contribute/postcommits-guides.md | 3 - .../postcommits-policies-details.md | 3 - .../en/contribute/postcommits-policies.md | 21 +- .../en/contribute/precommit-policies.md | 3 - .../en/contribute/precommit-triage-guide.md | 3 - .../en/contribute/ptransform-style-guide.md | 58 +- .../content/en/contribute/release-blocking.md | 4 +- .../content/en/contribute/release-guide.md | 41 +- .../content/en/contribute/runner-guide.md | 77 +- .../www/site/content/en/contribute/team.md | 11 +- .../content/en/documentation/runners/apex.md | 16 +- .../runners/capability-matrix.md | 1729 ++++++++++++++++- .../en/documentation/runners/dataflow.md | 64 +- .../en/documentation/runners/direct.md | 37 +- .../content/en/documentation/runners/flink.md | 162 +- .../en/documentation/runners/gearpump.md | 19 +- .../content/en/documentation/runners/jet.md | 108 +- .../en/documentation/runners/jstorm.md | 13 +- .../en/documentation/runners/mapreduce.md | 12 +- .../content/en/documentation/runners/nemo.md | 10 +- .../content/en/documentation/runners/samza.md | 57 +- .../content/en/documentation/runners/spark.md | 156 +- .../www/site/layouts/contribute/baseof.html | 40 + .../site/layouts/documentation/baseof.html | 61 +- website/www/site/layouts/partials/header.html | 4 +- .../partials/section-menu/contribute.html | 45 + .../partials/section-menu/roadmap.html | 4 +- .../partials/section-menu/runners.html | 23 + website/www/site/layouts/runners/baseof.html | 40 + .../layouts/shortcodes/capability-matrix.html | 15 +- .../flink-java-pipeline-options.html | 210 ++ .../flink-python-pipeline-options.html | 209 ++ website/www/site/static/.htaccess | 4 + 42 files changed, 2697 insertions(+), 665 deletions(-) create mode 100644 website/www/site/layouts/contribute/baseof.html create mode 100644 website/www/site/layouts/partials/section-menu/contribute.html create mode 100644 website/www/site/layouts/partials/section-menu/runners.html create mode 100644 website/www/site/layouts/runners/baseof.html create mode 100644 website/www/site/layouts/shortcodes/flink-java-pipeline-options.html create mode 100644 website/www/site/layouts/shortcodes/flink-python-pipeline-options.html create mode 100644 website/www/site/static/.htaccess diff --git a/website/www/site/content/en/blog/capability-matrix.md b/website/www/site/content/en/blog/capability-matrix.md index cd02be4ba39a7..9f1406a261dad 100644 --- a/website/www/site/content/en/blog/capability-matrix.md +++ b/website/www/site/content/en/blog/capability-matrix.md @@ -550,7 +550,7 @@ capability-matrix-snapshot: l2: '' l3: '' - - name: 'Accumulating & Retracting' + - name: 'Accumulating & Retracting' values: - class: model jira: BEAM-91 diff --git a/website/www/site/content/en/contribute/_index.md b/website/www/site/content/en/contribute/_index.md index 
984d522233252..939739675a7d5 100644 --- a/website/www/site/content/en/contribute/_index.md +++ b/website/www/site/content/en/contribute/_index.md @@ -1,9 +1,6 @@ --- -layout: section title: "Beam Contribution Guide" -permalink: /contribute/ -section_menu: section-menu/contribute.html -redirect_from: +aliases: - /contribution-guide/ - /contribute/contribution-guide/ - /docs/contribute/ @@ -28,15 +25,13 @@ limitations under the License. The Apache Beam community welcomes contributions from anyone! -If you have questions, please [reach out to the Beam community]({{ site.baseurl }}/contribute/get-help). +If you have questions, please [reach out to the Beam community](/contribute/get-help). There are lots of opportunities to contribute: - - ask or answer questions on [user@beam.apache.org]({{ site.baseurl -}}/community/contact-us/) or + - ask or answer questions on [user@beam.apache.org](/community/contact-us/) or [stackoverflow](https://stackoverflow.com/questions/tagged/apache-beam) - - review proposed design ideas on [dev@beam.apache.org]({{ site.baseurl -}}/community/contact-us/) + - review proposed design ideas on [dev@beam.apache.org](/community/contact-us/) - improve the documentation - file [bug reports](https://issues.apache.org/jira/projects/BEAM/issues) - test releases @@ -101,10 +96,10 @@ gLinux users should configure their machines for sudoless Docker. ### Connect With the Beam community -1. Consider subscribing to the [dev@ mailing list]({{ site.baseurl}}/community/contact-us/), especially +1. Consider subscribing to the [dev@ mailing list](/community/contact-us/), especially if you plan to make more than one change or the change will be large. All decisions happen on the public dev list. -1. (Optionally) Join the [#beam channel of the ASF slack]({{ site.baseurl}}/community/contact-us/). +1. (Optionally) Join the [#beam channel of the ASF slack](/community/contact-us/). 1. Create an account on [Beam issue tracker (JIRA)](https://issues.apache.org/jira/projects/BEAM/issues) (anyone can do this). @@ -116,15 +111,15 @@ gLinux users should configure their machines for sudoless Docker. 1. If you want to get involved but don't have a project in mind, check our list of open starter tasks, [https://s.apache.org/beam-starter-tasks](https://s.apache.org/beam-starter-tasks). 1. Assign the issue to yourself. To get the permission to do so, email - the [dev@ mailing list]({{ site.baseurl }}/community/contact-us) + the [dev@ mailing list](/community/contact-us) to introduce yourself and to be added as a contributor in the Beam issue tracker including your ASF Jira Username. For example [this welcome email]( https://lists.apache.org/thread.html/e6018c2aaf7dc7895091434295e5b0fafe192b975e3e3761fcf0cda7@%3Cdev.beam.apache.org%3E). 1. If your change is large or it is your first change, it is a good idea to - [discuss it on the dev@ mailing list]({{ site.baseurl }}/community/contact-us/). + [discuss it on the dev@ mailing list](/community/contact-us/). 1. For large changes create a design doc ([template](https://s.apache.org/beam-design-doc-template), - [examples](https://s.apache.org/beam-design-docs)) and email it to the [dev@ mailing list]({{ site.baseurl }}/community/contact-us). + [examples](https://s.apache.org/beam-design-docs)) and email it to the [dev@ mailing list](/community/contact-us). ### Development Setup @@ -195,15 +190,15 @@ gLinux users should configure their machines for sudoless Docker. that start various post-commit tests suites. 
Use these sparingly because post-commit tests consume shared development resources. 1. Pull requests can only be merged by a - [Beam committer]({{ site.baseurl }}/contribute/team/). + [Beam committer](/contribute/team). To find a committer for your area, either: - look in the OWNERS file of the directory where you changed files, or - look for similar code merges, or - - ask on [dev@beam.apache.org]({{ site.baseurl }}/community/contact-us/) + - ask on [dev@beam.apache.org](/community/contact-us/) Use `R: @username` in the pull request to notify a reviewer. -1. If you don't get any response in 3 business days, email the [dev@ mailing list]({{ site.baseurl }}/community/contact-us) to ask for someone to look at your pull +1. If you don't get any response in 3 business days, email the [dev@ mailing list](/community/contact-us) to ask for someone to look at your pull request. ### Make the reviewer's job easier @@ -245,41 +240,39 @@ unassigned from the author but will stay open. Anyone can access it and browse issues. Anyone can register an account and login to create issues or add comments. Only contributors can be assigned issues. If you want to be assigned issues, a PMC member can add you to the project contributor - group. Email the [dev@ mailing list]({{ site.baseurl }}/community/contact-us) + group. Email the [dev@ mailing list](/community/contact-us) to ask to be added as a contributor in the Beam issue tracker, and include your ASF Jira username. - [Beam Wiki Space](https://cwiki.apache.org/confluence/display/BEAM/Apache+Beam): Anyone has read access. If you wish to contribute changes, please create an account and request edit access on the - [dev@ mailing list]({{ site.baseurl }}/community/contact-us) (include your Wiki account user ID). + [dev@ mailing list](/community/contact-us) (include your Wiki account user ID). - Pull requests can only be merged by a - [Beam committer]({{ site.baseurl }}/contribute/team/). + [Beam committer](/contribute/team). - [Voting on a release](https://www.apache.org/foundation/voting.html): Everyone can vote. Only - [Beam PMC]({{ site.baseurl }}/contribute/team/) members should mark their votes as binding. + [Beam PMC](/contribute/team) members should mark their votes as binding. ## Communication All communication is expected to align with the [Code of Conduct](https://www.apache.org/foundation/policies/conduct). -Discussion about contributing code to Beam happens on the [dev@ mailing list]({{ site.baseurl -}}/community/contact-us/). Introduce yourself! +Discussion about contributing code to Beam happens on the [dev@ mailing list](/community/contact-us/). Introduce yourself! -Questions can be asked on the [#beam channel of the ASF slack]({{ site.baseurl -}}/community/contact-us/). Introduce yourself! +Questions can be asked on the [#beam channel of the ASF slack](/community/contact-us/). Introduce yourself! ## Additional resources If you are contributing a `PTransform` to Beam, we have an extensive -[PTransform Style Guide]({{ site.baseurl }}/contribute/ptransform-style-guide). +[PTransform Style Guide](/contribute/ptransform-style-guide). If you are contributing a Runner to Beam, refer to the -[Runner authoring guide]({{ site.baseurl }}/contribute/runner-guide/) +[Runner authoring guide](/contribute/runner-guide/) Review [design documents](https://s.apache.org/beam-design-docs). A great way to contribute is to join an existing effort. For the most -intensive efforts, check out the [roadmap]({{site.baseurl}}/roadmap/). 
+intensive efforts, check out the [roadmap](/roadmap/). You can also find a more exhaustive list on the [Beam developers' wiki]( https://cwiki.apache.org/confluence/display/BEAM/Apache+Beam) @@ -288,9 +281,9 @@ https://cwiki.apache.org/confluence/display/BEAM/Apache+Beam) If you run into any issues, check out the [contribution FAQ]( https://cwiki.apache.org/confluence/display/BEAM/Contributor+FAQ) or ask on -on the [dev@ mailing list]({{ site.baseurl}}/community/contact-us/) or -[#beam channel of the ASF slack]({{ site.baseurl}}/community/contact-us/). +on the [dev@ mailing list](/community/contact-us/) or +[#beam channel of the ASF slack](/community/contact-us/). ---- If you didn't find the information you were looking for in this guide, please -[reach out to the Beam community]({{ site.baseurl }}/community/contact-us). +[reach out to the Beam community](/community/contact-us). diff --git a/website/www/site/content/en/contribute/become-a-committer.md b/website/www/site/content/en/contribute/become-a-committer.md index 0adba85c10db4..e7f8160fb231e 100644 --- a/website/www/site/content/en/contribute/become-a-committer.md +++ b/website/www/site/content/en/contribute/become-a-committer.md @@ -1,8 +1,5 @@ --- -layout: section title: "Become A Committer" -permalink: /contribute/become-a-committer/ -section_menu: section-menu/contribute.html --- + + diff --git a/website/www/site/content/en/contribute/feature-branches.md b/website/www/site/content/en/contribute/feature-branches.md index d4735834459fa..24853e1621604 100644 --- a/website/www/site/content/en/contribute/feature-branches.md +++ b/website/www/site/content/en/contribute/feature-branches.md @@ -1,8 +1,5 @@ --- -layout: section title: "Beam Feature Branches" -permalink: /contribute/feature-branches/ -section_menu: section-menu/contribute.html --- + See the [download page](/get-started/downloads/{$DOWNLOAD_ANCHOR}) for this release. For more information on changes in {$RELEASE_VERSION}, check out the [detailed release notes]({$JIRA_RELEASE_NOTES}). @@ -890,7 +893,7 @@ Template: 1. Maven artifacts deployed to the staging repository of [repository.apache.org](https://repository.apache.org/content/repositories/) 1. Source distribution deployed to the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/) -1. Website pull request proposed to list the [release]({{ site.baseurl }}/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/). +1. Website pull request proposed to list the [release](/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/). 1. Docker images are published to [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags: {RELEASE}_rc{RC_NUM}. 
You can (optionally) also do additional verification by: @@ -1027,7 +1030,7 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma ``` Flink Local Runner ``` - ./gradlew :runners:flink:1.10:runQuickstartJavaFlinkLocal \ + ./gradlew :runners:flink:1.9:runQuickstartJavaFlinkLocal \ -Prepourl=https://repository.apache.org/content/repositories/orgapachebeam-${KEY} \ -Pver=${RELEASE_VERSION} ``` @@ -1178,7 +1181,6 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma ``` python -m apache_beam.examples.complete.game.leader_board \ --project=${YOUR_PROJECT} \ - --region=${GCE_REGION} \ --topic projects/${YOUR_PROJECT}/topics/${YOUR_PUBSUB_TOPIC} \ --dataset ${USER}_test \ --runner DataflowRunner \ @@ -1208,7 +1210,6 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma ``` python -m apache_beam.examples.complete.game.game_stats \ --project=${YOUR_PROJECT} \ - --region=${GCE_REGION} \ --topic projects/${YOUR_PROJECT}/topics/${YOUR_PUBSUB_TOPIC} \ --dataset ${USER}_test \ --runner DataflowRunner \ @@ -1278,7 +1279,7 @@ Make sure the download address for last release version is upldaed, [example PR] ./beam/release/src/main/scripts/publish_docker_images.sh ``` Verify that: -* Images are published at [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags {RELEASE} and *latest*. +* Images are published at [DockerHub](https://hub.docker.com/u/apachebeam) with tags {RELEASE} and *latest*. * Images with *latest* tag are pointing to current release by confirming 1. Digest of the image with *latest* tag is the same as the one with {RELEASE} tag. @@ -1292,7 +1293,7 @@ Create and push a new signed tag for the released version by copying the tag for ### Merge website pull request -Merge the website pull request to [list the release]({{ site.baseurl }}/get-started/downloads/), publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/), the [Java API reference manual](https://beam.apache.org/releases/javadoc/) and Blogpost created earlier. +Merge the website pull request to [list the release](/get-started/downloads/), publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/), the [Java API reference manual](https://beam.apache.org/releases/javadoc/) and Blogpost created earlier. ### Mark the version as released in JIRA @@ -1311,7 +1312,7 @@ __NOTE__: Only PMC members have permissions to do it, ping [dev@](mailto:dev@bea * Maven artifacts released and indexed in the [Maven Central Repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.beam%22) * Source distribution available in the release repository of [dist.apache.org](https://dist.apache.org/repos/dist/release/beam/) * Source distribution removed from the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/) -* Website pull request to [list the release]({{ site.baseurl }}/get-started/downloads/) and publish the [API reference manual](https://beam.apache.org/releases/javadoc/) merged +* Website pull request to [list the release](/get-started/downloads/) and publish the [API reference manual](https://beam.apache.org/releases/javadoc/) merged * Release tagged in the source code repository * Release version finalized in JIRA. (Note: Not all committers have administrator access to JIRA. If you end up getting permissions errors ask on the mailing list for assistance.) 
* Release version is listed at reporter.apache.org diff --git a/website/www/site/content/en/contribute/runner-guide.md b/website/www/site/content/en/contribute/runner-guide.md index c0f6d573b8704..482520547394d 100644 --- a/website/www/site/content/en/contribute/runner-guide.md +++ b/website/www/site/content/en/contribute/runner-guide.md @@ -1,8 +1,5 @@ --- -layout: section title: "Runner Authoring Guide" -section_menu: section-menu/contribute.html -permalink: /contribute/runner-guide/ --- + + diff --git a/website/www/site/content/en/documentation/runners/apex.md b/website/www/site/content/en/documentation/runners/apex.md index 33e977c9bab40..16257130278a4 100644 --- a/website/www/site/content/en/documentation/runners/apex.md +++ b/website/www/site/content/en/documentation/runners/apex.md @@ -1,8 +1,6 @@ --- -layout: section +type: runners title: "Apache Apex Runner" -section_menu: section-menu/runners.html -permalink: /documentation/runners/apex/ --- # Using the Apache Apex Runner -The Apex Runner executes Apache Beam pipelines using [Apache Apex](https://apex.apache.org/) as an underlying engine. The runner has broad support for the [Beam model and supports streaming and batch pipelines]({{ site.baseurl }}/documentation/runners/capability-matrix/). +The Apex Runner executes Apache Beam pipelines using [Apache Apex](http://apex.apache.org/) as an underlying engine. The runner has broad support for the [Beam model and supports streaming and batch pipelines](/documentation/runners/capability-matrix/). -[Apache Apex](https://apex.apache.org/) is a stream processing platform and framework for low-latency, high-throughput and fault-tolerant analytics applications on Apache Hadoop. Apex has a unified streaming architecture and can be used for real-time and batch processing. +[Apache Apex](http://apex.apache.org/) is a stream processing platform and framework for low-latency, high-throughput and fault-tolerant analytics applications on Apache Hadoop. Apex has a unified streaming architecture and can be used for real-time and batch processing. The following instructions are for running Beam pipelines with Apex on a YARN cluster. -They are not required for Apex in embedded mode (see [quickstart]({{ site.baseurl }}/get-started/quickstart-java/)). +They are not required for Apex in embedded mode (see [quickstart](/get-started/quickstart-java/)). ## Apex Runner prerequisites You may set up your own Hadoop cluster. Beam does not require anything extra to launch the pipelines on YARN. An optional Apex installation may be useful for monitoring and troubleshooting. -The Apex CLI can be [built](https://apex.apache.org/docs/apex/apex_development_setup/) or +The Apex CLI can be [built](http://apex.apache.org/docs/apex/apex_development_setup/) or obtained as binary build. -For more download options see [distribution information on the Apache Apex website](https://apex.apache.org/downloads.html). +For more download options see [distribution information on the Apache Apex website](http://apex.apache.org/downloads.html). ## Running wordcount with Apex @@ -71,7 +69,7 @@ it is necessary to augment the build to include the respective file system provi Depending on your installation, you may be able to monitor the progress of your job on the Hadoop cluster. Alternatively, you have following options: * YARN : Using YARN web UI generally running on 8088 on the node running resource manager. 
-* Apex command-line interface: [Using the Apex CLI to get running application information](https://apex.apache.org/docs/apex/apex_cli/#apex-cli-commands). +* Apex command-line interface: [Using the Apex CLI to get running application information](http://apex.apache.org/docs/apex/apex_cli/#apex-cli-commands). Check the output of the pipeline: ``` diff --git a/website/www/site/content/en/documentation/runners/capability-matrix.md b/website/www/site/content/en/documentation/runners/capability-matrix.md index 2e7d845b6f7c7..1f1292bbc6212 100644 --- a/website/www/site/content/en/documentation/runners/capability-matrix.md +++ b/website/www/site/content/en/documentation/runners/capability-matrix.md @@ -1,11 +1,1706 @@ --- -layout: section +type: runners title: "Apache Beam Capability Matrix" -section_menu: section-menu/runners.html -permalink: /documentation/runners/capability-matrix/ -redirect_from: +aliases: - /learn/runners/capability-matrix/ - /capability-matrix/ + +capability-matrix: + columns: + - class: model + name: Beam Model + - class: dataflow + name: Google Cloud Dataflow + - class: flink + name: Apache Flink + - class: spark-rdd + name: Apache Spark (RDD/DStream based) + - class: spark-dataset + name: Apache Spark Structured Streaming (Dataset based) + - class: apex + name: Apache Apex + - class: gearpump + name: Apache Gearpump + - class: mapreduce + name: Apache Hadoop MapReduce + - class: jstorm + name: JStorm + - class: ibmstreams + name: IBM Streams + - class: samza + name: Apache Samza + - class: nemo + name: Apache Nemo + - class: jet + name: Hazelcast Jet + + categories: + - description: What is being computed? + anchor: what + color-b: 'ca1' + color-y: 'ec3' + color-p: 'fe5' + color-n: 'ddd' + rows: + - name: ParDo + values: + - class: model + l1: 'Yes' + l2: element-wise processing + l3: Element-wise transformation parameterized by a chunk of user code. Elements are processed in bundles, with initialization and termination hooks. Bundle size is chosen by the runner and cannot be controlled by user code. ParDo processes a main input PCollection one element at a time, but provides side input access to additional PCollections. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Batch mode uses large bundle sizes. Streaming uses smaller bundle sizes. + - class: flink + l1: 'Yes' + l2: fully supported + l3: ParDo itself, as per-element transformation with UDFs, is fully supported by Flink for both batch and streaming. + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: ParDo applies per-element transformations as Spark FlatMapFunction. + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: ParDo applies per-element transformations as Spark FlatMapFunction. + - class: apex + l1: 'Yes' + l2: fully supported + l3: Supported through Apex operator that wraps the function and processes data as single element bundles. + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: Gearpump wraps the per-element transformation function into processor execution. + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Supported with per-element transformation. 
+ - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + - name: GroupByKey + values: + - class: model + l1: 'Yes' + l2: key grouping + l3: Grouping of key-value pairs per key, window, and pane. (See also other tabs.) + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: '' + - class: flink + l1: 'Yes' + l2: fully supported + l3: "Uses Flink's keyBy for key grouping. When grouping by window in streaming (creating the panes) the Flink runner uses the Beam code. This guarantees support for all windowing and triggering mechanisms." + - class: spark-rdd + l1: 'Partially' + l2: fully supported in batch mode + l3: "Using Spark's groupByKey. GroupByKey with multiple trigger firings in streaming mode is a work in progress." + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: "Using Spark's groupByKey." + - class: apex + l1: 'Yes' + l2: fully supported + l3: "Apex runner uses the Beam code for grouping by window and thereby has support for all windowing and triggering mechanisms. Runner does not implement partitioning yet (BEAM-838)" + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: "Use Gearpump's groupBy and window for key grouping and translate Beam's windowing and triggering to Gearpump's internal implementation." + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: "Uses Samza's partitionBy for key grouping and Beam's logic for window aggregation and triggering." + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + - name: Flatten + values: + - class: model + l1: 'Yes' + l2: collection concatenation + l3: Concatenates multiple homogenously typed collections together. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: '' + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: Some corner cases like flatten on empty collections are not yet supported. + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + - name: Combine + values: + - class: model + l1: 'Yes' + l2: associative & commutative aggregation + l3: 'Application of an associative, commutative operation over all values ("globally") or over all values associated with each key ("per key"). Can be implemented using ParDo, but often more efficient implementations exist.' + - class: dataflow + l1: 'Yes' + l2: 'efficient execution' + l3: '' + - class: flink + l1: 'Yes' + l2: 'fully supported' + l3: Uses a combiner for pre-aggregation for batch and streaming. + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: "Using Spark's combineByKey and aggregate functions." 
+ - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: "Using Spark's Aggregator and agg function" + - class: apex + l1: 'Yes' + l2: 'fully supported' + l3: "Default Beam translation. Currently no efficient pre-aggregation (BEAM-935)." + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Use combiner for efficient pre-aggregation. + - class: nemo + l1: 'Yes' + l2: fully supported + l3: 'Batch mode uses pre-aggregation' + - class: jet + l1: 'Yes' + l2: fully supported + l3: 'Batch mode uses pre-aggregation' + - name: Composite Transforms + values: + - class: model + l1: 'Yes' + l2: user-defined transformation subgraphs + l3: Allows easy extensibility for library writers. In the near future, we expect there to be more information provided at this level -- customized metadata hooks for monitoring, additional runtime/environment hooks, etc. + - class: dataflow + l1: 'Partially' + l2: supported via inlining + l3: Currently composite transformations are inlined during execution. The structure is later recreated from the names, but other transform level information (if added to the model) will be lost. + - class: flink + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: spark-rdd + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: supported via inlining only in batch mode + l3: '' + - class: apex + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: gearpump + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: samza + l1: 'Partially' + l2: supported via inlining + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Partially' + l2: supported via inlining + l3: '' + - name: Side Inputs + values: + - class: model + l1: 'Yes' + l2: additional elements available during DoFn execution + l3: Side inputs are additional PCollections whose contents are computed during pipeline execution and then made accessible to DoFn code. The exact shape of the side input depends both on the PCollectionView used to describe the access pattern (interable, map, singleton) and the window of the element from the main input that is currently being processed. + - class: dataflow + l1: 'Yes' + l2: some size restrictions in streaming + l3: Batch mode supports a distributed implementation, but streaming mode may force some size restrictions. Neither mode is able to push lookups directly up into key-based sources. + - class: flink + l1: 'Yes' + l2: some size restrictions in streaming + l3: Batch mode supports a distributed implementation, but streaming mode may force some size restrictions. Neither mode is able to push lookups directly up into key-based sources. + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: "Using Spark's broadcast variables. In streaming mode, side inputs may update but only between micro-batches." + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: "Using Spark's broadcast variables." 
+ - class: apex + l1: 'Yes' + l2: size restrictions + l3: No distributed implementation and therefore size restrictions. + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: Implemented by merging side input as a normal stream in Gearpump + - class: mapreduce + l1: 'Yes' + l2: fully supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: some size restrictions + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: Uses Samza's broadcast operator to distribute the side inputs. + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Partially' + l2: with restrictions + l3: Supported only when the side input source is bounded and windowing uses global window + - name: Source API + values: + - class: model + l1: 'Yes' + l2: user-defined sources + l3: Allows users to provide additional input sources. Supports both bounded and unbounded data. Includes hooks necessary to provide efficient parallelization (size estimation, progress information, dynamic splitting, etc). + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Support includes autotuning features (https://cloud.google.com/dataflow/service/dataflow-service-desc#autotuning-features). + - class: flink + l1: 'Yes' + l2: fully supported + l3: + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: + - class: spark-dataset + l1: 'Partially' + l2: bounded source only + l3: "Using Spark's DatasourceV2 API in microbatch mode (Continuous streaming mode is tagged experimental in spark and does not support aggregation)." + - class: apex + l1: 'Yes' + l2: fully supported + l3: + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'Partially' + l2: bounded source only + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + - name: Splittable DoFn (SDF) + values: + - class: model + l1: 'Partially' + l2: DoFn where processing of each element can be split for parallelism, or suspended and resumed + l3: Allows users to develop DoFn's that process a single element in portions ("restrictions"), executed in parallel or sequentially. This supersedes the unbounded and bounded `Source` APIs by supporting all of their features on a per-element basis. See http://s.apache.org/splittable-do-fn. Design is in progress on achieving parity with Source API regarding progress signals. + - class: dataflow + l1: 'Yes' + l2: + l3: Does not yet support autotuning features of the Source API. 
+ - class: flink + l1: 'Yes' + l2: + l3: + - class: spark-rdd + l1: 'Partially' + l2: supports bounded-per-element SDFs + l3: + - class: spark-dataset + l1: 'No' + l2: not implemented + l3: + - class: apex + l1: 'Partially' + l2: supports bounded-per-element SDFs + l3: implementation in streaming mode coming soon + - class: gearpump + l1: 'Partially' + l2: supports bounded-per-element SDFs + l3: + - class: mapreduce + l1: 'No' + l2: not implemented + l3: + - class: jstorm + l1: 'No' + l2: not implemented + l3: + - class: ibmstreams + l1: 'No' + l2: not implemented + l3: + - class: samza + l1: 'Partially' + l2: supports bounded-per-element SDFs + l3: + - class: nemo + l1: 'No' + l2: not implemented + l3: '' + - class: jet + l1: 'No' + l2: not implemented + l3: '' + - name: Metrics + values: + - class: model + l1: 'Partially' + l2: user-provided metrics + l3: Allow transforms to gather simple metrics across bundles in a PTransform. Provide a mechanism to obtain both committed and attempted metrics. Semantically similar to using an additional output, but support partial results as the transform executes, and support both committed and attempted values. Will likely want to augment Metrics to be more useful for processing unbounded data by making them windowed. + - class: dataflow + l1: 'Partially' + l2: '' + l3: Gauge metrics are not supported. All other metric types are supported. + - class: flink + l1: 'Partially' + l2: All metrics types are supported. + l3: Only attempted values are supported. No committed values for metrics. + - class: spark-rdd + l1: 'Partially' + l2: All metric types are supported. + l3: Only attempted values are supported. No committed values for metrics. + - class: spark-dataset + l1: 'Partially' + l2: All metric types are supported in batch mode. + l3: Only attempted values are supported. No committed values for metrics. + - class: apex + l1: 'No' + l2: Not implemented in runner. + l3: + - class: gearpump + l1: 'No' + l2: '' + l3: not implemented + - class: mapreduce + l1: 'Partially' + l2: Only attempted counters are supported + l3: '' + - class: jstorm + l1: 'Partially' + l2: Metrics are only supported in local mode. + l3: '' + - class: ibmstreams + l1: 'Partially' + l2: All metrics types are supported. + l3: Only attempted values are supported. No committed values for metrics. + - class: samza + l1: 'Partially' + l2: Counter and Gauge are supported. + l3: Only attempted values are supported. No committed values for metrics. + - class: nemo + l1: 'No' + l2: not implemented + l3: '' + - class: jet + l1: 'Partially' + l2: All metrics types supported, both in batching and streaming mode. + l3: Doesn't differentiate between committed and attempted values. + - name: Stateful Processing + values: + - class: model + l1: 'Yes' + l2: storage per key, per window + l3: Allows fine-grained access to per-key, per-window persistent state. Necessary for certain use cases (e.g. high-volume windows which store large amounts of data, but typically only access small portions of it; complex state machines; etc.) that are not easily or efficiently addressed via Combine or GroupByKey+ParDo. + - class: dataflow + l1: 'Partially' + l2: non-merging windows + l3: State is supported for non-merging windows. SetState and MapState are not yet supported. + - class: flink + l1: 'Partially' + l2: non-merging windows + l3: State is supported for non-merging windows. SetState and MapState are not yet supported. 
+ - class: spark-rdd + l1: 'Partially' + l2: full support in batch mode + l3: + - class: spark-dataset + l1: 'No' + l2: not implemented + l3: + - class: apex + l1: 'Partially' + l2: non-merging windows + l3: State is supported for non-merging windows. SetState and MapState are not yet supported. + - class: gearpump + l1: 'No' + l2: not implemented + l3: '' + - class: mapreduce + l1: 'Partially' + l2: non-merging windows + l3: '' + - class: jstorm + l1: 'Partially' + l2: non-merging windows + l3: '' + - class: ibmstreams + l1: 'Partially' + l2: non-merging windows + l3: '' + - class: samza + l1: 'Partially' + l2: non-merging windows + l3: 'States are backed up by either rocksDb KV store or in-memory hash map, and persist using changelog.' + - class: nemo + l1: 'No' + l2: not implemented + l3: '' + - class: jet + l1: 'Partially' + l2: non-merging windows + l3: '' + - description: Where in event time? + anchor: where + color-b: '37d' + color-y: '59f' + color-p: '8cf' + color-n: 'ddd' + rows: + - name: Global windows + values: + - class: model + l1: 'Yes' + l2: all time + l3: The default window which covers all of time. (Basically how traditional batch cases fit in the model.) + - class: dataflow + l1: 'Yes' + l2: default + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Fixed windows + values: + - class: model + l1: 'Yes' + l2: periodic, non-overlapping + l3: Fixed-size, timestamp-based windows. (Hourly, Daily, etc) + - class: dataflow + l1: 'Yes' + l2: built-in + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Sliding windows + values: + - class: model + l1: 'Yes' + l2: periodic, overlapping + l3: Possibly overlapping fixed-size timestamp-based windows (Every minute, use the last ten minutes of data.) 
+ - class: dataflow + l1: 'Yes' + l2: built-in + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Session windows + values: + - class: model + l1: 'Yes' + l2: activity-based + l3: Based on bursts of activity separated by a gap size. Different per key. + - class: dataflow + l1: 'Yes' + l2: built-in + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Custom windows + values: + - class: model + l1: 'Yes' + l2: user-defined windows + l3: All windows must implement BoundedWindow, which specifies a max timestamp. Each WindowFn assigns elements to an associated window. + - class: dataflow + l1: 'Yes' + l2: supported + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Custom merging windows + values: + - class: model + l1: 'Yes' + l2: user-defined merging windows + l3: A custom WindowFn additionally specifies whether and how to merge windows. 
+ - class: dataflow + l1: 'Yes' + l2: supported + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + - name: Timestamp control + values: + - class: model + l1: 'Yes' + l2: output timestamp for window panes + l3: For a grouping transform, such as GBK or Combine, an OutputTimeFn specifies (1) how to combine input timestamps within a window and (2) how to merge aggregated timestamps when windows merge. + - class: dataflow + l1: 'Yes' + l2: supported + l3: '' + - class: flink + l1: 'Yes' + l2: supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: supported + l3: '' + - class: mapreduce + l1: 'Yes' + l2: supported + l3: '' + - class: jstorm + l1: 'Yes' + l2: supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: supported + l3: '' + - class: samza + l1: 'Yes' + l2: supported + l3: '' + - class: nemo + l1: 'Yes' + l2: supported + l3: '' + - class: jet + l1: 'Yes' + l2: supported + l3: '' + + - description: When in processing time? + anchor: when + color-b: '6a4' + color-y: '8c6' + color-p: 'ae8' + color-n: 'ddd' + rows: + + - name: Configurable triggering + values: + - class: model + l1: 'Yes' + l2: user customizable + l3: Triggering may be specified by the user (instead of simply driven by hardcoded defaults). + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Fully supported in streaming mode. In batch mode, intermediate trigger firings are effectively meaningless. + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'No' + l2: '' + l3: '' + - class: mapreduce + l1: 'No' + l2: batch-only runner + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Event-time triggers + values: + - class: model + l1: 'Yes' + l2: relative to event time + l3: Triggers that fire in response to event-time completeness signals, such as watermarks progressing. + - class: dataflow + l1: 'Yes' + l2: yes in streaming, fixed granularity in batch + l3: Fully supported in streaming mode. In batch mode, currently watermark progress jumps from the beginning of time to the end of time once the input has been fully consumed, thus no additional triggering granularity is available. 
+ - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Processing-time triggers + values: + - class: model + l1: 'Yes' + l2: relative to processing time + l3: Triggers that fire in response to processing-time advancing. + - class: dataflow + l1: 'Yes' + l2: yes in streaming, fixed granularity in batch + l3: Fully supported in streaming mode. In batch mode, from the perspective of triggers, processing time currently jumps from the beginning of time to the end of time once the input has been fully consumed, thus no additional triggering granularity is available. + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: "This is Spark streaming's native model" + l3: "Spark processes streams in micro-batches. The micro-batch size is actually a pre-set, fixed, time interval. Currently, the runner takes the first window size in the pipeline and sets it's size as the batch interval. Any following window operations will be considered processing time windows and will affect triggering." + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'No' + l2: '' + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Count triggers + values: + - class: model + l1: 'Yes' + l2: every N elements + l3: Triggers that fire after seeing at least N elements. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Fully supported in streaming mode. In batch mode, elements are processed in the largest bundles possible, so count-based triggers are effectively meaningless. + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'No' + l2: '' + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: '[Meta]data driven triggers' + values: + - class: model + jira: BEAM-101 + l1: 'No' + l2: in response to data + l3: Triggers that fire in response to attributes of the data being processed. 
+ - class: dataflow + l1: 'No' + l2: pending model support + l3: + - class: flink + l1: 'No' + l2: pending model support + l3: + - class: spark-rdd + l1: 'No' + l2: pending model support + l3: + - class: spark-dataset + l1: 'No' + l2: pending model support + l3: + - class: apex + l1: 'No' + l2: pending model support + l3: + - class: gearpump + l1: 'No' + l2: pending model support + l3: + - class: mapreduce + l1: 'No' + l2: '' + l3: + - class: jstorm + l1: 'No' + l2: pending model support + l3: + - class: ibmstreams + l1: 'No' + l2: pending model support + l3: + - class: samza + l1: 'No' + l2: pending model support + l3: + - class: nemo + l1: 'No' + l2: pending model support + l3: '' + - class: jet + l1: 'No' + l2: pending model support + l3: '' + + - name: Composite triggers + values: + - class: model + l1: 'Yes' + l2: compositions of one or more sub-triggers + l3: Triggers which compose other triggers in more complex structures, such as logical AND, logical OR, early/on-time/late, etc. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: '' + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'No' + l2: '' + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Allowed lateness + values: + - class: model + l1: 'Yes' + l2: event-time bound on window lifetimes + l3: A way to bound the useful lifetime of a window (in event time), after which any unemitted results may be materialized, the window contents may be garbage collected, and any addtional late data that arrive for the window may be discarded. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Fully supported in streaming mode. In batch mode no data is ever late. + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'No' + l2: '' + l3: '' + - class: spark-dataset + l1: 'No' + l2: no streaming support in the runner + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Timers + values: + - class: model + l1: 'Yes' + l2: delayed processing callbacks + l3: A fine-grained mechanism for performing work at some point in the future, in either the event-time or processing-time domain. Useful for orchestrating delayed events, timeouts, etc in complex state per-key, per-window state machines. + - class: dataflow + l1: 'Partially' + l2: non-merging windows + l3: Dataflow supports timers in non-merging windows. + - class: flink + l1: 'Partially' + l2: non-merging windows + l3: The Flink Runner supports timers in non-merging windows. 
+ - class: spark-rdd + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: spark-dataset + l1: 'No' + l2: not implemented + l3: '' + - class: apex + l1: 'No' + l2: not implemented + l3: '' + - class: gearpump + l1: 'No' + l2: not implemented + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Partially' + l2: non-merging windows + l3: '' + - class: ibmstreams + l1: 'Partially' + l2: non-merging windows + l3: '' + - class: samza + l1: 'Partially' + l2: non-merging windows + l3: The Samza Runner supports timers in non-merging windows. + - class: nemo + l1: 'No' + l2: not implemented + l3: '' + - class: jet + l1: 'Partially' + l2: non-merging windows + l3: '' + + - description: How do refinements relate? + anchor: how + color-b: 'b55' + color-y: 'd77' + color-p: 'faa' + color-n: 'ddd' + rows: + + - name: Discarding + values: + - class: model + l1: 'Yes' + l2: panes discard elements when fired + l3: Elements are discarded from accumulated state as their pane is fired. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: '' + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'Yes' + l2: fully supported + l3: 'Spark streaming natively discards elements after firing.' + - class: spark-dataset + l1: 'Partially' + l2: fully supported in batch mode + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: '' + - class: gearpump + l1: 'Yes' + l2: fully supported + l3: '' + - class: mapreduce + l1: 'No' + l2: batch-only runner + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: Accumulating + values: + - class: model + l1: 'Yes' + l2: panes accumulate elements across firings + l3: Elements are accumulated in state across multiple pane firings for the same window. + - class: dataflow + l1: 'Yes' + l2: fully supported + l3: Requires that the accumulated pane fits in memory, after being passed through the combiner (if relevant) + - class: flink + l1: 'Yes' + l2: fully supported + l3: '' + - class: spark-rdd + l1: 'No' + l2: '' + l3: '' + - class: spark-dataset + l1: 'No' + l2: '' + l3: '' + - class: apex + l1: 'Yes' + l2: fully supported + l3: 'Size restriction, see combine support.' + - class: gearpump + l1: 'No' + l2: '' + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'Yes' + l2: fully supported + l3: '' + - class: ibmstreams + l1: 'Yes' + l2: fully supported + l3: '' + - class: samza + l1: 'Yes' + l2: fully supported + l3: '' + - class: nemo + l1: 'Yes' + l2: fully supported + l3: '' + - class: jet + l1: 'Yes' + l2: fully supported + l3: '' + + - name: 'Accumulating & Retracting' + values: + - class: model + jira: BEAM-91 + l1: 'No' + l2: accumulation plus retraction of old panes + l3: Elements are accumulated across multiple pane firings and old emitted values are retracted. 
Also known as "backsies" ;-D + - class: dataflow + l1: 'No' + l2: pending model support + l3: '' + - class: flink + l1: 'No' + l2: pending model support + l3: '' + - class: spark-rdd + l1: 'No' + l2: pending model support + l3: '' + - class: spark-dataset + l1: 'No' + l2: pending model support + l3: '' + - class: apex + l1: 'No' + l2: pending model support + l3: '' + - class: gearpump + l1: 'No' + l2: pending model support + l3: '' + - class: mapreduce + l1: 'No' + l2: '' + l3: '' + - class: jstorm + l1: 'No' + l2: pending model support + l3: '' + - class: ibmstreams + l1: 'No' + l2: pending model support + l3: '' + - class: samza + l1: 'No' + l2: pending model support + l3: '' + - class: nemo + l1: 'No' + l2: pending model support + l3: '' + - class: jet + l1: 'No' + l2: pending model support + l3: '' + - description: Additional common features not yet part of the Beam model + anchor: misc + color-b: 'aaa' + color-y: 'bbb' + color-p: 'ccc' + color-n: 'ddd' + rows: + - name: Drain + values: + - class: model + l1: 'Partially' + l2: + l3: APIs and semantics for draining a pipeline are under discussion. This would cause incomplete aggregations to be emitted regardless of trigger and tagged with metadata indicating it is incompleted. + - class: dataflow + l1: 'Partially' + l2: + l3: Dataflow has a native drain operation, but it does not work in the presence of event time timer loops. Final implemention pending model support. + - class: flink + l1: 'Partially' + l2: + l3: Flink supports taking a "savepoint" of the pipeline and shutting the pipeline down after its completion. + - class: spark-rdd + l1: + l2: + l3: + - class: spark-dataset + l1: + l2: + l3: + - class: apex + l1: + l2: + l3: + - class: gearpump + l1: + l2: + l3: + - class: mapreduce + l1: + l2: + l3: + - class: jstorm + l1: + l2: + l3: + - class: ibmstreams + l1: + l2: + l3: + - class: samza + l1: + l2: + l3: + - class: nemo + l1: + l2: + l3: + - name: Checkpoint + values: + - class: model + l1: 'Partially' + l2: + l3: APIs and semantics for saving a pipeline checkpoint are under discussion. This would be a runner-specific materialization of the pipeline state required to resume or duplicate the pipeline. + - class: dataflow + l1: 'No' + l2: + l3: + - class: flink + l1: 'Partially' + l2: + l3: Flink has a native savepoint capability. + - class: spark-rdd + l1: 'Partially' + l2: + l3: Spark has a native savepoint capability. + - class: spark-dataset + l1: 'No' + l2: + l3: not implemented + - class: apex + l1: + l2: + l3: + - class: gearpump + l1: + l2: + l3: + - class: mapreduce + l1: + l2: + l3: + - class: jstorm + l1: + l2: + l3: + - class: ibmstreams + l1: + l2: + l3: + - class: samza + l1: 'Partially' + l2: + l3: Samza has a native checkpoint capability. + - class: nemo + l1: + l2: + l3: + - class: jet + l1: + l2: + l3: --- # Beam Capability Matrix -Apache Beam provides a portable API layer for building sophisticated data-parallel processing pipelines that may be executed across a diversity of execution engines, or runners. The core concepts of this layer are based upon the Beam Model (formerly referred to as the [Dataflow Model](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf)), and implemented to varying degrees in each Beam runner. To help clarify the capabilities of individual runners, we've created the capability matrix below. +Apache Beam provides a portable API layer for building sophisticated data-parallel processing pipelines that may be executed across a diversity of execution engines, or runners. 
The core concepts of this layer are based upon the Beam Model (formerly referred to as the [Dataflow Model](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf)), and implemented to varying degrees in each Beam runner. To help clarify the capabilities of individual runners, we've created the capability matrix below. Individual capabilities have been grouped by their corresponding What / Where / When / How question: @@ -31,30 +1726,14 @@ Individual capabilities have been grouped by their corresponding When in processing time? - How do refinements of results relate? -For more details on the What / Where / When / How breakdown of concepts, we recommend reading through the Streaming 102 post on O'Reilly Radar. +For more details on the What / Where / When / How breakdown of concepts, we recommend reading through the Streaming 102 post on O'Reilly Radar. Note that in the future, we intend to add additional tables beyond the current set, for things like runtime characteristics (e.g. at-least-once vs exactly-once), performance, etc. -{% include capability-matrix-common.md %} -{% assign cap-data=site.data.capability-matrix %} -
+{{< capability-matrix-common >}} -{% assign cap-style='cap-summary' %} -{% assign cap-view='summary' %} -{% assign cap-other-view='full' %} -{% assign cap-toggle-details=1 %} -{% assign cap-display='block' %} - -{% include capability-matrix.md %} +{{< capability-matrix cap-data="capability-matrix" cap-style="cap-summary" cap-view="summary" cap-other-view="full" cap-toggle-details=1 cap-display="block" >}} -{% assign cap-style='cap' %} -{% assign cap-view='full' %} -{% assign cap-other-view='summary' %} -{% assign cap-toggle-details=0 %} -{% assign cap-display='none' %} - -{% include capability-matrix.md %} -
+{{< capability-matrix cap-data="capability-matrix" cap-style="cap" cap-view="full" cap-other-view="summary" cap-toggle-details=0 cap-display="none" >}} diff --git a/website/www/site/content/en/documentation/runners/dataflow.md b/website/www/site/content/en/documentation/runners/dataflow.md index f684b4d68a6e7..561e188dbec22 100644 --- a/website/www/site/content/en/documentation/runners/dataflow.md +++ b/website/www/site/content/en/documentation/runners/dataflow.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Cloud Dataflow Runner" -permalink: /documentation/runners/dataflow/ -section_menu: section-menu/runners.html -redirect_from: /learn/runners/dataflow/ +aliases: /learn/runners/dataflow/ --- # Using the Google Cloud Dataflow Runner - +{{< language-switcher java py >}} The Google Cloud Dataflow Runner uses the [Cloud Dataflow managed service](https://cloud.google.com/dataflow/service/dataflow-service-desc). When you run your pipeline with the Cloud Dataflow service, the runner uploads your executable code and dependencies to a Google Cloud Storage bucket and creates a Cloud Dataflow job, which executes your pipeline on managed resources in Google Cloud Platform. @@ -36,7 +28,7 @@ The Cloud Dataflow Runner and service are suitable for large scale, continuous j * [autoscaling](https://cloud.google.com/dataflow/service/dataflow-service-desc#autoscaling) of the number of workers throughout the lifetime of the job * [dynamic work rebalancing](https://cloud.google.com/blog/big-data/2016/05/no-shard-left-behind-dynamic-work-rebalancing-in-google-cloud-dataflow) -The [Beam Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) documents the supported capabilities of the Cloud Dataflow Runner. +The [Beam Capability Matrix](/documentation/runners/capability-matrix/) documents the supported capabilities of the Cloud Dataflow Runner. ## Cloud Dataflow Runner prerequisites and setup {#setup} @@ -57,38 +49,41 @@ for your chosen language. ### Specify your dependency {#dependency} When using Java, you must specify your dependency on the Cloud Dataflow Runner in your `pom.xml`. -```java +{{< highlight java >}} org.apache.beam beam-runners-google-cloud-dataflow-java - {{ site.release_latest }} + {{< param release_latest >}} runtime -``` +{{< /highlight >}} This section is not applicable to the Beam SDK for Python. ### Self executing JAR {#self-executing-jar} -{:.language-py} +{{< paragraph class="language-py" >}} This section is not applicable to the Beam SDK for Python. +{{< /paragraph >}} -{:.language-java} +{{< paragraph class="language-java" >}} In some cases, such as starting a pipeline using a scheduler such as [Apache AirFlow](https://airflow.apache.org), you must have a self-contained application. You can pack a self-executing JAR by explicitly adding the following dependency on the Project section of your pom.xml, in addition to the adding existing dependency shown in the previous section. +{{< /paragraph >}} -```java +{{< highlight java >}} org.apache.beam beam-runners-google-cloud-dataflow-java ${beam.version} runtime -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} Then, add the mainClass name in the Maven JAR plugin. +{{< /paragraph >}} -```java +{{< highlight java >}} org.apache.maven.plugins maven-jar-plugin @@ -103,25 +98,26 @@ Then, add the mainClass name in the Maven JAR plugin. 
-``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} After running mvn package, run ls target and you should see (assuming your artifactId is `beam-examples` and the version is 1.0.0) the following output. +{{< /paragraph >}} -```java +{{< highlight java >}} beam-examples-bundled-1.0.0.jar -``` +{{< /highlight >}} -{:.language-java} +{{< paragraph class="language-java" >}} To run the self-executing JAR on Cloud Dataflow, use the following command. +{{< /paragraph >}} -```java +{{< highlight java >}} java -jar target/beam-examples-bundled-1.0.0.jar \ --runner=DataflowRunner \ --project= \ - --region= \ --tempLocation=gs:///temp/ -``` +{{< /highlight >}} ## Pipeline options for the Cloud Dataflow Runner {#pipeline-options} @@ -147,12 +143,6 @@ java -jar target/beam-examples-bundled-1.0.0.jar \ If not set, defaults to the default project in the current environment. The default project is set via gcloud. - - region - The Google Compute Engine region to create the job. - If not set, defaults to the default region in the current environment. The default region is set via gcloud. - - streaming Whether streaming mode is enabled or disabled; true if enabled. Set to true if running pipelines with unbounded PCollections. @@ -210,8 +200,8 @@ java -jar target/beam-examples-bundled-1.0.0.jar \ See the reference documentation for the -[DataflowPipelineOptions](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.html) -[`PipelineOptions`](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.PipelineOptions) +[DataflowPipelineOptions](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.html) +[`PipelineOptions`](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.PipelineOptions) interface (and any subinterfaces) for additional pipeline configuration options. ## Additional information and caveats {#additional-info} diff --git a/website/www/site/content/en/documentation/runners/direct.md b/website/www/site/content/en/documentation/runners/direct.md index acfc6bb4dd5ce..5df3b69eb4042 100644 --- a/website/www/site/content/en/documentation/runners/direct.md +++ b/website/www/site/content/en/documentation/runners/direct.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Direct Runner" -permalink: /documentation/runners/direct/ -section_menu: section-menu/runners.html -redirect_from: /learn/runners/direct/ +aliases: /learn/runners/direct/ --- # Using the Direct Runner - +{{< language-switcher java py >}} The Direct Runner executes pipelines on your machine and is designed to validate that pipelines adhere to the Apache Beam model as closely as possible. Instead of focusing on efficient pipeline execution, the Direct Runner performs additional checks to ensure that users do not rely on semantics that are not guaranteed by the model. Some of these checks include: @@ -40,11 +32,11 @@ Using the Direct Runner for testing and development helps ensure that pipelines Here are some resources with information about how to test your pipelines. ## Direct Runner prerequisites and setup @@ -52,14 +44,14 @@ Here are some resources with information about how to test your pipelines. 
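As a quick illustration of the testing workflow described above, the following minimal sketch runs a small pipeline on the Direct Runner and checks its output with `assert_that`. It assumes the Beam Python SDK is installed; the element values and the transform are placeholders rather than anything taken from this page.

```py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.util import assert_that, equal_to

# The Direct Runner is the default, but it is named explicitly here for clarity.
options = PipelineOptions(["--runner=DirectRunner"])

with beam.Pipeline(options=options) as p:
    results = (
        p
        | beam.Create(["a", "b", "c"])     # small in-memory test data
        | beam.Map(lambda x: x.upper()))   # the transform under test
    # assert_that fails the pipeline if the output does not match the expectation.
    assert_that(results, equal_to(["A", "B", "C"]))
```

Because the Direct Runner enforces model constraints such as element immutability and encodability, a pipeline that passes a test like this is more likely to behave consistently on the production runners documented on these pages.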
### Specify your dependency When using Java, you must specify your dependency on the Direct Runner in your `pom.xml`. -```java +{{< highlight java >}} <dependency> <groupId>org.apache.beam</groupId> <artifactId>beam-runners-direct-java</artifactId> - <version>{{ site.release_latest }}</version> + <version>{{< param release_latest >}}</version> <scope>runtime</scope> </dependency> -``` +{{< /highlight >}} This section is not applicable to the Beam SDK for Python. @@ -68,15 +60,15 @@ Here are some resources with information about how to test your pipelines. When executing your pipeline from the command-line, set `runner` to `direct` or `DirectRunner`. The default values for the other pipeline options are generally sufficient. See the reference documentation for the -[`DirectOptions`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/runners/direct/DirectOptions.html) -[`DirectOptions`](https://beam.apache.org/releases/pydoc/{{ site.release_latest }}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.DirectOptions) +[`DirectOptions`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/runners/direct/DirectOptions.html) +[`DirectOptions`](https://beam.apache.org/releases/pydoc/{{< param release_latest >}}/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.DirectOptions) interface for defaults and additional pipeline configuration options. ## Additional information and caveats ### Memory considerations -Local execution is limited by the memory available in your local environment. It is highly recommended that you run your pipeline with data sets small enough to fit in local memory. You can create a small in-memory data set using a [`Create`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/transforms/Create.html)[`Create`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/core.py) transform, or you can use a [`Read`](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/io/Read.html)[`Read`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/io/iobase.py) transform to work with small local or remote files. +Local execution is limited by the memory available in your local environment. It is highly recommended that you run your pipeline with data sets small enough to fit in local memory. You can create a small in-memory data set using a [`Create`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/transforms/Create.html)[`Create`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/transforms/core.py) transform, or you can use a [`Read`](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/io/Read.html)[`Read`](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/io/iobase.py) transform to work with small local or remote files. ### Streaming execution @@ -88,7 +80,10 @@ Python [FnApiRunner](https://beam.apache.org/contribute/runner-guide/#the-fn-api Setting parallelism -Number of threads or subprocesses is defined by setting the `direct_num_workers` option. +Number of threads or subprocesses is defined by setting the `direct_num_workers` option. +From 2.22.0, `direct_num_workers = 0` is supported. When `direct_num_workers` is set to 0, it will set the number of threads/subprocesses to the number of cores of the machine where the pipeline is running.
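One way to set it, sketched below, is directly in the pipeline options; the worker count used here is purely illustrative, and the other ways are listed next.

```py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Request 4 local workers (threads or subprocesses, per the explanation above).
options = PipelineOptions(["--direct_num_workers=4"])

with beam.Pipeline(options=options) as p:
    ...
```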
+ +There are several ways to set this option. * Passing through CLI when executing a pipeline. ``` diff --git a/website/www/site/content/en/documentation/runners/flink.md b/website/www/site/content/en/documentation/runners/flink.md index b7152dbd8e94e..1431424154359 100644 --- a/website/www/site/content/en/documentation/runners/flink.md +++ b/website/www/site/content/en/documentation/runners/flink.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Apache Flink Runner" -section_menu: section-menu/runners.html -permalink: /documentation/runners/flink/ -redirect_from: /learn/runners/flink/ +aliases: /learn/runners/flink/ --- - +{{< paragraph class="language-java" >}} If you have a Flink `JobManager` running on your local machine you can provide `localhost:8081` for `flinkMaster`. Otherwise an embedded Flink cluster will be started for the job. - - - +{{< /paragraph >}} Starting with Beam 2.18.0, pre-built Docker images are available at Docker Hub. @@ -297,18 +283,20 @@ Beam SDK: To run a pipeline on an embedded Flink cluster: -1. Start the JobService endpoint: `docker run --net=host apachebeam/flink1.9_job_server:latest` - +{{< paragraph class="language-py" >}} +1. Start the JobService endpoint: `docker run --net=host apachebeam/flink1.9_job_server:latest` +{{< /paragraph >}} - +{{< paragraph class="language-py" >}} The JobService is the central instance where you submit your Beam pipeline to. The JobService will create a Flink job for the pipeline and execute the job. - +{{< /paragraph >}} -2. Submit the Python pipeline to the above endpoint by using the `PortableRunner`, `job_endpoint` set to `localhost:8099` (this is the default address of the JobService), and `environment_type` set to `LOOPBACK`. For example: - +{{< paragraph class="language-py" >}} +2. Submit the Python pipeline to the above endpoint by using the `PortableRunner`, `job_endpoint` set to `localhost:8099` (this is the default address of the JobService), and `environment_type` set to `LOOPBACK`. For example: +{{< /paragraph >}} -```py +{{< highlight py >}} import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions @@ -319,28 +307,32 @@ options = PipelineOptions([ ]) with beam.Pipeline(options) as p: ... -``` +{{< /highlight >}} - +{{< paragraph class="language-py" >}} To run on a separate [Flink cluster](https://ci.apache.org/projects/flink/flink-docs-release-1.9/getting-started/tutorials/local_setup.html): - +{{< /paragraph >}} -1. Start a Flink cluster which exposes the Rest interface on `localhost:8081` by default. - +{{< paragraph class="language-py" >}} +1. Start a Flink cluster which exposes the Rest interface on `localhost:8081` by default. +{{< /paragraph >}} -2. Start JobService with Flink Rest endpoint: `docker run --net=host apachebeam/flink1.9_job_server:latest --flink-master=localhost:8081`. - +{{< paragraph class="language-py" >}} +2. Start JobService with Flink Rest endpoint: `docker run --net=host apachebeam/flink1.9_job_server:latest --flink-master=localhost:8081`. +{{< /paragraph >}} -3. Submit the pipeline as above. +{{< paragraph class="language-py" >}} +3. Submit the pipeline as above. Note however that `environment_type=LOOPBACK` is only intended for local testing. -See [here]({{ site.baseurl }}/documentation/runtime/sdk-harness-config/) for details. - +See [here](/roadmap/portability/#sdk-harness-config) for details. 
+{{< /paragraph >}} -Steps 2 and 3 can be automated in Python by using the `FlinkRunner`, +{{< paragraph class="language-py" >}} +Steps 2 and 3 can be automated in Python by using the `FlinkRunner`, plus the optional `flink_version` and `flink_master` options, e.g.: - +{{< /paragraph >}} -```py +{{< highlight py >}} import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions @@ -352,7 +344,7 @@ options = PipelineOptions([ ]) with beam.Pipeline(options=options) as p: ... -``` +{{< /highlight >}} ## Additional information and caveats @@ -372,26 +364,30 @@ Many sources like `PubSubIO` rely on their checkpoints to be acknowledged which When executing your pipeline with the Flink Runner, you can set these pipeline options. The following list of Flink-specific pipeline options is generated automatically from the -[FlinkPipelineOptions](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/runners/flink/FlinkPipelineOptions.html) +[FlinkPipelineOptions](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/runners/flink/FlinkPipelineOptions.html) reference class:
-{% include flink_java_pipeline_options.html %} + +{{< flink-java-pipeline-options >}} +
+
-{% include flink_python_pipeline_options.html %} + +{{< flink-python-pipeline-options >}} +
For general Beam pipeline options see the -[PipelineOptions](https://beam.apache.org/releases/javadoc/{{ site.release_latest }}/index.html?org/apache/beam/sdk/options/PipelineOptions.html) +[PipelineOptions](https://beam.apache.org/releases/javadoc/{{< param release_latest >}}/index.html?org/apache/beam/sdk/options/PipelineOptions.html) reference. ## Capability -The [Beam Capability Matrix]({{ site.baseurl -}}/documentation/runners/capability-matrix/) documents the +The [Beam Capability Matrix](/documentation/runners/capability-matrix/) documents the capabilities of the classic Flink Runner. The [Portable Capability diff --git a/website/www/site/content/en/documentation/runners/gearpump.md b/website/www/site/content/en/documentation/runners/gearpump.md index 1c104c32b03cd..638b42a4234f5 100644 --- a/website/www/site/content/en/documentation/runners/gearpump.md +++ b/website/www/site/content/en/documentation/runners/gearpump.md @@ -1,8 +1,6 @@ --- -layout: section +type: runners title: "Apache Gearpump (incubating) Runner" -section_menu: section-menu/runners.html -permalink: /documentation/runners/gearpump/ --- # Using the Apache Hadoop MapReduce Runner -The Apache Hadoop MapReduce Runner can be used to execute Beam pipelines using [Apache Hadoop](https://hadoop.apache.org/). +The Apache Hadoop MapReduce Runner can be used to execute Beam pipelines using [Apache Hadoop](http://hadoop.apache.org/). -The [Beam Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) documents the currently supported capabilities of the Apache Hadoop MapReduce Runner. +The [Beam Capability Matrix](/documentation/runners/capability-matrix/) documents the currently supported capabilities of the Apache Hadoop MapReduce Runner. ## Apache Hadoop MapReduce Runner prerequisites and setup You need to have an Apache Hadoop environment with either [Single Node Setup](https://hadoop.apache.org/docs/r1.2.1/single_node_setup.html) or [Cluster Setup](https://hadoop.apache.org/docs/r1.2.1/cluster_setup.html) @@ -33,7 +31,7 @@ You can add a dependency on the latest version of the Apache Hadoop MapReduce ru org.apache.beam beam-runners-mapreduce - {{ site.release_latest }} + {{< param release_latest >}} ``` @@ -50,7 +48,7 @@ $ mvn exec:java -Dexec.mainClass=org.apache.beam.examples.WordCount \ To execute in a Hadoop cluster, package your program along with all dependencies in a fat jar. -If you are following through the [Beam Java SDK Quickstart]({{ site.baseurl }}/get-started/quickstart-java/), you can run this command: +If you are following through the [Beam Java SDK Quickstart](/get-started/quickstart-java/), you can run this command: ``` $ mvn package -Pflink-runner ``` diff --git a/website/www/site/content/en/documentation/runners/nemo.md b/website/www/site/content/en/documentation/runners/nemo.md index db3e50cf0d5b6..93287524ac2f8 100644 --- a/website/www/site/content/en/documentation/runners/nemo.md +++ b/website/www/site/content/en/documentation/runners/nemo.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Apache Nemo Runner" -section_menu: section-menu/runners.html -permalink: /documentation/runners/nemo/ -redirect_from: /learn/runners/nemo/ +aliases: /learn/runners/nemo/ --- # Using the Apache Nemo Runner -The Apache Nemo Runner can be used to execute Beam pipelines using [Apache Nemo](https://nemo.apache.org). +The Apache Nemo Runner can be used to execute Beam pipelines using [Apache Nemo](http://nemo.apache.org). 
The Nemo Runner can optimize Beam pipelines with the Nemo compiler through various optimization passes and execute them in a distributed fashion using the Nemo runtime. You can also deploy a self-contained application for local mode or run using resource managers like YARN or Mesos. @@ -32,7 +30,7 @@ The Nemo Runner executes Beam pipelines on top of Apache Nemo, providing: * Integration with YARN and other components of the Apache Hadoop ecosystem * Support for the various optimizations provided by the Nemo optimizer -The [Beam Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) documents the +The [Beam Capability Matrix](/documentation/runners/capability-matrix/) documents the supported capabilities of the Nemo Runner. ## Nemo Runner prerequisites and setup diff --git a/website/www/site/content/en/documentation/runners/samza.md b/website/www/site/content/en/documentation/runners/samza.md index 4ead79cf09a19..8e8923efaffde 100644 --- a/website/www/site/content/en/documentation/runners/samza.md +++ b/website/www/site/content/en/documentation/runners/samza.md @@ -1,9 +1,7 @@ --- -layout: section +type: runners title: "Apache Samza Runner" -section_menu: section-menu/runners.html -permalink: /documentation/runners/samza/ -redirect_from: /learn/runners/Samza/ +aliases: /learn/runners/Samza/ --- # Using the Apache Spark Runner -The Apache Spark Runner can be used to execute Beam pipelines using [Apache Spark](https://spark.apache.org/). +The Apache Spark Runner can be used to execute Beam pipelines using [Apache Spark](http://spark.apache.org/). The Spark Runner can execute Spark pipelines just like a native Spark application; deploying a self-contained application for local mode, running on Spark's Standalone RM, or using YARN or Mesos. The Spark Runner executes Beam pipelines on top of Apache Spark, providing: * Batch and streaming (and combined) pipelines. -* The same fault-tolerance [guarantees](https://spark.apache.org/docs/latest/streaming-programming-guide.html#fault-tolerance-semantics) as provided by RDDs and DStreams. -* The same [security](https://spark.apache.org/docs/latest/security.html) features Spark provides. +* The same fault-tolerance [guarantees](http://spark.apache.org/docs/latest/streaming-programming-guide.html#fault-tolerance-semantics) as provided by RDDs and DStreams. +* The same [security](http://spark.apache.org/docs/latest/security.html) features Spark provides. * Built-in metrics reporting using Spark's metrics system, which reports Beam Aggregators as well. * Native support for Beam side-inputs via spark's Broadcast variables. -The [Beam Capability Matrix]({{ site.baseurl }}/documentation/runners/capability-matrix/) documents the currently supported capabilities of the Spark Runner. +The [Beam Capability Matrix](/documentation/runners/capability-matrix/) documents the currently supported capabilities of the Spark Runner. ## Three flavors of the Spark runner The Spark runner comes in three flavors: @@ -55,7 +53,7 @@ pipelines written in other languages. If your applications only use Java, then you should currently go with one of the java based runners. If you want to run Python or Go pipelines with Beam on Spark, you need to use the portable Runner. For more information on portability, please visit the -[Portability page]({{site.baseurl }}/roadmap/portability/). +[Portability page](/roadmap/portability/).
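For completeness, submitting a Python pipeline to the portable Spark runner follows the same pattern shown for the Flink runner above. The sketch below assumes a Beam job service backed by Spark is already running and listening on `localhost:8099`; that address, and how the job service itself is started, are assumptions for illustration rather than details taken from this page.

```py
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    "--runner=PortableRunner",
    # Address of a running Beam job service backed by Spark (illustrative).
    "--job_endpoint=localhost:8099",
    # LOOPBACK is only intended for local testing, as noted for the Flink runner.
    "--environment_type=LOOPBACK",
])

with beam.Pipeline(options=options) as p:
    ...
```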