apache · svetakvsundhar · Jun 26, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/examples/notebooks/blogposts/unittests_in_beam.ipynb b/examples/notebooks/blogposts/unittests_in_beam.ipynb
@@ -0,0 +1,259 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyP+whTO0l5Xd2TU4xa2Z7KC",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/apache/beam/blob/testing_blog_post/examples/notebooks/blogposts/unittests_in_beam.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 36,
+      "metadata": {
+        "id": "7DSE6TgWy7PP"
+      },
+      "outputs": [],
+      "source": [
+        "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n",
+        "\n",
+        "# Licensed to the Apache Software Foundation (ASF) under one\n",
+        "# or more contributor license agreements. See the NOTICE file\n",
+        "# distributed with this work for additional information\n",
+        "# regarding copyright ownership. The ASF licenses this file\n",
+        "# to you under the Apache License, Version 2.0 (the\n",
+        "# \"License\"); you may not use this file except in compliance\n",
+        "# with the License. You may obtain a copy of the License at\n",
+        "#\n",
+        "#   http://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing,\n",
+        "# software distributed under the License is distributed on an\n",
+        "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
+        "# KIND, either express or implied. See the License for the\n",
+        "# specific language governing permissions and limitations\n",
+        "# under the License"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the Apache Beam library (with its GCP extras).\n",
+        "# %pip installs into the environment of the running kernel, and the\n",
+        "# requirement is quoted so the shell cannot treat [gcp] as a glob pattern.\n",
+        "\n",
+        "%pip install --quiet 'apache_beam[gcp]'"
+      ],
+      "metadata": {
+        "id": "5W2nuV7uzlPg"
+      },
+      "execution_count": 37,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# The following packages are used to run the example pipelines.\n",
+        "\n",
+        "import apache_beam as beam\n",
+        "from apache_beam.io import ReadFromText, WriteToText\n",
+        "from apache_beam.options.pipeline_options import PipelineOptions\n",
+        "\n",
+        "class CustomClass(beam.DoFn):\n",
+        "  \"\"\"DoFn subclass that holds custom_function for the example pipeline below.\"\"\"\n",
+        "\n",
+        "  @staticmethod\n",
+        "  def custom_function(x):\n",
+        "    \"\"\"Returns the record `x` unchanged.\n",
+        "\n",
+        "    The commented-out lines sketch an external API call and a length check on\n",
+        "    its response; both are disabled in this example.\n",
+        "\n",
+        "    Declared as a static method because it takes no `self` and is invoked as\n",
+        "    `CustomClass.custom_function(x)`.\n",
+        "    \"\"\"\n",
+        "    ...\n",
+        "    # returned_record = requests.get(\"http://my-api-call.com\")\n",
+        "    ...\n",
+        "    # if len(returned_record)!=10:\n",
+        "    # raise ValueError(\"Length of record does not match expected length\")\n",
+        "    return x\n",
+        "\n",
+        "\n",
+        "# Build and run the pipeline at module level, after the class statement has\n",
+        "# completed, so the name CustomClass is bound when the lambda looks it up.\n",
+        "# beam.Map is used because custom_function returns a single record per input\n",
+        "# rather than an iterable of outputs.\n",
+        "with beam.Pipeline() as p:\n",
+        "  result = (\n",
+        "      p\n",
+        "      | ReadFromText(\"/content/sample_data/anscombe.json\")\n",
+        "      | beam.Map(lambda x: CustomClass.custom_function(x))\n",
+        "      | WriteToText(\"/content/\")\n",
+        "  )"
+      ],
+      "metadata": {
+        "id": "Ktk9EVIFzGfP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Example Pipeline 1**\n"
+      ],
+      "metadata": {
+        "id": "IVjBkewt1sLA"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Returns the square of the integer found at index 1 of the record.\n",
+        "def compute_square(element):\n",
+        "  \"\"\"Squares `int(element[1])`, i.e. the item at index 1 of `element`.\"\"\"\n",
+        "  value = int(element[1])\n",
+        "  return value * value\n",
+        "\n",
+        "# Read the CSV without its header line, square each record with\n",
+        "# compute_square, and write the results as text.\n",
+        "# NOTE(review): the output prefix ends in '/', so the shard files presumably\n",
+        "# get an empty base name -- confirm this is the intended location.\n",
+        "with beam.Pipeline() as p1:\n",
+        "  result = (\n",
+        "      p1\n",
+        "      | ReadFromText(\n",
+        "          \"/content/sample_data/california_housing_test.csv\",\n",
+        "          skip_header_lines=1)\n",
+        "      | beam.Map(compute_square)\n",
+        "      | WriteToText(\"/content/\"))"
+      ],
+      "metadata": {
+        "id": "oHbSvOUI1pOe"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Example Pipeline 2**"
+      ],
+      "metadata": {
+        "id": "Mh3nZZ1_12sX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Read the JSON file as lines of text, strip the surrounding whitespace\n",
+        "# from each line, and write the stripped lines as text.\n",
+        "with beam.Pipeline() as p2:\n",
+        "  result = (\n",
+        "      p2\n",
+        "      | ReadFromText(\"/content/sample_data/anscombe.json\")\n",
+        "      | beam.Map(str.strip)\n",
+        "      | WriteToText(\"/content/sample_data/\"))"
+      ],
+      "metadata": {
+        "id": "hmO1Chl51vPG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Unit Tests for Pipelines**"
+      ],
+      "metadata": {
+        "id": "uoNJLQl_15gj"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# The following packages are imported for unit testing.\n",
+        "import unittest\n",
+        "import apache_beam as beam\n",
+        "from apache_beam.testing.test_pipeline import TestPipeline\n",
+        "from apache_beam.testing.util import assert_that, equal_to\n",
+        "try:\n",
+        "  from apitools.base.py.exceptions import HttpError\n",
+        "except ImportError:\n",
+        "  HttpError = None\n",
+        "\n",
+        "\n",
+        "@unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')\n",
+        "class TestBeam(unittest.TestCase):\n",
+        "  \"\"\"Unit tests for the example pipelines; skipped when HttpError could not be imported.\"\"\"\n",
+        "\n",
+        "  # Corresponds to pipeline p1: confirms that compute_square works as intended.\n",
+        "  def test_compute_square(self):\n",
+        "    expected_squares = [4]\n",
+        "    with TestPipeline() as test_pipeline:\n",
+        "      squares = (\n",
+        "          test_pipeline\n",
+        "          | beam.Create([\"1234\"])\n",
+        "          | beam.Map(compute_square))\n",
+        "      assert_that(squares, equal_to(expected_squares))"
+      ],
+      "metadata": {
+        "id": "3-twYhdLTan0"
+      },
+      "execution_count": 41,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# This test corresponds to pipeline p2, and is written to confirm the pipeline works as intended.\n",
+        "def test_strip_map(self):\n",
+        "  # The inputs carry surrounding spaces and a trailing newline escape that\n",
+        "  # str.strip is expected to remove.\n",
+        "  expected = ['Strawberry', 'Carrot', 'Eggplant']\n",
+        "  strings = [' Strawberry   \\n', '   Carrot   \\n', '   Eggplant   \\n']\n",
+        "  with TestPipeline() as p:\n",
+        "    output = (\n",
+        "        p\n",
+        "        | beam.Create(strings)\n",
+        "        | beam.Map(str.strip))\n",
+        "    assert_that(output, equal_to(expected))\n",
+        "\n",
+        "\n",
+        "# test_strip_map takes `self`, so unittest only collects it once it is an\n",
+        "# attribute of a TestCase; bind it to the TestBeam class defined earlier.\n",
+        "TestBeam.test_strip_map = test_strip_map"
+      ],
+      "metadata": {
+        "id": "BU9Eil-TrtpE"
+      },
+      "execution_count": 42,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Mocking Example**"
+      ],
+      "metadata": {
+        "id": "58GVMyMa2PwE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the 'mock' module into the environment of the running kernel.\n",
+        "%pip install mock"
+      ],
+      "metadata": {
+        "id": "ESclJ_G-6JcW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# We import the mock package for mocking functionality.\n",
+        "import mock\n",
+        "\n",
+        "@mock.patch.object(CustomClass, 'custom_function')\n",
+        "def test_error_message_wrong_length(self, get_record):\n",
+        "  \"\"\"Expects a ValueError about record length while custom_function is patched.\n",
+        "\n",
+        "  Args:\n",
+        "    self: object providing assertRaisesRegex (a unittest.TestCase).\n",
+        "    get_record: the mock injected by mock.patch.object in place of\n",
+        "      CustomClass.custom_function.\n",
+        "  \"\"\"\n",
+        "  record = [\"field1\",\"field2\"]\n",
+        "  # The injected mock is the object CustomClass.custom_function resolves to\n",
+        "  # while the patch is active, so configure it directly.\n",
+        "  get_record.return_value = record\n",
+        "  # The pattern mirrors the expected error text exactly, with no extra\n",
+        "  # characters.\n",
+        "  # NOTE(review): with custom_function patched, the call below returns\n",
+        "  # `record` and nothing visible here raises ValueError or runs the\n",
+        "  # pipeline -- confirm where the length check is meant to live.\n",
+        "  with self.assertRaisesRegex(ValueError,\n",
+        "                              \"Length of record does not match expected length\"):\n",
+        "      p = beam.Pipeline()\n",
+        "      result = p | beam.ParDo(CustomClass.custom_function())\n",
+        "      result"
+      ],
+      "metadata": {
+        "id": "IRuv8s8a2O8F"
+      },
+      "execution_count": 44,
+      "outputs": []
+    }
+  ]
+}