{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
   "metadata": {},
   "source": [
    "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirements file that includes the right release. Example for transform developers working from git clone:\n",
    "```\n",
    "make venv \n",
    "source venv/bin/activate \n",
    "pip install jupyterlab\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "## This is here as a reference only\n",
    "# Users and application developers must use the right tag for the latest from pypi\n",
    "# (%pip installs into the environment of the running kernel)\n",
    "#%pip install data-prep-toolkit\n",
    "#%pip install data-prep-toolkit-transforms\n",
    "#%pip install data-prep-connector"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "##### **** Configure the transform parameters. The set of dictionary keys holding DocQualityTransform configuration values is as follows: \n",
    "* text_lang - specifies the language used in the text content. By default, \"en\" is used.\n",
    "* doc_content_column - specifies the name of the column that contains the document text. By default, \"contents\" is used.\n",
    "* bad_word_filepath - specifies the local path (file or directory) of the bad word file. You don't have to set this parameter if you don't need to set bad words.\n",
    "#####"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebf1f782-0e61-485c-8670-81066beb734c",
   "metadata": {},
   "source": [
    "##### ***** Import required classes and modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import glob\n",
    "import os\n",
    "import sys\n",
    "\n",
    "from data_processing.runtime.pure_python import PythonTransformLauncher\n",
    "from data_processing.utils import ParamsUtils\n",
    "from doc_quality_transform import (\n",
    "    bad_word_filepath_cli_param,\n",
    "    doc_content_column_cli_param,\n",
    "    text_lang_cli_param,\n",
    ")\n",
    "from doc_quality_transform_python import DocQualityPythonTransformConfiguration"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
   "metadata": {},
   "source": [
    "##### ***** Set up runtime parameters for this transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e90a853e-412f-45d7-af3d-959e755aeebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create parameters\n",
    "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n",
    "output_folder = os.path.join(\"python\", \"output\")\n",
    "local_conf = {\n",
    "    \"input_folder\": input_folder,\n",
    "    \"output_folder\": output_folder,\n",
    "}\n",
    "# Placeholder code-location values for this example; it is passed below as\n",
    "# \"runtime_code_location\" and must be defined before `params` is built.\n",
    "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n",
    "params = {\n",
    "    # Data access. Only required parameters are specified\n",
    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
    "    # execution info\n",
    "    \"runtime_pipeline_id\": \"pipeline_id\",\n",
    "    \"runtime_job_id\": \"job_id\",\n",
    "    \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n",
    "    # doc_quality params\n",
    "    text_lang_cli_param: \"en\",\n",
    "    doc_content_column_cli_param: \"contents\",\n",
    "    bad_word_filepath_cli_param: os.path.join(\"python\", \"ldnoobw\", \"en\"),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
   "metadata": {},
   "source": [
    "##### ***** Use python runtime to invoke the transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0775e400-7469-49a6-8998-bd4772931459",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# The launcher reads its configuration from the command line, so expose `params` as argv.\n",
    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
    "launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())\n",
    "launcher.launch()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
   "metadata": {},
   "source": [
    "##### **** The output folder will contain the transformed parquet files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7276fe84-6512-4605-ab65-747351e13a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List whatever the transform wrote to the configured output folder.\n",
    "glob.glob(os.path.join(output_folder, \"*\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}