diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb new file mode 100644 index 000000000..bf91047b6 --- /dev/null +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirements file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit\n", + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The dictionary keys holding the DocQualityTransform configuration values are as follows: \n", + "* text_lang - specifies language used in the text content. By default, \"en\" is used.\n", + "* doc_content_column - specifies column name that contains document text. By default, \"contents\" is used.\n", + "* bad_word_filepath - specifies a path to bad word file: local folder (file or directory) that points to bad word file. 
You don't have to set this parameter if you don't need to set bad words.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_quality_transform import (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n", + "from doc_quality_transform_python import DocQualityPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # execution info\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", + " # doc_quality params\n", + " text_lang_cli_param: \"en\",\n", + " doc_content_column_cli_param: \"contents\",\n", + " bad_word_filepath_cli_param: os.path.join(\"python\", \"ldnoobw\", \"en\"),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12:39:07 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "12:39:07 INFO - pipeline id pipeline_id\n", + "12:39:07 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "12:39:07 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "12:39:07 INFO - data factory data_ max_files -1, n_sample -1\n", + "12:39:07 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:39:07 INFO - orchestrator docq started at 2024-11-25 12:39:07\n", + "12:39:07 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "12:39:07 INFO - Load badwords found locally from python/ldnoobw/en\n", + "12:39:09 INFO - Completed 1 files (100.0%) in 0.033 
min\n", + "12:39:09 INFO - Done processing 1 files, waiting for flush() completion.\n", + "12:39:09 INFO - done flushing in 0.0 sec\n", + "12:39:09 INFO - Completed execution in 0.033 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())\n", + "launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 38421f34f..c10bc4b88 100644 --- a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -1,13 +1,25 @@ # Document Quality Transform + Please see the set of [transform project 
conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. -## Summary -This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +## Contributors + +- Daiki Tsuzuku (dtsuzuku@jp.ibm.com) + +## Description +This transform will calculate and annotate several metrics which are useful to assess the quality of the document. +The document quality transform operates on text documents only. + +### Input -In this transform, following metrics will be included: +| input column name | data type | description | +|-|-|-| +| the one specified in _doc_content_column_ configuration | string | text whose quality will be calculated by this transform | + +### Output columns annotated by this transform | output column name | data type | description | supported language | |-|-|-|-| @@ -27,7 +39,7 @@ In this transform, following metrics will be included: You can see more detailed backgrounds of some columns in [Deepmind's Gopher paper](https://arxiv.org/pdf/2112.11446.pdf) -## Configuration and command line Options +## Configuration The set of dictionary keys holding [DocQualityTransform](src/doc_quality_transform.py) configuration for values are as follows: @@ -36,13 +48,19 @@ configuration for values are as follows: * _doc_content_column_ - specifies column name that contains document text. By default, "contents" is used. * _bad_word_filepath_ - specifies a path to bad word file: local folder (file or directory) that points to bad word file. You don't have to set this parameter if you don't need to set bad words. -## Running +Example: +``` +{ + text_lang_key: "en", + doc_content_column_key: "contents", + bad_word_filepath_key: os.path.join(basedir, "ldnoobw", "en"), +} +``` + +## Usage ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. 
TransformLauncher), -the following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). +The following command line arguments are available: ``` --docq_text_lang DOCQ_TEXT_LANG language used in the text content. By default, "en" is used. --docq_doc_content_column DOCQ_DOC_CONTENT_COLUMN column name that contain document text. By default, "contents" is used. @@ -70,6 +88,9 @@ ls output ``` To see results of the transform. +### Code example + +[notebook](../doc_quality.ipynb) ### Transforming data using the transform image @@ -77,7 +98,27 @@ To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. +## Testing + +Tests follow [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md). + +Currently we have: +- [Unit test](test/test_doc_quality_python.py) +- [Integration test](test/test_doc_quality.py) + + +## Further Resources + +- For those who want to learn about the C4 heuristic rules + - https://arxiv.org/pdf/1910.10683.pdf +- For those who want to learn about the Gopher statistics + - https://arxiv.org/pdf/2112.11446.pdf +- For those who want to see the source of badwords used by default + - https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words + + +## Considerations -## Troubleshooting guide +### Troubleshooting guide For M1 Mac user, if you see following error during make command, `error: command '/usr/bin/clang' failed with exit code 1`, you may better follow [this step](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac) \ No newline at end of file