From 71aedab1ec4167da6dce323bd27ffd51b39c9384 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Tue, 29 Oct 2024 13:00:30 -0700
Subject: [PATCH] add remaining docs

Signed-off-by: Sarah Yurick
---
 docs/user-guide/cpuvsgpu.rst               |  4 +-
 examples/README.md                         | 22 ++++-
 examples/slurm/README.md                   | 10 ++-
 nemo_curator/modules/dataset_ops.py        |  2 +-
 nemo_curator/scripts/README.md             | 30 ++++++-
 nemo_curator/scripts/classifiers/README.md | 93 +++++++++++++++++++++-
 6 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/docs/user-guide/cpuvsgpu.rst b/docs/user-guide/cpuvsgpu.rst
index 7bb5858b..8d8cd64a 100644
--- a/docs/user-guide/cpuvsgpu.rst
+++ b/docs/user-guide/cpuvsgpu.rst
@@ -88,9 +88,9 @@ Dask with Slurm
 We provide an example Slurm script pipeline in ``examples/slurm``.
 This pipeline has a script ``start-slurm.sh`` that provides configuration options similar to what ``get_client`` provides.
 Every Slurm cluster is different, so make sure you understand how your Slurm cluster works so the scripts can be easily adapted.
-``start-slurm.sh`` calls ``containter-entrypoint.sh`` which sets up a Dask scheduler and workers across the cluster.
+``start-slurm.sh`` calls ``container-entrypoint.sh``, which sets up a Dask scheduler and workers across the cluster.
 
-Our Python examples are designed to work such that they can be run locally on their own, or easily substituted into the ``start-slurm.sh`` to run on multiple nodes.
+Our Python examples are designed so that they can be run locally on their own or easily substituted into the ``start-slurm.sh`` script to run on multiple nodes.
 You can adapt your scripts easily too by simply following the pattern of adding ``get_client`` with ``add_distributed_args``.
 
 -----------------------------------------
diff --git a/examples/README.md b/examples/README.md
index 46409041..a61dea8b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1 +1,21 @@
-# TODO
+# NeMo Curator Python API examples
+
+This directory contains multiple Python scripts with examples of how to use various NeMo Curator classes and functions.
+The goal of these examples is to give you an overview of the many ways your text data can be curated.
+These include:
+
+- `blend_and_shuffle.py`: Combine multiple datasets into one with different amounts of each dataset, then randomly permute the dataset.
+- `classifier_filtering.py`: Train a fastText classifier, then use it to filter high- and low-quality data.
+- `download_arxiv.py`: Download arXiv tar files and extract them.
+- `download_common_crawl.py`: Download Common Crawl WARC snapshots and extract them.
+- `download_wikipedia.py`: Download the latest Wikipedia dumps and extract them.
+- `exact_deduplication.py`: Use the `ExactDuplicates` class to perform exact deduplication on text data.
+- `find_pii_and_deidentify.py`: Use the `PiiModifier` and `Modify` classes to remove personally identifiable information from text data.
+- `fuzzy_deduplication.py`: Use the `FuzzyDuplicatesConfig` and `FuzzyDuplicates` classes to perform fuzzy deduplication on text data.
+- `identify_languages_and_fix_unicode.py`: Use `FastTextLangId` to filter data by language, then fix the Unicode in it.
+- `raw_download_common_crawl.py`: Download the raw compressed WARC files from Common Crawl without extracting them.
+- `semdedup_example.py`: Use the `SemDedup` class to perform semantic deduplication on text data.
+- `task_decontamination.py`: Remove segments of downstream evaluation tasks from a dataset.
+- `translation_example.py`: Create and use an `IndicTranslation` model for language translation.
+
+The `classifiers`, `k8s`, `nemo_run`, and `slurm` subdirectories contain even more examples of NeMo Curator's capabilities.
diff --git a/examples/slurm/README.md b/examples/slurm/README.md
index 46409041..df6e4408 100644
--- a/examples/slurm/README.md
+++ b/examples/slurm/README.md
@@ -1 +1,9 @@
-# TODO
+# Dask with Slurm
+
+This directory provides an example Slurm script pipeline.
+This pipeline has a script `start-slurm.sh` that provides configuration options similar to what `get_client` provides.
+Every Slurm cluster is different, so make sure you understand how your Slurm cluster works so the scripts can be easily adapted.
+`start-slurm.sh` calls `container-entrypoint.sh`, which sets up a Dask scheduler and workers across the cluster.
+
+Our Python examples are designed so that they can be run locally on their own or easily substituted into the `start-slurm.sh` script to run on multiple nodes.
+You can easily adapt your own scripts as well by following the pattern of adding `get_client` with `add_distributed_args`.
diff --git a/nemo_curator/modules/dataset_ops.py b/nemo_curator/modules/dataset_ops.py
index 38589b1e..745d741c 100644
--- a/nemo_curator/modules/dataset_ops.py
+++ b/nemo_curator/modules/dataset_ops.py
@@ -117,7 +117,7 @@ def blend_datasets(
     target_size: int, datasets: List[DocumentDataset], sampling_weights: List[float]
 ) -> DocumentDataset:
     """
-    Combined multiple datasets into one with different amounts of each dataset
+    Combines multiple datasets into one with different amounts of each dataset.
     Args:
         target_size: The number of documents the resulting dataset should have.
             The actual size of the dataset may be slightly larger if the normalized weights do not allow
diff --git a/nemo_curator/scripts/README.md b/nemo_curator/scripts/README.md
index 46409041..418043e4 100644
--- a/nemo_curator/scripts/README.md
+++ b/nemo_curator/scripts/README.md
@@ -1 +1,29 @@
-# TODO
+# NeMo Curator CLI Scripts
+
+The following Python scripts are designed to be executed from the command line (terminal) only.
+
+Here, we list each Python script alongside the CLI command used to run it:
+
+| Python Command                           | CLI Command                    |
+|------------------------------------------|--------------------------------|
+| python add_id.py                         | add_id                         |
+| python blend_datasets.py                 | blend_datasets                 |
+| python download_and_extract.py           | download_and_extract           |
+| python filter_documents.py               | filter_documents               |
+| python find_exact_duplicates.py          | gpu_exact_dups                 |
+| python find_matching_ngrams.py           | find_matching_ngrams           |
+| python find_pii_and_deidentify.py        | deidentify                     |
+| python get_common_crawl_urls.py          | get_common_crawl_urls          |
+| python get_wikipedia_urls.py             | get_wikipedia_urls             |
+| python make_data_shards.py               | make_data_shards               |
+| python prepare_fasttext_training_data.py | prepare_fasttext_training_data |
+| python prepare_task_data.py              | prepare_task_data              |
+| python remove_matching_ngrams.py         | remove_matching_ngrams         |
+| python separate_by_metadata.py           | separate_by_metadata           |
+| python text_cleaning.py                  | text_cleaning                  |
+| python train_fasttext.py                 | train_fasttext                 |
+| python verify_classification_results.py  | verify_classification_results  |
+
+For more information about the arguments each script accepts, run its CLI command with `--help` (for example, `add_id --help`).
+
+More scripts can be found in the `classifiers`, `fuzzy_deduplication`, and `semdedup` subdirectories.
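The `get_client` with `add_distributed_args` pattern mentioned in the Slurm README above looks roughly like the following. This is a minimal sketch, not a canonical script: the exact signatures of `get_client` and `add_distributed_args` have shifted across NeMo Curator releases, and the pipeline body is a placeholder.

```python
# Minimal sketch of the get_client + add_distributed_args pattern.
# Exact signatures vary across NeMo Curator releases; treat as illustrative.
import argparse

from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args


def main(args: argparse.Namespace) -> None:
    # Build or attach to a Dask cluster from the parsed CLI arguments.
    # Under Slurm, this attaches to the scheduler that
    # container-entrypoint.sh started; locally, it spins one up.
    client = get_client(args, args.device)

    # ... your curation pipeline goes here ...

    client.close()


def attach_args(
    parser: argparse.ArgumentParser = argparse.ArgumentParser(),
) -> argparse.ArgumentParser:
    # add_distributed_args attaches the standard Dask/cluster CLI options
    # (scheduler address, number of workers, device, and so on).
    return add_distributed_args(parser)


if __name__ == "__main__":
    main(attach_args().parse_args())
```

Because the cluster setup lives entirely in the parsed arguments, the same script runs unchanged on a laptop or across multiple Slurm nodes.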
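To make the `blend_datasets` docstring above concrete, here is a minimal sketch of blending and shuffling from the Python API, in the spirit of `examples/blend_and_shuffle.py`. The file paths, target size, and weights are illustrative placeholders:

```python
# Minimal sketch of blending two datasets and shuffling the result.
# Paths, target size, and weights are placeholders.
import nemo_curator as nc
from nemo_curator.datasets import DocumentDataset

books = DocumentDataset.read_json("books.jsonl")
web = DocumentDataset.read_json("web.jsonl")

# Draw roughly 30% of the target from books and 70% from web.
blended = nc.blend_datasets(
    target_size=10_000,
    datasets=[books, web],
    sampling_weights=[0.3, 0.7],
)

# Randomly permute the blended dataset, then write it out.
shuffle = nc.Shuffle(seed=42)
blended = shuffle(blended)
blended.to_json("blended_dataset/")
```

As the docstring notes, the weights are normalized, so the output may end up slightly larger than `target_size`.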
diff --git a/nemo_curator/scripts/classifiers/README.md b/nemo_curator/scripts/classifiers/README.md
index 46409041..0499e370 100644
--- a/nemo_curator/scripts/classifiers/README.md
+++ b/nemo_curator/scripts/classifiers/README.md
@@ -1 +1,92 @@
-# TODO
+## Text Classification
+
+The Python scripts in this directory demonstrate how to run classification on your text data with each of these four classifiers:
+
+- Domain Classifier
+- Quality Classifier
+- AEGIS Safety Models
+- FineWeb Educational Content Classifier
+
+For more information about these classifiers, please see NeMo Curator's [Distributed Data Classification documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html).
+
+### Usage
+
+#### Domain classifier inference
+
+```bash
+# same as `python domain_classifier_inference.py`
+domain_classifier_inference \
+    --input-data-dir /path/to/data/directory \
+    --output-data-dir /path/to/output/directory \
+    --input-file-type "jsonl" \
+    --input-file-extension "jsonl" \
+    --output-file-type "jsonl" \
+    --input-text-field "text" \
+    --batch-size 64 \
+    --autocast \
+    --max-chars 2000 \
+    --device "gpu"
+```
+
+Additional arguments are available for customizing the Dask cluster and client. Run `domain_classifier_inference --help` for more information.
+
+#### Quality classifier inference
+
+```bash
+# same as `python quality_classifier_inference.py`
+quality_classifier_inference \
+    --input-data-dir /path/to/data/directory \
+    --output-data-dir /path/to/output/directory \
+    --input-file-type "jsonl" \
+    --input-file-extension "jsonl" \
+    --output-file-type "jsonl" \
+    --input-text-field "text" \
+    --batch-size 64 \
+    --autocast \
+    --max-chars 2000 \
+    --device "gpu"
+```
+
+Additional arguments are available for customizing the Dask cluster and client. Run `quality_classifier_inference --help` for more information.
+
+#### AEGIS classifier inference
+
+```bash
+# same as `python aegis_classifier_inference.py`
+aegis_classifier_inference \
+    --input-data-dir /path/to/data/directory \
+    --output-data-dir /path/to/output/directory \
+    --input-file-type "jsonl" \
+    --input-file-extension "jsonl" \
+    --output-file-type "jsonl" \
+    --input-text-field "text" \
+    --batch-size 64 \
+    --max-chars 6000 \
+    --device "gpu" \
+    --aegis-variant "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0" \
+    --token "hf_1234"
+```
+
+- `--aegis-variant` can be `nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0`, `nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0`, or a path to your own PEFT model of LlamaGuard 2.
+- `--token` is your Hugging Face token, which is used when downloading the base Llama Guard model.
+
+Additional arguments are available for customizing the Dask cluster and client. Run `aegis_classifier_inference --help` for more information.
+
+#### FineWeb-Edu classifier inference
+
+```bash
+# same as `python fineweb_edu_classifier_inference.py`
+fineweb_edu_classifier_inference \
+    --input-data-dir /path/to/data/directory \
+    --output-data-dir /path/to/output/directory \
+    --input-file-type "jsonl" \
+    --input-file-extension "jsonl" \
+    --output-file-type "jsonl" \
+    --input-text-field "text" \
+    --batch-size 64 \
+    --autocast \
+    --max-chars 2000 \
+    --device "gpu"
+```
+
+Additional arguments are available for customizing the Dask cluster and client. Run `fineweb_edu_classifier_inference --help` for more information.
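Each CLI command above also has a Python API counterpart in `nemo_curator.classifiers`. Here is a rough sketch for the domain classifier, assuming a GPU-backed Dask client and cuDF-backed reads; the directory paths are placeholders:

```python
# Rough sketch of domain classification via the Python API.
# Assumes a GPU-enabled environment; paths are placeholders.
from nemo_curator.classifiers import DomainClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client

client = get_client(cluster_type="gpu")

# Read JSONL input with a cuDF backend so inference runs on the GPU.
dataset = DocumentDataset.read_json("/path/to/data/directory", backend="cudf")

classifier = DomainClassifier(batch_size=64)
result = classifier(dataset)

result.to_json("/path/to/output/directory")
client.close()
```

The same pattern applies to the quality, AEGIS, and FineWeb-Edu classifiers; see the Distributed Data Classification documentation linked above for the full Python API.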