From cd0eaca555f4236d32cb314a861142e020f61279 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 11 Jul 2024 18:06:28 -0700
Subject: [PATCH 01/18] cli

---
 llmfoundry/data_prep/__init__.py              |  23 +
 llmfoundry/data_prep/convert_text_to_mds.py   | 617 ++++++++++++++++++
 scripts/data_prep/convert_text_to_mds.py      | 503 +-------------
 .../data_prep/test_convert_text_to_mds.py     |  18 +-
 4 files changed, 651 insertions(+), 510 deletions(-)
 create mode 100644 llmfoundry/data_prep/__init__.py
 create mode 100644 llmfoundry/data_prep/convert_text_to_mds.py

diff --git a/llmfoundry/data_prep/__init__.py b/llmfoundry/data_prep/__init__.py
new file mode 100644
index 0000000000..c959c0ddcd
--- /dev/null
+++ b/llmfoundry/data_prep/__init__.py
@@ -0,0 +1,23 @@
+from llmfoundry.data_prep.convert_text_to_mds import (
+    convert_text_to_mds,
+    convert_text_to_mds_from_args,
+    maybe_create_object_store_from_uri,
+    parse_uri,
+    download_and_convert,
+    merge_shard_groups,
+    is_already_processed,
+    write_done_file,
+    DONE_FILENAME,
+)
+
+__all__ = [
+    'convert_text_to_mds',
+    'convert_text_to_mds_from_args',
+    'maybe_create_object_store_from_uri',
+    'parse_uri',
+    'download_and_convert',
+    'merge_shard_groups',
+    'is_already_processed',
+    'write_done_file',
+    'DONE_FILENAME'
+]
\ No newline at end of file
diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
new file mode 100644
index 0000000000..ce6a8f7f70
--- /dev/null
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -0,0 +1,617 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import math
+import os
+import tempfile
+from argparse import ArgumentParser, Namespace
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+from glob import glob
+from typing import Dict, Iterable, List, Tuple, cast
+
+import numpy as np
+import psutil
+from composer.utils import (
+    ObjectStore,
+    maybe_create_object_store_from_uri,
+    parse_uri,
+)
+from numpy.typing import NDArray
+from streaming import MDSWriter
+from tqdm import tqdm
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from llmfoundry.data.data import AbstractConcatTokensDataset
+from llmfoundry.utils.data_prep_utils import (
+    DownloadingIterable,
+    download_file,
+    merge_shard_groups,
+)
+from llmfoundry.utils.exceptions import (
+    InputFolderMissingDataError,
+    OutputFolderNotEmptyError,
+)
+
+log = logging.getLogger(__name__)
+
+DONE_FILENAME = '.text_to_mds_conversion_done'
+
+
+class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset):
+    """An IterableDataset that returns token samples for MDSWriter from files.
+
+    Returns dicts of {'tokens': ndarray:int32}
+
+    Each file is considered a sequence.
+    """
+
+    def __init__(
+        self,
+        files: Iterable[str],
+        tokenizer: PreTrainedTokenizerBase,
+        max_length: int,
+        bos_text: str,
+        eos_text: str,
+        no_wrap: bool,
+    ):
+        self.files = files
+        super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap)
+
+    def __iter__(self) -> Iterable[Dict[str, NDArray]]:
+
+        buffer = []
+        for file in self.files:
+            with open(file, 'r') as f:
+                buffer += self.bos_tokens
+                first_chunk = True
+                # Read the file in 1MB chunks to avoid memory issues
+                for chunk in iter(partial(f.read, 1000000), ''):
+                    # Tokenize the chunk
+                    encoded = self.tokenizer(
+                        chunk,
+                        truncation=False,
+                        padding=False,
+                    )
+                    iids = encoded['input_ids']
+
+                    # If this is not the first chunk, remove the BOS token
+                    if not first_chunk:
+                        if iids[0] == self.tokenizer.bos_token_id:
+                            iids = iids[1:]
+
+                    # Add the tokens to the buffer
+                    buffer += iids
+                    while len(buffer) >= self.max_length:
+                        concat_sample = buffer[:self.max_length]
+                        buffer = buffer[self.
+                                        max_length:] if self.should_wrap else []
+                        yield {
+                            'tokens': np.asarray(concat_sample, dtype=np.int32),
+                        }
+
+                    first_chunk = False
+
+                # Add the EOS token to the buffer to separate files.
+                buffer += self.eos_tokens
+
+        # Yield any remaining samples of size max_length.
+        while len(buffer) >= self.max_length:
+            concat_sample = buffer[:self.max_length]
+            buffer = buffer[self.max_length:] if self.should_wrap else []
+            yield {'tokens': np.asarray(concat_sample, dtype=np.int32)}
+
+
+def parse_args() -> Namespace:
+    """Parse commandline arguments."""
+    parser = ArgumentParser(
+        description=
+        'Convert text files into MDS format, optionally concatenating and tokenizing',
+    )
+    parser.add_argument(
+        '--output_folder',
+        type=str,
+        required=True,
+        help='The folder to write output to',
+    )
+    parser.add_argument(
+        '--input_folder',
+        type=str,
+        required=True,
+        help='The folder with text files to convert to mds',
+    )
+    parser.add_argument(
+        '--compression',
+        type=str,
+        default='zstd',
+        required=False,
+        help='The compression algorithm to use for MDS writing',
+    )
+
+    parser.add_argument(
+        '--concat_tokens',
+        type=int,
+        required=True,
+        help='Convert text to tokens and concatenate up to this many tokens',
+    )
+
+    parser.add_argument(
+        '--tokenizer',
+        type=str,
+        required=True,
+        help='The name of the tokenizer to use',
+    )
+    parser.add_argument(
+        '--bos_text',
+        type=str,
+        required=False,
+        default=None,
+        help=
+        'The text to prepend to each example to separate concatenated examples',
+    )
+    parser.add_argument(
+        '--eos_text',
+        type=str,
+        required=False,
+        default=None,
+        help=
+        'The text to append to each example to separate concatenated examples',
+    )
+    parser.add_argument(
+        '--use_tokenizer_eos',
+        required=False,
+        action='store_true',
+        default=False,
+        help='Use the EOS text from the tokenizer.',
+    )
+    parser.add_argument(
+        '--no_wrap',
+        default=False,
+        action='store_true',
+        help=
+        'Whether to let text examples wrap across multiple training examples',
+    )
+    parser.add_argument(
+        '--processes',
+        type=int,
+        required=False,
+        default=min(max(psutil.cpu_count() - 2, 1), 32),
+        help=
+        'The number of processes to use to download and convert the dataset',
+    )
+    parser.add_argument(
+        '--reprocess',
+        type=bool,
+        required=False,
+        default=False,
+        help='If true, reprocess the input_folder to mds format. Otherwise, ' +
+        'only reprocess upon changes to the input folder or dataset creation parameters.',
+    )
+    parser.add_argument(
+        '--trust-remote-code',
+        type=bool,
+        required=False,
+        default=False,
+        help='If true, allows custom code to be executed to load the tokenizer',
+    )
+    parser.add_argument(
+        '--logging-level',
+        type=str,
+        required=False,
+        default='INFO',
+        help='Logging level for the script. Default is INFO.',
+    )
+    parsed = parser.parse_args()
+    return parsed
+
+
+def get_object_names(input_folder: str) -> List[str]:
+    """Get the names of the ``.txt`` objects in a local or remote folder.
+
+    Args:
+        input_folder (str): local or remote folder path.
+    """
+    object_store = maybe_create_object_store_from_uri(input_folder)
+    if object_store is not None:
+        _, _, folder_prefix = parse_uri(input_folder)
+        names = [
+            name for name in object_store.list_objects(folder_prefix)
+            if name.endswith('.txt')
+        ]
+    else:
+        # input_folder is a local folder; walk it and glob *.txt per directory
+        names = [
+            text_file for dirpath, _, _ in os.walk(input_folder)
+            for text_file in glob(os.path.join(dirpath, '*.txt'))
+        ]
+    # Only the names are returned; object sizes are not collected.
+    log.info(f'Found {len(names)} text files at {input_folder}')
+
+    return names
+
+
+def get_task_args(
+    object_names: List[str],
+    output_root: str,
+    input_folder: str,
+    n_groups: int,
+    tokenizer_name: str,
+    concat_tokens: int,
+    eos_text: str,
+    bos_text: str,
+    no_wrap: bool,
+    compression: str,
+    trust_remote_code: bool,
+) -> Iterable:
+    """Get download_and_convert arguments split across n_groups.
+
+    Each group handles a portion of object_names.
+
+    Args:
+        object_names (List[str]): Names of objects to process
+        output_root (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
+        n_groups (int): Number of groups to split the object names into
+        tokenizer_name (str): Name of tokenizer to use
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
+        bos_text (str): Text to prepend to each example to separate concatenated samples
+        no_wrap: (bool): Whether to let text examples wrap across multiple training examples
+        compression (str): The compression algorithm to use for MDS writing
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+    """
+    num_objects = len(object_names)
+    objs_per_group = math.ceil(num_objects / n_groups)
+    for group, i in enumerate(range(0, num_objects, objs_per_group)):
+        output_subdir = os.path.join(output_root, str(group))
+        yield (
+            object_names[i:min(i + objs_per_group, num_objects)],
+            output_subdir,
+            input_folder,
+            tokenizer_name,
+            concat_tokens,
+            eos_text,
+            bos_text,
+            no_wrap,
+            compression,
+            trust_remote_code,
+        )
+
+
+def download_and_convert_starargs(args: Tuple):
+    """Helper function to call download_and_convert with star args.
+
+    This helps us use download_and_convert with multiprocessing.
+    """
+    return download_and_convert(*args)
+
+
+def download_and_convert(
+    file_names: List[str],
+    output_folder: str,
+    input_folder: str,
+    tokenizer_name: str,
+    concat_tokens: int,
+    eos_text: str,
+    bos_text: str,
+    no_wrap: bool,
+    compression: str,
+    trust_remote_code: bool,
+):
+    """Downloads and converts text files to MDS format.
+
+    Args:
+        file_names (List[str]): Files to process
+        output_folder (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
+        tokenizer_name (str): Name of tokenizer to use
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
+        bos_text (str): Text to prepend to each example to separate concatenated samples
+        no_wrap: (bool): Whether to let text examples wrap across multiple training examples
+        compression (str): The compression algorithm to use for MDS writing
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+    """
+    object_store = maybe_create_object_store_from_uri(input_folder)
+
+    # Download file_names
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        downloading_iter = DownloadingIterable(
+            object_names=file_names,
+            output_folder=tmp_dir,
+            object_store=object_store,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
+            trust_remote_code=trust_remote_code,
+        )
+        tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
+
+        # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
+        # to the maximum sequence length
+        dataset = ConcatTokensFromFilesDataset(
+            files=downloading_iter,
+            max_length=concat_tokens,
+            tokenizer=tokenizer,
+            eos_text=eos_text,
+            bos_text=bos_text,
+            no_wrap=no_wrap,
+        )
+
+        columns = {'tokens': 'ndarray:int32'}
+
+        log.info('Converting to MDS format...')
+        with MDSWriter(
+            out=output_folder,
+            columns=columns,
+            compression=compression,
+        ) as out:
+            for sample in tqdm(dataset):
+                out.write(sample)
+
+
+def is_remote_path(path: str) -> bool:
+    """Checks whether a path is a remote path.
+
+    Args:
+        path (str): path to check
+    """
+    backend, _, _ = parse_uri(path)
+    return backend != ''
+
+
+def is_already_processed(
+    output_root: str,
+    args_str: str,
+    object_names: List[str],
+) -> bool:
+    """Determines whether a group of text files has already been processed.
+
+    Checks the done file at output root to determine this.
+
+    Args:
+        output_root (str): Output folder where a done file may exist
+        args_str (str): String representation of the arguments
+        object_names (List[str]): Names of objects to convert to MDS format
+
+    Returns:
+        bool: True if the done file matches ``args_str`` and ``object_names``
+    """
+    # Retrieve the done file contents
+    output_object_store = maybe_create_object_store_from_uri(output_root)
+    if output_object_store is not None:
+        # Download and read the done file from the remote object store
+        _, _, output_folder_prefix = parse_uri(output_root)
+        try:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                done_file = os.path.join(tmp_dir, DONE_FILENAME)
+                download_file(
+                    object_store=output_object_store,
+                    object_name=os.path.join(
+                        output_folder_prefix,
+                        DONE_FILENAME,
+                    ),
+                    output_filename=done_file,
+                )
+                with open(done_file) as df:
+                    done_file_contents = df.read().splitlines()
+        except FileNotFoundError:
+            return False
+    else:
+        # Read the local done file
+        done_file = os.path.join(output_root, DONE_FILENAME)
+        if not os.path.isfile(done_file):
+            return False
+        with open(done_file) as df:
+            done_file_contents = df.read().splitlines()
+    # An empty done file records no finished conversion; bail out rather than
+    # hit an IndexError below.
+    if not done_file_contents:
+        return False
+
+    # First line: arguments of the previous run
+    if done_file_contents[0] != args_str:
+        return False
+    # Remaining lines: previously processed object names, in order
+    return done_file_contents[1:] == list(object_names)
+
+
+def write_done_file(folder: str, args_str: str, object_names: List[str]):
+    """Write a file to signify completion.
+
+    The done file holds the processing arguments on its first line, followed
+    by the name of each processed object, one per line.
+
+    Args:
+        folder (str): Folder to write the done file to
+        args_str (str): String representation of arguments
+        object_names (List[str]): List of objects to convert to MDS format
+    """
+    with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file:
+        done_file.write('\n'.join([args_str] + object_names) + '\n')
+
+
+def convert_text_to_mds(
+    tokenizer_name: str,
+    output_folder: str,
+    input_folder: str,
+    concat_tokens: int,
+    eos_text: str,
+    bos_text: str,
+    no_wrap: bool,
+    compression: str,
+    processes: int,
+    args_str: str,
+    reprocess: bool,
+    trust_remote_code: bool,
+):
+    """Convert a folder of text files to MDS format.
+
+    Args:
+        tokenizer_name (str): Name of tokenizer to use
+        output_folder (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
+        bos_text (str): Text to prepend to each example to separate concatenated samples
+        no_wrap (bool): Whether to let text examples wrap across multiple training examples
+        compression (str): The compression algorithm to use for MDS writing
+        processes (int): The number of processes to use.
+        args_str (str): String representation of the arguments
+        reprocess (bool): Whether to always reprocess the given folder of text files
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+    """
+    is_remote_output = is_remote_path(output_folder)
+
+    object_names = get_object_names(input_folder)
+    if len(object_names) == 0:
+        raise InputFolderMissingDataError(input_folder)
+
+    # Check if the text files in the bucket have already been processed.
+    if not reprocess and is_already_processed(
+        output_folder,
+        args_str,
+        object_names,
+    ):
+        log.info(
+            f'Input folder {input_folder} is already processed at {output_folder} and '
+            +
+            'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.',
+        )
+        return
+
+    # Stage output in a temporary local directory when the output is remote
+    local_output_folder = tempfile.TemporaryDirectory(
+    ).name if is_remote_output else output_folder
+
+    if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0:
+        raise OutputFolderNotEmptyError(output_folder)
+
+    if processes > 1:
+        # Download and convert the text files in parallel
+        args = get_task_args(
+            object_names,
+            local_output_folder,
+            input_folder,
+            processes,
+            tokenizer_name,
+            concat_tokens,
+            eos_text,
+            bos_text,
+            no_wrap,
+            compression,
+            trust_remote_code,
+        )
+        with ProcessPoolExecutor(max_workers=processes) as executor:
+            list(executor.map(download_and_convert_starargs, args))
+
+        # Merge the mds shards from each of the processes into a single folder
+        merge_shard_groups(local_output_folder)
+    else:
+        download_and_convert(
+            object_names,
+            local_output_folder,
+            input_folder,
+            tokenizer_name,
+            concat_tokens,
+            eos_text,
+            bos_text,
+            no_wrap,
+            compression,
+            trust_remote_code,
+        )
+
+    # Write a done file with the args and object names
+    write_done_file(local_output_folder, args_str, object_names)
+
+    if is_remote_output:
+        # Upload the local output to the remote location
+        output_object_store = cast(
+            ObjectStore,
+            maybe_create_object_store_from_uri(output_folder),
+        )
+        _, _, output_folder_prefix = parse_uri(output_folder)
+        files_to_upload = os.listdir(local_output_folder)
+
+        for file in files_to_upload:
+            # os.listdir yields bare names; join with the folder so the
+            # directory check is not resolved against the working directory
+            local_path = os.path.join(local_output_folder, file)
+            assert not os.path.isdir(local_path)
+            remote_path = os.path.join(output_folder_prefix, file)
+            output_object_store.upload_object(remote_path, local_path)
+
+
+def _args_str(original_args: Namespace) -> str:
+    """Create a string from the args to determine whether to reprocess.
+
+    Args:
+        original_args (Namespace): Arguments to main function.
+    """
+    # Take the arguments that influence the final result.
+    # reprocess and max_mds_writer_workers are not taken.
+    args = Namespace(
+        tokenizer_name=original_args.tokenizer,
+        output_folder=original_args.output_folder,
+        input_folder=original_args.input_folder,
+        concat_tokens=original_args.concat_tokens,
+        eos_text=original_args.eos_text,
+        bos_text=original_args.bos_text,
+        no_wrap=original_args.no_wrap,
+        compression=original_args.compression,
+        processes=original_args.processes,
+    )
+
+    return str(args)
+
+
+def _configure_logging(logging_level: str):
+    """Configure logging.
+
+    Args:
+        logging_level (str): Logging level.
+    """
+    logging.basicConfig(
+        format=
+        f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s',
+    )
+    logging_level = logging_level.upper()
+    logging.getLogger('llmfoundry').setLevel(logging_level)
+    logging.getLogger(__name__).setLevel(logging_level)
+    log.info(f'Logging level set to {logging_level}')
+
+
+def convert_text_to_mds_from_args(args: Namespace) -> None:
+    """Resolve BOS/EOS text on ``args``, then run the MDS conversion."""
+    if args.use_tokenizer_eos:
+        # Ensure that eos text is not specified twice; raise ValueError if so.
+        if args.eos_text is not None:
+            raise ValueError(
+                'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.',
+            )
+        args.eos_text = AutoTokenizer.from_pretrained(
+            args.tokenizer,
+            trust_remote_code=args.trust_remote_code,
+        ).eos_token
+
+    # now that we have validated them, change BOS/EOS to strings
+    if args.bos_text is None:
+        args.bos_text = ''
+    if args.eos_text is None:
+        args.eos_text = ''
+    _configure_logging(args.logging_level)
+    convert_text_to_mds(
+        tokenizer_name=args.tokenizer,
+        output_folder=args.output_folder,
+        input_folder=args.input_folder,
+        concat_tokens=args.concat_tokens,
+        eos_text=args.eos_text,
+        bos_text=args.bos_text,
+        no_wrap=args.no_wrap,
+        compression=args.compression,
+        processes=args.processes,
+        reprocess=args.reprocess,
+        trust_remote_code=args.trust_remote_code,
+        args_str=_args_str(args),
+    )
diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index 92c36eb35d..f0ab28a90f 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -2,107 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-import math
-import os
-import tempfile
 from argparse import ArgumentParser, Namespace
-from concurrent.futures import ProcessPoolExecutor
-from functools import partial
-from glob import glob
-from typing import Dict, Iterable, List, Tuple, cast
 
-import numpy as np
 import psutil
-from composer.utils import (
-    ObjectStore,
-    maybe_create_object_store_from_uri,
-    parse_uri,
-)
-from numpy.typing import NDArray
-from streaming import MDSWriter
-from tqdm import tqdm
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
-from llmfoundry.data.data import AbstractConcatTokensDataset
-from llmfoundry.utils.data_prep_utils import (
-    DownloadingIterable,
-    download_file,
-    merge_shard_groups,
-)
-from llmfoundry.utils.exceptions import (
-    InputFolderMissingDataError,
-    OutputFolderNotEmptyError,
-)
+from llmfoundry.data_prep import convert_text_to_mds_from_args
 
 log = logging.getLogger(__name__)
 
 DONE_FILENAME = '.text_to_mds_conversion_done'
 
 
-class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset):
-    """An IterableDataset that returns token samples for MDSWriter from files.
-
-    Returns dicts of {'tokens': ndarray:int32}
-
-    Each file is considered a sequence.
-    """
-
-    def __init__(
-        self,
-        files: Iterable[str],
-        tokenizer: PreTrainedTokenizerBase,
-        max_length: int,
-        bos_text: str,
-        eos_text: str,
-        no_wrap: bool,
-    ):
-        self.files = files
-        super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap)
-
-    def __iter__(self) -> Iterable[Dict[str, NDArray]]:
-
-        buffer = []
-        for file in self.files:
-            with open(file, 'r') as f:
-                buffer += self.bos_tokens
-                first_chunk = True
-                # Read the file in 1MB chunks to avoid memory issues
-                for chunk in iter(partial(f.read, 1000000), ''):
-                    # Tokenize the chunk
-                    encoded = self.tokenizer(
-                        chunk,
-                        truncation=False,
-                        padding=False,
-                    )
-                    iids = encoded['input_ids']
-
-                    # If this is not the first chunk, remove the BOS token
-                    if not first_chunk:
-                        if iids[0] == self.tokenizer.bos_token_id:
-                            iids = iids[1:]
-
-                    # Add the tokens to the buffer
-                    buffer += iids
-                    while len(buffer) >= self.max_length:
-                        concat_sample = buffer[:self.max_length]
-                        buffer = buffer[self.
-                                        max_length:] if self.should_wrap else []
-                        yield {
-                            'tokens': np.asarray(concat_sample, dtype=np.int32),
-                        }
-
-                    first_chunk = False
-
-                # Add the EOS token to the buffer to separate files.
-                buffer += self.eos_tokens
-
-        # Yield any remaining samples of size max_length.
-        while len(buffer) >= self.max_length:
-            concat_sample = buffer[:self.max_length]
-            buffer = buffer[self.max_length:] if self.should_wrap else []
-            yield {'tokens': np.asarray(concat_sample, dtype=np.int32)}
-
-
 def parse_args() -> Namespace:
     """Parse commandline arguments."""
     parser = ArgumentParser(
@@ -203,418 +113,9 @@ def parse_args() -> Namespace:
         help='Logging level for the script. Default is INFO.',
     )
     parsed = parser.parse_args()
-
-    # Set eos token.
-    if parsed.use_tokenizer_eos:
-        # Ensure that eos text is not specified twice.
-        if parsed.eos_text is not None:
-            parser.error(
-                'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.',
-            )
-        tokenizer = AutoTokenizer.from_pretrained(
-            parsed.tokenizer,
-            trust_remote_code=parsed.trust_remote_code,
-        )
-        parsed.eos_text = tokenizer.eos_token
-
-    # now that we have validated them, change BOS/EOS to strings
-    if parsed.bos_text is None:
-        parsed.bos_text = ''
-    if parsed.eos_text is None:
-        parsed.eos_text = ''
     return parsed
 
 
-def get_object_names(input_folder: str) -> List[str]:
-    """Get object names from a local or remote folder.
-
-    Args:
-        input_folder (str): local or remote folder path.
-    """
-    object_store = maybe_create_object_store_from_uri(input_folder)
-    if object_store is not None:
-        _, _, folder_prefix = parse_uri(input_folder)
-        names = [
-            name for name in object_store.list_objects(folder_prefix)
-            if name.endswith('.txt')
-        ]
-    else:
-        # input_folder is a local folder
-        names = [
-            text_file for dirpath, _, _ in os.walk(input_folder)
-            for text_file in glob(os.path.join(dirpath, '*.txt'))
-        ]
-    # return names, sizes
-    log.info(f'Found {len(names)} text files at {input_folder}')
-
-    return names
-
-
-def get_task_args(
-    object_names: List[str],
-    output_root: str,
-    input_folder: str,
-    n_groups: int,
-    tokenizer_name: str,
-    concat_tokens: int,
-    eos_text: str,
-    bos_text: str,
-    no_wrap: bool,
-    compression: str,
-    trust_remote_code: bool,
-) -> Iterable:
-    """Get download_and_convert arguments split across n_groups.
-
-    Each group handles a portion of object_names.
-
-    Args:
-        object_names (List[str]): Names of objects to process
-        output_root (str): Folder to write MDS shards to
-        input_folder (str): Folder of text files to process
-        n_groups (int): Number of groups to split the object names into
-        tokenizer_name (str): Name of tokenizer to use
-        concat_tokens (int): Concatenate up to this many tokens
-        eos_text (str): Text to append to each example to separate concatenated samples
-        bos_text (str): Text to prepend to each example to separate concatenated samples
-        no_wrap: (bool): Whether to let text examples wrap across multiple training examples
-        compression (str): The compression algorithm to use for MDS writing
-        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
-    """
-    num_objects = len(object_names)
-    objs_per_group = math.ceil(num_objects / n_groups)
-    for group, i in enumerate(range(0, num_objects, objs_per_group)):
-        output_subdir = os.path.join(output_root, str(group))
-        yield (
-            object_names[i:min(i + objs_per_group, num_objects)],
-            output_subdir,
-            input_folder,
-            tokenizer_name,
-            concat_tokens,
-            eos_text,
-            bos_text,
-            no_wrap,
-            compression,
-            trust_remote_code,
-        )
-
-
-def download_and_convert_starargs(args: Tuple):
-    """Helper function to call download_and_convert with star args.
-
-    This helps us use download_and_convert with multiprocessing.
-    """
-    return download_and_convert(*args)
-
-
-def download_and_convert(
-    file_names: List[str],
-    output_folder: str,
-    input_folder: str,
-    tokenizer_name: str,
-    concat_tokens: int,
-    eos_text: str,
-    bos_text: str,
-    no_wrap: bool,
-    compression: str,
-    trust_remote_code: bool,
-):
-    """Downloads and converts text files to MDS format.
-
-    Args:
-        file_names (List[str]): Files to process
-        output_folder (str): Folder to write MDS shards to
-        input_folder (str): Folder of text files to process
-        tokenizer_name (str): Name of tokenizer to use
-        concat_tokens (int): Concatenate up to this many tokens
-        eos_text (str): Text to append to each example to separate concatenated samples
-        bos_text (str): Text to prepend to each example to separate concatenated samples
-        no_wrap: (bool): Whether to let text examples wrap across multiple training examples
-        compression (str): The compression algorithm to use for MDS writing
-        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
-    """
-    object_store = maybe_create_object_store_from_uri(input_folder)
-
-    # Download file_names
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        downloading_iter = DownloadingIterable(
-            object_names=file_names,
-            output_folder=tmp_dir,
-            object_store=object_store,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_name,
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
-
-        # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
-        # to the maximum sequence length
-        dataset = ConcatTokensFromFilesDataset(
-            files=downloading_iter,
-            max_length=concat_tokens,
-            tokenizer=tokenizer,
-            eos_text=eos_text,
-            bos_text=bos_text,
-            no_wrap=no_wrap,
-        )
-
-        columns = {'tokens': 'ndarray:int32'}
-
-        log.info('Converting to MDS format...')
-        with MDSWriter(
-            out=output_folder,
-            columns=columns,
-            compression=compression,
-        ) as out:
-            for sample in tqdm(dataset):
-                out.write(sample)
-
-
-def is_remote_path(path: str) -> bool:
-    """Checks whether a path is a remote path.
-
-    Args:
-        path (str): path to check
-    """
-    backend, _, _ = parse_uri(path)
-    return backend != ''
-
-
-def is_already_processed(
-    output_root: str,
-    args_str: str,
-    object_names: List[str],
-) -> bool:
-    """Determines whether a group of text files has already been processed.
-
-    Checks the done fie at output root to determine this.
-
-    Args:
-        output_root (str): Output folder where a done file may exist
-        args_str (str): String representation of the arguments
-        object_names (List[str]): Names of objects to convert to MDS format
-    """
-    # Retrieve the done file contents
-    output_object_store = maybe_create_object_store_from_uri(output_root)
-    if output_object_store is not None:
-        # Download and read the done file from the remote object store
-        _, _, output_folder_prefix = parse_uri(output_root)
-        try:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                done_file = os.path.join(tmp_dir, DONE_FILENAME)
-                download_file(
-                    object_store=output_object_store,
-                    object_name=os.path.join(
-                        output_folder_prefix,
-                        DONE_FILENAME,
-                    ),
-                    output_filename=done_file,
-                )
-                with open(done_file) as df:
-                    done_file_contents = df.read().splitlines()
-        except FileNotFoundError:
-            return False
-    else:
-        # Read the local done file
-        done_file = os.path.join(output_root, DONE_FILENAME)
-        if not os.path.isfile(done_file):
-            return False
-        with open(done_file) as df:
-            done_file_contents = df.read().splitlines()
-    # Compare the arguments
-    prev_args_str = done_file_contents[0]
-    if prev_args_str != args_str:
-        return False
-
-    # Compare file names
-    prev_names = done_file_contents[1:]
-    if len(prev_names) != len(object_names):
-        return False
-    for idx, prev_name in enumerate(prev_names):
-        if object_names[idx] != prev_name:
-            return False
-    return True
-
-
-def write_done_file(folder: str, args_str: str, object_names: List[str]):
-    """Write a file to signify completion.
-
-    This the done file includes the arguments to processing and
-    a list of objects that were processed.
-
-    Args:
-        folder (str): Folder to write the done file to
-        args_str (str): String representation of arguments
-        object_names (List[str]): List of objects to convert to MDS format
-    """
-    with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file:
-        done_file.write('\n'.join([args_str] + object_names) + '\n')
-
-
-def convert_text_to_mds(
-    tokenizer_name: str,
-    output_folder: str,
-    input_folder: str,
-    concat_tokens: int,
-    eos_text: str,
-    bos_text: str,
-    no_wrap: bool,
-    compression: str,
-    processes: int,
-    args_str: str,
-    reprocess: bool,
-    trust_remote_code: bool,
-):
-    """Convert a folder of text files to MDS format.
-
-    Args:
-        tokenizer_name (str): Name of tokenizer to use
-        output_folder (str): Folder to write MDS shards to
-        input_folder (str): Folder of text files to process
-        concat_tokens (int): Concatenate up to this many tokens
-        eos_text (str): Text to append to each example to separate concatenated samples
-        bos_text (str): Text to prepend to each example to separate concatenated samples
-        no_wrap: (bool): Whether to let text examples wrap across multiple training examples
-        compression (str): The compression algorithm to use for MDS writing
-        processes (int): The number of processes to use.
-        args_str (str): String representation of the arguments
-        reprocess (bool): Whether to always reprocess the given folder of text files
-        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
-    """
-    is_remote_output = is_remote_path(output_folder)
-
-    object_names = get_object_names(input_folder)
-    if len(object_names) == 0:
-        raise InputFolderMissingDataError(input_folder)
-
-    # Check if the text files in the bucket have already been processed.
-    if not reprocess and is_already_processed(
-        output_folder,
-        args_str,
-        object_names,
-    ):
-        log.info(
-            f'Input folder {input_folder} is already processed at {output_folder} and '
-            +
-            'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.',
-        )
-        return
-
-    # Use a temporary local directory if the output is remote and there are more than 1 processes
-    local_output_folder = tempfile.TemporaryDirectory(
-    ).name if is_remote_output else output_folder
-
-    if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0:
-        raise OutputFolderNotEmptyError(output_folder)
-
-    if processes > 1:
-        # Download and convert the text files in parallel
-        args = get_task_args(
-            object_names,
-            local_output_folder,
-            input_folder,
-            processes,
-            tokenizer_name,
-            concat_tokens,
-            eos_text,
-            bos_text,
-            no_wrap,
-            compression,
-            trust_remote_code,
-        )
-        with ProcessPoolExecutor(max_workers=processes) as executor:
-            list(executor.map(download_and_convert_starargs, args))
-
-        # Merge the mds shards from each of the processes into a single folder
-        merge_shard_groups(local_output_folder)
-    else:
-        download_and_convert(
-            object_names,
-            local_output_folder,
-            input_folder,
-            tokenizer_name,
-            concat_tokens,
-            eos_text,
-            bos_text,
-            no_wrap,
-            compression,
-            trust_remote_code,
-        )
-
-    # Write a done file with the args and object names
-    write_done_file(local_output_folder, args_str, object_names)
-
-    if is_remote_output:
-        # Upload the local output to the remote location
-        output_object_store = cast(
-            ObjectStore,
-            maybe_create_object_store_from_uri(output_folder),
-        )
-        _, _, output_folder_prefix = parse_uri(output_folder)
-        files_to_upload = os.listdir(local_output_folder)
-
-        for file in files_to_upload:
-            assert not os.path.isdir(file)
-            remote_path = os.path.join(output_folder_prefix, file)
-            output_object_store.upload_object(
-                remote_path,
-                os.path.join(local_output_folder, file),
-            )
-
-
-def _args_str(original_args: Namespace) -> str:
-    """Create a string from the args to determine whether to reprocess.
-
-    Args:
-        original_args (Namespace): Arguments to main function.
-    """
-    # Take the arguments that influence the final result.
-    # reprocess and max_mds_writer_workers are not taken.
-    args = Namespace(
-        tokenizer_name=original_args.tokenizer,
-        output_folder=original_args.output_folder,
-        input_folder=original_args.input_folder,
-        concat_tokens=original_args.concat_tokens,
-        eos_text=original_args.eos_text,
-        bos_text=original_args.bos_text,
-        no_wrap=original_args.no_wrap,
-        compression=original_args.compression,
-        processes=original_args.processes,
-    )
-
-    return str(args)
-
-
-def _configure_logging(logging_level: str):
-    """Configure logging.
-
-    Args:
-        logging_level (str): Logging level.
-    """
-    logging.basicConfig(
-        format=
-        f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s',
-    )
-    logging_level = logging_level.upper()
-    logging.getLogger('llmfoundry').setLevel(logging_level)
-    logging.getLogger(__name__).setLevel(logging_level)
-    log.info(f'Logging level set to {logging_level}')
-
-
 if __name__ == '__main__':
     args = parse_args()
-    _configure_logging(args.logging_level)
-    convert_text_to_mds(
-        tokenizer_name=args.tokenizer,
-        output_folder=args.output_folder,
-        input_folder=args.input_folder,
-        concat_tokens=args.concat_tokens,
-        eos_text=args.eos_text,
-        bos_text=args.bos_text,
-        no_wrap=args.no_wrap,
-        compression=args.compression,
-        processes=args.processes,
-        reprocess=args.reprocess,
-        trust_remote_code=args.trust_remote_code,
-        args_str=_args_str(args),
-    )
+    convert_text_to_mds_from_args(args)
diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
index 8dac151f55..7cdaed51a8 100644
--- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -13,11 +13,7 @@
 from streaming import StreamingDataset
 from transformers import AutoTokenizer
 
-from llmfoundry.utils.exceptions import (
-    InputFolderMissingDataError,
-    OutputFolderNotEmptyError,
-)
-from scripts.data_prep.convert_text_to_mds import (
+from llmfoundry.data_prep import (
     DONE_FILENAME,
     convert_text_to_mds,
     download_and_convert,
@@ -25,6 +21,10 @@
     merge_shard_groups,
     write_done_file,
 )
+from llmfoundry.utils.exceptions import (
+    InputFolderMissingDataError,
+    OutputFolderNotEmptyError,
+)
 
 
 class MockObjectStore():
@@ -83,15 +83,15 @@ def _assert_files_exist(prefix: str, files: List[str]):
 @pytest.mark.parametrize('processes', [1, 2, 3])
 @patch.object(ProcessPoolExecutor, 'map', new=Mock(wraps=_mock_map))
 @patch(
-    'scripts.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri',
+    'llmfoundry.data_prep.maybe_create_object_store_from_uri',
 )
-@patch('scripts.data_prep.convert_text_to_mds.parse_uri')
+@patch('llmfoundry.data_prep.parse_uri')
 @patch(
-    'scripts.data_prep.convert_text_to_mds.download_and_convert',
+    'llmfoundry.data_prep.download_and_convert',
     wraps=download_and_convert,
 )
 @patch(
-    'scripts.data_prep.convert_text_to_mds.merge_shard_groups',
+    'llmfoundry.data_prep..merge_shard_groups',
     wraps=merge_shard_groups,
 )
 def test_single_and_multi_process(

From f7b0084eed3641cab8b9026167ee38d0a1ae11dd Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 11 Jul 2024 18:50:18 -0700
Subject: [PATCH 02/18] cli

---
 llmfoundry/cli/cli.py            | 69 ++++++++++++++++++++++++++++++++
 llmfoundry/data_prep/__init__.py | 15 ++++---
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index 8e86e76467..46c167b20d 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -1,11 +1,13 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+from argparse import Namespace
 from typing import Optional
 
 import typer
 
 from llmfoundry.cli import registry_cli
+from llmfoundry.data_prep import convert_text_to_mds_from_args
 from llmfoundry.train import train_from_yaml
 
 app = typer.Typer(pretty_exceptions_show_locals=False)
@@ -25,5 +27,72 @@ def train(
     train_from_yaml(yaml_path, args_list)
 
 
+@app.command(name='convert_text_to_mds')
+def convert_text_to_mds_cli(
+    output_folder: str = typer.
+    Option(..., help='The folder to write output to'),  # type: ignore
+    input_folder: str = typer.Option(
+        ..., help='The folder with text files to convert to MDS'
+    ),  # type: ignore
+    compression: str = typer.Option(
+        'zstd', help='The compression algorithm to use for MDS writing'
+    ),  # type: ignore
+    concat_tokens: int = typer.Option(
+        ...,
+        help='Convert text to tokens and concatenate up to this many tokens'
+    ),  # type: ignore
+    tokenizer: str = typer.Option(..., help='The name of the tokenizer to use'
+                                 ),  # type: ignore
+    bos_text: Optional[str] = typer.Option(
+        None,
+        help=
+        'The text to prepend to each example to separate concatenated examples'
+    ),  # type: ignore
+    eos_text: Optional[str] = typer.Option(
+        None,
+        help=
+        'The text to append to each example to separate concatenated examples'
+    ),  # type: ignore
+    use_tokenizer_eos: bool = typer.
+    Option(False, help='Use the EOS text from the tokenizer.'),  # type: ignore
+    no_wrap: bool = typer.Option(
+        False,
+        help='Whether to let text examples wrap across multiple training examples'
+    ),  # type: ignore
+    processes: int = typer.Option(
+        min(max(psutil.cpu_count() - 2, 1), 32),
+        help='The number of processes to use to download and convert the dataset'
+    ),  # type: ignore
+    reprocess: bool = typer.Option(
+        False,
+        help=
+        'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.'
+    ),  # type: ignore
+    trust_remote_code: bool = typer.Option(
+        False,
+        help='If true, allows custom code to be executed to load the tokenizer'
+    ),  # type: ignore
+    logging_level: str = typer.Option(
+        'INFO', help='Logging level for the script. Default is INFO.'
+    ),  # type: ignore
+):
+    args = Namespace(
+        output_folder=output_folder,
+        input_folder=input_folder,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        tokenizer=tokenizer,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        use_tokenizer_eos=use_tokenizer_eos,
+        no_wrap=no_wrap,
+        processes=processes,
+        reprocess=reprocess,
+        trust_remote_code=trust_remote_code,
+        logging_level=logging_level,
+    )
+    convert_text_to_mds_from_args(args)
+
+
 if __name__ == '__main__':
     app()
diff --git a/llmfoundry/data_prep/__init__.py b/llmfoundry/data_prep/__init__.py
index c959c0ddcd..55e17f9eaf 100644
--- a/llmfoundry/data_prep/__init__.py
+++ b/llmfoundry/data_prep/__init__.py
@@ -1,13 +1,16 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 from llmfoundry.data_prep.convert_text_to_mds import (
+    DONE_FILENAME,
     convert_text_to_mds,
     convert_text_to_mds_from_args,
-    maybe_create_object_store_from_uri,
-    parse_uri,
     download_and_convert,
-    merge_shard_groups,
     is_already_processed,
+    maybe_create_object_store_from_uri,
+    merge_shard_groups,
+    parse_uri,
     write_done_file,
-    DONE_FILENAME,
 )
 
 __all__ = [
@@ -19,5 +22,5 @@
     'merge_shard_groups',
     'is_already_processed',
     'write_done_file',
-    'DONE_FILENAME'
-]
\ No newline at end of file
+    'DONE_FILENAME',
+]

From 20d7eb5b83b710f5b02a0e1f8f8346c4206559a6 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 11 Jul 2024 18:50:49 -0700
Subject: [PATCH 03/18] ignore

---
 llmfoundry/cli/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index 46c167b20d..c2cb504fd8 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -60,7 +60,7 @@ def convert_text_to_mds_cli(
         help='Whether to let text examples wrap across multiple training examples'
     ),  # type: ignore
     processes: int = typer.Option(
-        min(max(psutil.cpu_count() - 2, 1), 32),
+        min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
         help='The number of processes to use to download and convert the dataset'
     ),  # type: ignore
     reprocess: bool = typer.Option(

From 53cd776f5fdc0a210f24c83f3baea796ba18bf9e Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 11 Jul 2024 18:58:43 -0700
Subject: [PATCH 04/18] import

---
 llmfoundry/cli/cli.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index c2cb504fd8..c4897d935d 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -4,6 +4,7 @@
 from argparse import Namespace
 from typing import Optional
 
+import psutil
 import typer
 
 from llmfoundry.cli import registry_cli
@@ -32,48 +33,48 @@ def convert_text_to_mds_cli(
     output_folder: str = typer.
     Option(..., help='The folder to write output to'),  # type: ignore
     input_folder: str = typer.Option(
-        ..., help='The folder with text files to convert to MDS'
+        ..., help='The folder with text files to convert to MDS',
     ),  # type: ignore
     compression: str = typer.Option(
-        'zstd', help='The compression algorithm to use for MDS writing'
+        'zstd', help='The compression algorithm to use for MDS writing',
     ),  # type: ignore
     concat_tokens: int = typer.Option(
         ...,
-        help='Convert text to tokens and concatenate up to this many tokens'
+        help='Convert text to tokens and concatenate up to this many tokens',
     ),  # type: ignore
-    tokenizer: str = typer.Option(..., help='The name of the tokenizer to use'
+    tokenizer: str = typer.Option(..., help='The name of the tokenizer to use',
                                  ),  # type: ignore
     bos_text: Optional[str] = typer.Option(
         None,
         help=
-        'The text to prepend to each example to separate concatenated examples'
+        'The text to prepend to each example to separate concatenated examples',
     ),  # type: ignore
     eos_text: Optional[str] = typer.Option(
         None,
         help=
-        'The text to append to each example to separate concatenated examples'
+        'The text to append to each example to separate concatenated examples',
     ),  # type: ignore
     use_tokenizer_eos: bool = typer.
     Option(False, help='Use the EOS text from the tokenizer.'),  # type: ignore
     no_wrap: bool = typer.Option(
         False,
-        help='Whether to let text examples wrap across multiple training examples'
+        help='Whether to let text examples wrap across multiple training examples',
     ),  # type: ignore
     processes: int = typer.Option(
         min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
-        help='The number of processes to use to download and convert the dataset'
+        help='The number of processes to use to download and convert the dataset',
     ),  # type: ignore
     reprocess: bool = typer.Option(
         False,
         help=
-        'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.'
+        'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
     ),  # type: ignore
     trust_remote_code: bool = typer.Option(
         False,
-        help='If true, allows custom code to be executed to load the tokenizer'
+        help='If true, allows custom code to be executed to load the tokenizer',
     ),  # type: ignore
     logging_level: str = typer.Option(
-        'INFO', help='Logging level for the script. Default is INFO.'
+        'INFO', help='Logging level for the script. Default is INFO.',
     ),  # type: ignore
 ):
     args = Namespace(

From 3710a94ab087e2d2fb89d7d5f538f928d67c148f Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Thu, 11 Jul 2024 19:06:44 -0700
Subject: [PATCH 05/18] naming

---
 llmfoundry/cli/cli.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index c4897d935d..c145737cf8 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -31,50 +31,57 @@ def train(
 @app.command(name='convert_text_to_mds')
 def convert_text_to_mds_cli(
     output_folder: str = typer.
-    Option(..., help='The folder to write output to'),  # type: ignore
+    Option(..., '--output_folder', help='The folder to write output to'),  # type: ignore
     input_folder: str = typer.Option(
-        ..., help='The folder with text files to convert to MDS',
+        ..., '--input_folder', help='The folder with text files to convert to MDS',
     ),  # type: ignore
     compression: str = typer.Option(
-        'zstd', help='The compression algorithm to use for MDS writing',
+        'zstd', '--compression', help='The compression algorithm to use for MDS writing',
     ),  # type: ignore
     concat_tokens: int = typer.Option(
         ...,
+        '--concat_tokens',
         help='Convert text to tokens and concatenate up to this many tokens',
     ),  # type: ignore
-    tokenizer: str = typer.Option(..., help='The name of the tokenizer to use',
+    tokenizer: str = typer.Option(..., '--tokenizer', help='The name of the tokenizer to use',
                                  ),  # type: ignore
     bos_text: Optional[str] = typer.Option(
         None,
+        '--bos_text',
         help=
         'The text to prepend to each example to separate concatenated examples',
     ),  # type: ignore
     eos_text: Optional[str] = typer.Option(
         None,
+        '--eos_text',
         help=
         'The text to append to each example to separate concatenated examples',
     ),  # type: ignore
     use_tokenizer_eos: bool = typer.
-    Option(False, help='Use the EOS text from the tokenizer.'),  # type: ignore
+    Option(False, '--use_tokenizer_eos', help='Use the EOS text from the tokenizer.'),  # type: ignore
     no_wrap: bool = typer.Option(
         False,
+        '--no_wrap',
         help='Whether to let text examples wrap across multiple training examples',
     ),  # type: ignore
     processes: int = typer.Option(
         min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
+        '--processes',
         help='The number of processes to use to download and convert the dataset',
     ),  # type: ignore
     reprocess: bool = typer.Option(
         False,
+        '--reprocess',
         help=
         'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
     ),  # type: ignore
     trust_remote_code: bool = typer.Option(
         False,
+        '--trust_remote_code',
         help='If true, allows custom code to be executed to load the tokenizer',
     ),  # type: ignore
     logging_level: str = typer.Option(
-        'INFO', help='Logging level for the script. Default is INFO.',
+        'INFO', '--logging_level', help='Logging level for the script. Default is INFO.',
     ),  # type: ignore
 ):
     args = Namespace(

From 5c5c7e639e873c983b971820e73335f9cc12a42b Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Fri, 12 Jul 2024 10:07:05 -0700
Subject: [PATCH 06/18] typo

---
 tests/a_scripts/data_prep/test_convert_text_to_mds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
index 7cdaed51a8..4817144ee6 100644
--- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -91,7 +91,7 @@ def _assert_files_exist(prefix: str, files: List[str]):
     wraps=download_and_convert,
 )
 @patch(
-    'llmfoundry.data_prep..merge_shard_groups',
+    'llmfoundry.data_prep.merge_shard_groups',
     wraps=merge_shard_groups,
 )
 def test_single_and_multi_process(

From 9dfe42c4959d22e09b8277ec95f43ca243a971e5 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Fri, 12 Jul 2024 12:10:54 -0700
Subject: [PATCH 07/18] test

---
 tests/a_scripts/data_prep/test_convert_text_to_mds.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
index 4817144ee6..c94477042d 100644
--- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -13,7 +13,7 @@
 from streaming import StreamingDataset
 from transformers import AutoTokenizer
 
-from llmfoundry.data_prep import (
+from llmfoundry.data_prep.convert_text_to_mds import (
     DONE_FILENAME,
     convert_text_to_mds,
     download_and_convert,
@@ -83,15 +83,15 @@ def _assert_files_exist(prefix: str, files: List[str]):
 @pytest.mark.parametrize('processes', [1, 2, 3])
 @patch.object(ProcessPoolExecutor, 'map', new=Mock(wraps=_mock_map))
 @patch(
-    'llmfoundry.data_prep.maybe_create_object_store_from_uri',
+    'llmfoundry.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri',
 )
-@patch('llmfoundry.data_prep.parse_uri')
+@patch('llmfoundry.data_prep.convert_text_to_mds.parse_uri')
 @patch(
-    'llmfoundry.data_prep.download_and_convert',
+    'llmfoundry.data_prep.convert_text_to_mds.download_and_convert',
     wraps=download_and_convert,
 )
 @patch(
-    'llmfoundry.data_prep.merge_shard_groups',
+    'llmfoundry.data_prep.convert_text_to_mds.merge_shard_groups',
     wraps=merge_shard_groups,
 )
 def test_single_and_multi_process(

From e4e2b80c73e03aae657db7cfc2ea788ccc06e10e Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Fri, 12 Jul 2024 23:34:46 -0700
Subject: [PATCH 08/18] commit comments 1

---
 llmfoundry/cli/cli.py                       | 92 +++++++--------------
 llmfoundry/data_prep/convert_text_to_mds.py | 77 +++++++++++------
 2 files changed, 84 insertions(+), 85 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index c145737cf8..56a3df2204 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -1,28 +1,29 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from argparse import Namespace
-from typing import Optional
+from typing import Annotated, Optional
 
 import psutil
-import typer
+from typer import Argument, Option, Typer
 
 from llmfoundry.cli import registry_cli
 from llmfoundry.data_prep import convert_text_to_mds_from_args
 from llmfoundry.train import train_from_yaml
 
-app = typer.Typer(pretty_exceptions_show_locals=False)
+app = Typer(pretty_exceptions_show_locals=False)
 app.add_typer(registry_cli.app, name='registry')
 
 
 @app.command(name='train')
 def train(
-    yaml_path: str = typer.Argument(
-        ...,
-        help='Path to the YAML configuration file',
-    ),  # type: ignore
-    args_list: Optional[list[str]] = typer.
-    Argument(None, help='Additional command line arguments'),  # type: ignore
+    yaml_path: Annotated[str,
+                         Argument(
+                             ...,
+                             help='Path to the YAML configuration file',
+                         )],
+    args_list: Annotated[
+        Optional[list[str]],
+        Argument(help='Additional command line arguments')] = None,
 ):
     """Run the training with optional overrides from CLI."""
     train_from_yaml(yaml_path, args_list)
@@ -30,61 +31,31 @@ def train(
 
 @app.command(name='convert_text_to_mds')
 def convert_text_to_mds_cli(
-    output_folder: str = typer.
-    Option(..., '--output_folder', help='The folder to write output to'),  # type: ignore
-    input_folder: str = typer.Option(
-        ..., '--input_folder', help='The folder with text files to convert to MDS',
-    ),  # type: ignore
-    compression: str = typer.Option(
-        'zstd', '--compression', help='The compression algorithm to use for MDS writing',
-    ),  # type: ignore
-    concat_tokens: int = typer.Option(
-        ...,
-        '--concat_tokens',
-        help='Convert text to tokens and concatenate up to this many tokens',
-    ),  # type: ignore
-    tokenizer: str = typer.Option(..., '--tokenizer', help='The name of the tokenizer to use',
-                                 ),  # type: ignore
-    bos_text: Optional[str] = typer.Option(
-        None,
-        '--bos_text',
-        help=
-        'The text to prepend to each example to separate concatenated examples',
-    ),  # type: ignore
-    eos_text: Optional[str] = typer.Option(
-        None,
-        '--eos_text',
-        help=
-        'The text to append to each example to separate concatenated examples',
-    ),  # type: ignore
-    use_tokenizer_eos: bool = typer.
-    Option(False, '--use_tokenizer_eos', help='Use the EOS text from the tokenizer.'),  # type: ignore
-    no_wrap: bool = typer.Option(
-        False,
-        '--no_wrap',
-        help='Whether to let text examples wrap across multiple training examples',
-    ),  # type: ignore
-    processes: int = typer.Option(
-        min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
-        '--processes',
+    output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')],
+    input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')],
+    concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')],
+    tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')],
+    bos_text: Annotated[str, Option(help='The text to prepend to each example to separate concatenated examples')] = None,
+    eos_text: Annotated[str, Option(help='The text to append to each example to separate concatenated examples')] = None,
+    compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd',
+    use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False,
+    no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False,
+    processes: Annotated[int, Option(
         help='The number of processes to use to download and convert the dataset',
-    ),  # type: ignore
-    reprocess: bool = typer.Option(
-        False,
-        '--reprocess',
+    )] = min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
+    reprocess: Annotated[bool, Option(
         help=
         'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
-    ),  # type: ignore
-    trust_remote_code: bool = typer.Option(
-        False,
-        '--trust_remote_code',
+    )] = False,
+    trust_remote_code: Annotated[bool, Option(
         help='If true, allows custom code to be executed to load the tokenizer',
-    ),  # type: ignore
-    logging_level: str = typer.Option(
-        'INFO', '--logging_level', help='Logging level for the script. Default is INFO.',
-    ),  # type: ignore
+    )] = False,
+    logging_level: Annotated[str, Option(
+        help='Logging level for the script. Default is INFO.',
+    )] = 'INFO',
+
 ):
-    args = Namespace(
+    convert_text_to_mds_from_args(
         output_folder=output_folder,
         input_folder=input_folder,
         compression=compression,
@@ -99,7 +70,6 @@ def convert_text_to_mds_cli(
         trust_remote_code=trust_remote_code,
         logging_level=logging_level,
     )
-    convert_text_to_mds_from_args(args)
 
 
 if __name__ == '__main__':
diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index ce6a8f7f70..5a3a16443a 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -9,7 +9,7 @@
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from glob import glob
-from typing import Dict, Iterable, List, Tuple, cast
+from typing import Dict, Iterable, List, Optional, Tuple, cast
 
 import numpy as np
 import psutil
@@ -582,36 +582,65 @@ def _configure_logging(logging_level: str):
     log.info(f'Logging level set to {logging_level}')
 
 
-def convert_text_to_mds_from_args(args: Namespace) -> None:
-    if args.use_tokenizer_eos:
+def convert_text_to_mds_from_args(
+    output_folder: str,
+    input_folder: str,
+    compression: Optional[str],
+    concat_tokens: int,
+    tokenizer: str,
+    bos_text: Optional[str],
+    eos_text: Optional[str],
+    use_tokenizer_eos: bool,
+    no_wrap: bool,
+    processes: int,
+    reprocess: bool,
+    trust_remote_code: bool,
+    logging_level: str,
+) -> None:
+    if use_tokenizer_eos:
         # Ensure that eos text is not specified twice.
-        if args.eos_text is not None:
-            args.error(
+        if eos_text is not None:
+            ValueError(
                 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.',
             )
         tokenizer = AutoTokenizer.from_pretrained(
-            args.tokenizer,
-            trust_remote_code=args.trust_remote_code,
+            tokenizer,
+            trust_remote_code=trust_remote_code,
         )
-        args.eos_text = tokenizer.eos_token
+        eos_text = tokenizer.eos_token
 
     # now that we have validated them, change BOS/EOS to strings
-    if args.bos_text is None:
-        args.bos_text = ''
-    if args.eos_text is None:
-        args.eos_text = ''
-    _configure_logging(args.logging_level)
+    if bos_text is None:
+        bos_text = ''
+    if eos_text is None:
+        eos_text = ''
+    _configure_logging(logging_level)
+
+    # Define args for _args_str
+    args = Namespace(
+        tokenizer=tokenizer,
+        output_folder=output_folder,
+        input_folder=input_folder,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        eos_text=eos_text,
+        bos_text=bos_text,
+        no_wrap=no_wrap,
+        processes=processes,
+        reprocess=reprocess,
+        trust_remote_code=trust_remote_code,
+    )
     convert_text_to_mds(
-        tokenizer_name=args.tokenizer,
-        output_folder=args.output_folder,
-        input_folder=args.input_folder,
-        concat_tokens=args.concat_tokens,
-        eos_text=args.eos_text,
-        bos_text=args.bos_text,
-        no_wrap=args.no_wrap,
-        compression=args.compression,
-        processes=args.processes,
-        reprocess=args.reprocess,
-        trust_remote_code=args.trust_remote_code,
+        tokenizer_name=tokenizer,
+        output_folder=output_folder,
+        input_folder=input_folder,
+        concat_tokens=concat_tokens,
+        eos_text=eos_text,
+        bos_text=bos_text,
+        no_wrap=no_wrap,
+        compression=compression,
+        processes=processes,
+        reprocess=reprocess,
+        trust_remote_code=trust_remote_code,
         args_str=_args_str(args),
     )

From 1873c2fd9d4fc58091daac2c79667c0aad77411b Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Sat, 13 Jul 2024 00:24:46 -0700
Subject: [PATCH 09/18] precommit

---
 llmfoundry/cli/cli.py                       |  6 +++---
 llmfoundry/data_prep/convert_text_to_mds.py |  6 +++---
 scripts/data_prep/convert_text_to_mds.py    | 16 +++++++++++++++-
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index 56a3df2204..0837d51028 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -30,13 +30,13 @@ def train(
 
 
 @app.command(name='convert_text_to_mds')
-def convert_text_to_mds_cli(
+def convert_text_to_mds(
     output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')],
     input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')],
     concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')],
     tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')],
-    bos_text: Annotated[str, Option(help='The text to prepend to each example to separate concatenated examples')] = None,
-    eos_text: Annotated[str, Option(help='The text to append to each example to separate concatenated examples')] = None,
+    bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None,
+    eos_text: Annotated[Optional[str], Option(help='The text to append to each example to separate concatenated examples')] = None,
     compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd',
     use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False,
     no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False,
diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index 5a3a16443a..35bf9810bc 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -585,7 +585,7 @@ def _configure_logging(logging_level: str):
 def convert_text_to_mds_from_args(
     output_folder: str,
     input_folder: str,
-    compression: Optional[str],
+    compression: str,
     concat_tokens: int,
     tokenizer: str,
     bos_text: Optional[str],
@@ -603,11 +603,11 @@ def convert_text_to_mds_from_args(
             ValueError(
                 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.',
             )
-        tokenizer = AutoTokenizer.from_pretrained(
+        built_tokenizer = AutoTokenizer.from_pretrained(
             tokenizer,
             trust_remote_code=trust_remote_code,
         )
-        eos_text = tokenizer.eos_token
+        eos_text = built_tokenizer.eos_token
 
     # now that we have validated them, change BOS/EOS to strings
     if bos_text is None:
diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index f0ab28a90f..5d1a2b3da8 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -118,4 +118,18 @@ def parse_args() -> Namespace:
 
 if __name__ == '__main__':
     args = parse_args()
-    convert_text_to_mds_from_args(args)
+    convert_text_to_mds_from_args(
+        output_folder=args.output_folder,
+        input_folder=args.input_folder,
+        compression=args.compression,
+        concat_tokens=args.concat_tokens,
+        tokenizer=args.tokenizer,
+        bos_text=args.bos_text,
+        eos_text=args.eos_text,
+        use_tokenizer_eos=args.use_tokenizer_eos,
+        no_wrap=args.no_wrap,
+        processes=args.processes,
+        reprocess=args.reprocess,
+        trust_remote_code=args.trust_remote_code,
+        logging_level=args.logging,
+    )

From d063da7c65eef1dde060d199c61c702c777ee967 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Sat, 13 Jul 2024 03:37:23 -0700
Subject: [PATCH 10/18] typo

---
 scripts/data_prep/convert_text_to_mds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index 5d1a2b3da8..52152207df 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -131,5 +131,5 @@ def parse_args() -> Namespace:
         processes=args.processes,
         reprocess=args.reprocess,
         trust_remote_code=args.trust_remote_code,
-        logging_level=args.logging,
+        logging_level=args.logging_level,
     )

From 3cfeeabd73ad93fb1b99d26a90911090f81d7e15 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Sat, 13 Jul 2024 03:41:46 -0700
Subject: [PATCH 11/18] typo

---
 llmfoundry/cli/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index 0837d51028..4dc1f05c7d 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -31,8 +31,8 @@ def train(
 
 @app.command(name='convert_text_to_mds')
 def convert_text_to_mds(
-    output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')],
-    input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')],
+    output_folder: Annotated[str, Option(..., help='The folder to write output to')],
+    input_folder: Annotated[str, Option(..., help='The folder with text files to convert to MDS')],
     concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')],
     tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')],
     bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None,

From f4022bcd234485c5f550853f4ceb8cec3a56d257 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Sat, 13 Jul 2024 17:38:48 -0700
Subject: [PATCH 12/18] arg_str

---
 llmfoundry/data_prep/convert_text_to_mds.py | 51 ++++++---------------
 1 file changed, 14 insertions(+), 37 deletions(-)

diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index 35bf9810bc..3814787f25 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -543,29 +543,6 @@ def convert_text_to_mds(
             )
 
 
-def _args_str(original_args: Namespace) -> str:
-    """Create a string from the args to determine whether to reprocess.
-
-    Args:
-        original_args (Namespace): Arguments to main function.
-    """
-    # Take the arguments that influence the final result.
-    # reprocess and max_mds_writer_workers are not taken.
-    args = Namespace(
-        tokenizer_name=original_args.tokenizer,
-        output_folder=original_args.output_folder,
-        input_folder=original_args.input_folder,
-        concat_tokens=original_args.concat_tokens,
-        eos_text=original_args.eos_text,
-        bos_text=original_args.bos_text,
-        no_wrap=original_args.no_wrap,
-        compression=original_args.compression,
-        processes=original_args.processes,
-    )
-
-    return str(args)
-
-
 def _configure_logging(logging_level: str):
     """Configure logging.
 
@@ -617,19 +594,19 @@ def convert_text_to_mds_from_args(
     _configure_logging(logging_level)
 
     # Define args for _args_str
-    args = Namespace(
-        tokenizer=tokenizer,
-        output_folder=output_folder,
-        input_folder=input_folder,
-        compression=compression,
-        concat_tokens=concat_tokens,
-        eos_text=eos_text,
-        bos_text=bos_text,
-        no_wrap=no_wrap,
-        processes=processes,
-        reprocess=reprocess,
-        trust_remote_code=trust_remote_code,
-    )
+    args = {
+        'tokenizer': tokenizer,
+        'output_folder': output_folder,
+        'input_folder': input_folder,
+        'compression': compression,
+        'concat_tokens': concat_tokens,
+        'eos_text': eos_text,
+        'bos_text': bos_text,
+        'no_wrap': no_wrap,
+        'processes': processes,
+        'reprocess': reprocess,
+        'trust_remote_code': trust_remote_code,
+    }
     convert_text_to_mds(
         tokenizer_name=tokenizer,
         output_folder=output_folder,
@@ -642,5 +619,5 @@ def convert_text_to_mds_from_args(
         processes=processes,
         reprocess=reprocess,
         trust_remote_code=trust_remote_code,
-        args_str=_args_str(args),
+        args_str=str(args),
     )

From 9b1cc6f5635c0714f6fe98e5e5352f7610f58a16 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Mon, 15 Jul 2024 11:59:12 -0700
Subject: [PATCH 13/18] annotation + help

---
 llmfoundry/cli/cli.py                       |  1 +
 llmfoundry/data_prep/convert_text_to_mds.py | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
index 4dc1f05c7d..5f5e37c5a9 100644
--- a/llmfoundry/cli/cli.py
+++ b/llmfoundry/cli/cli.py
@@ -55,6 +55,7 @@ def convert_text_to_mds(
     )] = 'INFO',
 
 ):
+    """Convert text files to MDS streaming format."""
     convert_text_to_mds_from_args(
         output_folder=output_folder,
         input_folder=input_folder,
diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index 3814787f25..0ee03a0d93 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -574,6 +574,26 @@ def convert_text_to_mds_from_args(
     trust_remote_code: bool,
     logging_level: str,
 ) -> None:
+    """A wrapper for `convert_finetuning_dataset`
+
+    Args:
+        output_folder (str): The folder to write output to
+        input_folder (str): The folder with text files to convert to MDS
+        compression (str): The compression algorithm to use for MDS writing
+        concat_tokens (int): Convert text to tokens and concatenate up to this many tokens
+        tokenizer (str): The name of the tokenizer to use
+        bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples
+        eos_text (Optional[str]): The text to append to each example to separate concatenated examples
+        use_tokenizer_eos (bool): Use the EOS text from the tokenizer
+        no_wrap (bool): Whether to let text examples wrap across multiple training examples
+        processes (int): The number of processes to use to download and convert the dataset
+        reprocess (bool): If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+        logging_level (str): Logging level for the script. Default is INFO.
+
+    Raises:
+        ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
+    """
     if use_tokenizer_eos:
         # Ensure that eos text is not specified twice.
         if eos_text is not None:

From 3a14cb65f8db1dd5101ce008f59495465c15dcdb Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Mon, 15 Jul 2024 12:06:42 -0700
Subject: [PATCH 14/18] update annotation

---
 llmfoundry/data_prep/convert_text_to_mds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index 0ee03a0d93..ba818fd83b 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -574,7 +574,7 @@ def convert_text_to_mds_from_args(
     trust_remote_code: bool,
     logging_level: str,
 ) -> None:
-    """A wrapper for `convert_finetuning_dataset`
+    """A wrapper for `convert_text_to_mds`
 
     Args:
         output_folder (str): The folder to write output to

From ec73674a139091c39e8e52be7ace38e898e50f92 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Mon, 15 Jul 2024 12:12:48 -0700
Subject: [PATCH 15/18] typo

---
 llmfoundry/data_prep/convert_text_to_mds.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py
index ba818fd83b..b9e48139f7 100644
--- a/llmfoundry/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/data_prep/convert_text_to_mds.py
@@ -574,13 +574,13 @@ def convert_text_to_mds_from_args(
     trust_remote_code: bool,
     logging_level: str,
 ) -> None:
-    """A wrapper for `convert_text_to_mds`
+    """A wrapper for `convert_text_to_mds` to parse arguments.
 
     Args:
-        output_folder (str): The folder to write output to
-        input_folder (str): The folder with text files to convert to MDS
+        output_folder (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
         compression (str): The compression algorithm to use for MDS writing
-        concat_tokens (int): Convert text to tokens and concatenate up to this many tokens
+        concat_tokens (int): Concatenate up to this many tokens
         tokenizer (str): The name of the tokenizer to use
         bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples
         eos_text (Optional[str]): The text to append to each example to separate concatenated examples

From b0dcceec2df12b78dcd4ad5db6fa307bcb8d8987 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Wed, 17 Jul 2024 13:35:08 -0700
Subject: [PATCH 16/18] precommit

---
 llmfoundry/command_utils/data_prep/convert_text_to_mds.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 36b88256f9..5c6256e05e 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import math
 import os
@@ -5,7 +8,7 @@
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from glob import glob
-from typing import Dict, Iterable, List, Tuple, cast, Optional
+from typing import Dict, Iterable, List, Optional, Tuple, cast
 
 import numpy as np
 from composer.utils import (

From 5b4adf491dd0cbbaf9f73d6e99d53bdf0ff4287d Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Wed, 17 Jul 2024 14:07:30 -0700
Subject: [PATCH 17/18] precommit

---
 llmfoundry/cli/data_prep_cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py
index b1fa8346ed..befded7278 100644
--- a/llmfoundry/cli/data_prep_cli.py
+++ b/llmfoundry/cli/data_prep_cli.py
@@ -3,6 +3,7 @@
 
 from typing import Annotated, Optional
 
+import psutil
 from typer import Option, Typer
 
 from llmfoundry.command_utils import (

From 61bb594c0705f230f08c96bb915b2f430864edf3 Mon Sep 17 00:00:00 2001
From: v-chen_data <v-chen_data@example.com>
Date: Wed, 17 Jul 2024 17:30:43 -0700
Subject: [PATCH 18/18] pr comments

---
 llmfoundry/cli/data_prep_cli.py                    |  2 +-
 llmfoundry/command_utils/__init__.py               | 14 --------------
 .../command_utils/data_prep/convert_text_to_mds.py | 14 +++++++-------
 scripts/data_prep/convert_text_to_mds.py           |  2 +-
 4 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py
index befded7278..9cb7b0d240 100644
--- a/llmfoundry/cli/data_prep_cli.py
+++ b/llmfoundry/cli/data_prep_cli.py
@@ -95,7 +95,7 @@ def convert_text_to_mds(
         input_folder=input_folder,
         compression=compression,
         concat_tokens=concat_tokens,
-        tokenizer=tokenizer,
+        tokenizer_name=tokenizer,
         bos_text=bos_text,
         eos_text=eos_text,
         use_tokenizer_eos=use_tokenizer_eos,
diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
index fd9b866c1e..bdd9ad24f4 100644
--- a/llmfoundry/command_utils/__init__.py
+++ b/llmfoundry/command_utils/__init__.py
@@ -5,15 +5,8 @@
     convert_dataset_hf_from_args,
 )
 from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
-    DONE_FILENAME,
     convert_text_to_mds,
     convert_text_to_mds_from_args,
-    download_and_convert,
-    is_already_processed,
-    maybe_create_object_store_from_uri,
-    merge_shard_groups,
-    parse_uri,
-    write_done_file,
 )
 from llmfoundry.command_utils.eval import (
     eval_from_yaml,
@@ -39,11 +32,4 @@
     'convert_dataset_hf_from_args',
     'convert_text_to_mds',
     'convert_text_to_mds_from_args',
-    'maybe_create_object_store_from_uri',
-    'parse_uri',
-    'download_and_convert',
-    'merge_shard_groups',
-    'is_already_processed',
-    'write_done_file',
-    'DONE_FILENAME',
 ]
diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 5c6256e05e..14afe279fd 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -503,7 +503,7 @@ def convert_text_to_mds_from_args(
     input_folder: str,
     compression: str,
     concat_tokens: int,
-    tokenizer: str,
+    tokenizer_name: str,
     bos_text: Optional[str],
     eos_text: Optional[str],
     use_tokenizer_eos: bool,
@@ -520,7 +520,7 @@ def convert_text_to_mds_from_args(
         input_folder (str): Folder of text files to process
         compression (str): The compression algorithm to use for MDS writing
         concat_tokens (int): Concatenate up to this many tokens
-        tokenizer (str): The name of the tokenizer to use
+        tokenizer_name (str): The name of the tokenizer to use
         bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples
         eos_text (Optional[str]): The text to append to each example to separate concatenated examples
         use_tokenizer_eos (bool): Use the EOS text from the tokenizer
@@ -539,11 +539,11 @@ def convert_text_to_mds_from_args(
             ValueError(
                 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.',
             )
-        built_tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer,
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
             trust_remote_code=trust_remote_code,
         )
-        eos_text = built_tokenizer.eos_token
+        eos_text = tokenizer.eos_token
 
     # now that we have validated them, change BOS/EOS to strings
     if bos_text is None:
@@ -554,7 +554,7 @@ def convert_text_to_mds_from_args(
 
     # Define args for _args_str
     args = {
-        'tokenizer': tokenizer,
+        'tokenizer': tokenizer_name,
         'output_folder': output_folder,
         'input_folder': input_folder,
         'compression': compression,
@@ -567,7 +567,7 @@ def convert_text_to_mds_from_args(
         'trust_remote_code': trust_remote_code,
     }
     convert_text_to_mds(
-        tokenizer_name=tokenizer,
+        tokenizer_name=tokenizer_name,
         output_folder=output_folder,
         input_folder=input_folder,
         concat_tokens=concat_tokens,
diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index b820d38b7d..c808fa871f 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -123,7 +123,7 @@ def parse_args() -> Namespace:
         input_folder=args.input_folder,
         compression=args.compression,
         concat_tokens=args.concat_tokens,
-        tokenizer=args.tokenizer,
+        tokenizer_name=args.tokenizer,
         bos_text=args.bos_text,
         eos_text=args.eos_text,
         use_tokenizer_eos=args.use_tokenizer_eos,