From cd0eaca555f4236d32cb314a861142e020f61279 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Thu, 11 Jul 2024 18:06:28 -0700 Subject: [PATCH 01/18] cli --- llmfoundry/data_prep/__init__.py | 23 + llmfoundry/data_prep/convert_text_to_mds.py | 617 ++++++++++++++++++ scripts/data_prep/convert_text_to_mds.py | 503 +------------- .../data_prep/test_convert_text_to_mds.py | 18 +- 4 files changed, 651 insertions(+), 510 deletions(-) create mode 100644 llmfoundry/data_prep/__init__.py create mode 100644 llmfoundry/data_prep/convert_text_to_mds.py diff --git a/llmfoundry/data_prep/__init__.py b/llmfoundry/data_prep/__init__.py new file mode 100644 index 0000000000..c959c0ddcd --- /dev/null +++ b/llmfoundry/data_prep/__init__.py @@ -0,0 +1,23 @@ +from llmfoundry.data_prep.convert_text_to_mds import ( + convert_text_to_mds, + convert_text_to_mds_from_args, + maybe_create_object_store_from_uri, + parse_uri, + download_and_convert, + merge_shard_groups, + is_already_processed, + write_done_file, + DONE_FILENAME, +) + +__all__ = [ + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', + 'maybe_create_object_store_from_uri', + 'parse_uri', + 'download_and_convert', + 'merge_shard_groups', + 'is_already_processed', + 'write_done_file', + 'DONE_FILENAME' +] \ No newline at end of file diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py new file mode 100644 index 0000000000..ce6a8f7f70 --- /dev/null +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -0,0 +1,617 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import math +import os +import tempfile +from argparse import ArgumentParser, Namespace +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from glob import glob +from typing import Dict, Iterable, List, Tuple, cast + +import numpy as np +import psutil +from composer.utils import ( + ObjectStore, + 
maybe_create_object_store_from_uri, + parse_uri, +) +from numpy.typing import NDArray +from streaming import MDSWriter +from tqdm import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from llmfoundry.data.data import AbstractConcatTokensDataset +from llmfoundry.utils.data_prep_utils import ( + DownloadingIterable, + download_file, + merge_shard_groups, +) +from llmfoundry.utils.exceptions import ( + InputFolderMissingDataError, + OutputFolderNotEmptyError, +) + +log = logging.getLogger(__name__) + +DONE_FILENAME = '.text_to_mds_conversion_done' + + +class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): + """An IterableDataset that returns token samples for MDSWriter from files. + + Returns dicts of {'tokens': ndarray:int32} + + Each file is considered a sequence. + """ + + def __init__( + self, + files: Iterable[str], + tokenizer: PreTrainedTokenizerBase, + max_length: int, + bos_text: str, + eos_text: str, + no_wrap: bool, + ): + self.files = files + super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) + + def __iter__(self) -> Iterable[Dict[str, NDArray]]: + + buffer = [] + for file in self.files: + with open(file, 'r') as f: + buffer += self.bos_tokens + first_chunk = True + # Read the file in 1MB chunks to avoid memory issues + for chunk in iter(partial(f.read, 1000000), ''): + # Tokenize the chunk + encoded = self.tokenizer( + chunk, + truncation=False, + padding=False, + ) + iids = encoded['input_ids'] + + # If this is not the first chunk, remove the BOS token + if not first_chunk: + if iids[0] == self.tokenizer.bos_token_id: + iids = iids[1:] + + # Add the tokens to the buffer + buffer += iids + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self. + max_length:] if self.should_wrap else [] + yield { + 'tokens': np.asarray(concat_sample, dtype=np.int32), + } + + first_chunk = False + + # Add the EOS token to the buffer to separate files. 
+ buffer += self.eos_tokens + + # Yield any remaining samples of size max_length. + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self.max_length:] if self.should_wrap else [] + yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} + + +def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Convert text files into MDS format, optionally concatenating and tokenizing', + ) + parser.add_argument( + '--output_folder', + type=str, + required=True, + help='The folder to write output to', + ) + parser.add_argument( + '--input_folder', + type=str, + required=True, + help='The folder with text files to convert to mds', + ) + parser.add_argument( + '--compression', + type=str, + default='zstd', + required=False, + help='The compression algorithm to use for MDS writing', + ) + + parser.add_argument( + '--concat_tokens', + type=int, + required=True, + help='Convert text to tokens and concatenate up to this many tokens', + ) + + parser.add_argument( + '--tokenizer', + type=str, + required=True, + help='The name of the tokenizer to use', + ) + parser.add_argument( + '--bos_text', + type=str, + required=False, + default=None, + help= + 'The text to prepend to each example to separate concatenated examples', + ) + parser.add_argument( + '--eos_text', + type=str, + required=False, + default=None, + help= + 'The text to append to each example to separate concatenated examples', + ) + parser.add_argument( + '--use_tokenizer_eos', + required=False, + action='store_true', + default=False, + help='Use the EOS text from the tokenizer.', + ) + parser.add_argument( + '--no_wrap', + default=False, + action='store_true', + help= + 'Whether to let text examples wrap across multiple training examples', + ) + parser.add_argument( + '--processes', + type=int, + required=False, + default=min(max(psutil.cpu_count() - 2, 1), 32), + help= + 'The number of processes to use to download 
and convert the dataset', + ) + parser.add_argument( + '--reprocess', + type=bool, + required=False, + default=False, + help='If true, reprocess the input_folder to mds format. Otherwise, ' + + 'only reprocess upon changes to the input folder or dataset creation parameters.', + ) + parser.add_argument( + '--trust-remote-code', + type=bool, + required=False, + default=False, + help='If true, allows custom code to be executed to load the tokenizer', + ) + parser.add_argument( + '--logging-level', + type=str, + required=False, + default='INFO', + help='Logging level for the script. Default is INFO.', + ) + parsed = parser.parse_args() + return parsed + + +def get_object_names(input_folder: str) -> List[str]: + """Get object names from a local or remote folder. + + Args: + input_folder (str): local or remote folder path. + """ + object_store = maybe_create_object_store_from_uri(input_folder) + if object_store is not None: + _, _, folder_prefix = parse_uri(input_folder) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.txt')) + ] + # return names, sizes + log.info(f'Found {len(names)} text files at {input_folder}') + + return names + + +def get_task_args( + object_names: List[str], + output_root: str, + input_folder: str, + n_groups: int, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + trust_remote_code: bool, +) -> Iterable: + """Get download_and_convert arguments split across n_groups. + + Each group handles a portion of object_names. 
+ + Args: + object_names (List[str]): Names of objects to process + output_root (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + n_groups (int): Number of groups to split the object names into + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + """ + num_objects = len(object_names) + objs_per_group = math.ceil(num_objects / n_groups) + for group, i in enumerate(range(0, num_objects, objs_per_group)): + output_subdir = os.path.join(output_root, str(group)) + yield ( + object_names[i:min(i + objs_per_group, num_objects)], + output_subdir, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + + +def download_and_convert_starargs(args: Tuple): + """Helper function to call download_and_convert with star args. + + This helps us use download_and_convert with multiprocessing. + """ + return download_and_convert(*args) + + +def download_and_convert( + file_names: List[str], + output_folder: str, + input_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + trust_remote_code: bool, +): + """Downloads and converts text files to MDS format. 
+ + Args: + file_names (List[str]): Files to process + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + """ + object_store = maybe_create_object_store_from_uri(input_folder) + + # Download file_names + with tempfile.TemporaryDirectory() as tmp_dir: + downloading_iter = DownloadingIterable( + object_names=file_names, + output_folder=tmp_dir, + object_store=object_store, + ) + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + trust_remote_code=trust_remote_code, + ) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + + # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up + # to the maximum sequence length + dataset = ConcatTokensFromFilesDataset( + files=downloading_iter, + max_length=concat_tokens, + tokenizer=tokenizer, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + ) + + columns = {'tokens': 'ndarray:int32'} + + log.info('Converting to MDS format...') + with MDSWriter( + out=output_folder, + columns=columns, + compression=compression, + ) as out: + for sample in tqdm(dataset): + out.write(sample) + + +def is_remote_path(path: str) -> bool: + """Checks whether a path is a remote path. 
+ + Args: + path (str): path to check + """ + backend, _, _ = parse_uri(path) + return backend != '' + + +def is_already_processed( + output_root: str, + args_str: str, + object_names: List[str], +) -> bool: + """Determines whether a group of text files has already been processed. + + Checks the done fie at output root to determine this. + + Args: + output_root (str): Output folder where a done file may exist + args_str (str): String representation of the arguments + object_names (List[str]): Names of objects to convert to MDS format + """ + # Retrieve the done file contents + output_object_store = maybe_create_object_store_from_uri(output_root) + if output_object_store is not None: + # Download and read the done file from the remote object store + _, _, output_folder_prefix = parse_uri(output_root) + try: + with tempfile.TemporaryDirectory() as tmp_dir: + done_file = os.path.join(tmp_dir, DONE_FILENAME) + download_file( + object_store=output_object_store, + object_name=os.path.join( + output_folder_prefix, + DONE_FILENAME, + ), + output_filename=done_file, + ) + with open(done_file) as df: + done_file_contents = df.read().splitlines() + except FileNotFoundError: + return False + else: + # Read the local done file + done_file = os.path.join(output_root, DONE_FILENAME) + if not os.path.isfile(done_file): + return False + with open(done_file) as df: + done_file_contents = df.read().splitlines() + # Compare the arguments + prev_args_str = done_file_contents[0] + if prev_args_str != args_str: + return False + + # Compare file names + prev_names = done_file_contents[1:] + if len(prev_names) != len(object_names): + return False + for idx, prev_name in enumerate(prev_names): + if object_names[idx] != prev_name: + return False + return True + + +def write_done_file(folder: str, args_str: str, object_names: List[str]): + """Write a file to signify completion. + + This the done file includes the arguments to processing and + a list of objects that were processed. 
+ + Args: + folder (str): Folder to write the done file to + args_str (str): String representation of arguments + object_names (List[str]): List of objects to convert to MDS format + """ + with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + done_file.write('\n'.join([args_str] + object_names) + '\n') + + +def convert_text_to_mds( + tokenizer_name: str, + output_folder: str, + input_folder: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + processes: int, + args_str: str, + reprocess: bool, + trust_remote_code: bool, +): + """Convert a folder of text files to MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + processes (int): The number of processes to use. + args_str (str): String representation of the arguments + reprocess (bool): Whether to always reprocess the given folder of text files + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + """ + is_remote_output = is_remote_path(output_folder) + + object_names = get_object_names(input_folder) + if len(object_names) == 0: + raise InputFolderMissingDataError(input_folder) + + # Check if the text files in the bucket have already been processed. + if not reprocess and is_already_processed( + output_folder, + args_str, + object_names, + ): + log.info( + f'Input folder {input_folder} is already processed at {output_folder} and ' + + + 'reprocess is set to False. 
Set reprocess to True if you would like to force reprocessing.', + ) + return + + # Use a temporary local directory if the output is remote and there are more than 1 processes + local_output_folder = tempfile.TemporaryDirectory( + ).name if is_remote_output else output_folder + + if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: + raise OutputFolderNotEmptyError(output_folder) + + if processes > 1: + # Download and convert the text files in parallel + args = get_task_args( + object_names, + local_output_folder, + input_folder, + processes, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_and_convert_starargs, args)) + + # Merge the mds shards from each of the processes into a single folder + merge_shard_groups(local_output_folder) + else: + download_and_convert( + object_names, + local_output_folder, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + + # Write a done file with the args and object names + write_done_file(local_output_folder, args_str, object_names) + + if is_remote_output: + # Upload the local output to the remote location + output_object_store = cast( + ObjectStore, + maybe_create_object_store_from_uri(output_folder), + ) + _, _, output_folder_prefix = parse_uri(output_folder) + files_to_upload = os.listdir(local_output_folder) + + for file in files_to_upload: + assert not os.path.isdir(file) + remote_path = os.path.join(output_folder_prefix, file) + output_object_store.upload_object( + remote_path, + os.path.join(local_output_folder, file), + ) + + +def _args_str(original_args: Namespace) -> str: + """Create a string from the args to determine whether to reprocess. + + Args: + original_args (Namespace): Arguments to main function. + """ + # Take the arguments that influence the final result. 
+ # reprocess and max_mds_writer_workers are not taken. + args = Namespace( + tokenizer_name=original_args.tokenizer, + output_folder=original_args.output_folder, + input_folder=original_args.input_folder, + concat_tokens=original_args.concat_tokens, + eos_text=original_args.eos_text, + bos_text=original_args.bos_text, + no_wrap=original_args.no_wrap, + compression=original_args.compression, + processes=original_args.processes, + ) + + return str(args) + + +def _configure_logging(logging_level: str): + """Configure logging. + + Args: + logging_level (str): Logging level. + """ + logging.basicConfig( + format= + f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', + ) + logging_level = logging_level.upper() + logging.getLogger('llmfoundry').setLevel(logging_level) + logging.getLogger(__name__).setLevel(logging_level) + log.info(f'Logging level set to {logging_level}') + + +def convert_text_to_mds_from_args(args: Namespace) -> None: + if args.use_tokenizer_eos: + # Ensure that eos text is not specified twice. + if args.eos_text is not None: + args.error( + 'Cannot set --eos_text with --use_tokenizer_eos. 
Please specify one.', + ) + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, + trust_remote_code=args.trust_remote_code, + ) + args.eos_text = tokenizer.eos_token + + # now that we have validated them, change BOS/EOS to strings + if args.bos_text is None: + args.bos_text = '' + if args.eos_text is None: + args.eos_text = '' + _configure_logging(args.logging_level) + convert_text_to_mds( + tokenizer_name=args.tokenizer, + output_folder=args.output_folder, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + trust_remote_code=args.trust_remote_code, + args_str=_args_str(args), + ) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 92c36eb35d..f0ab28a90f 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -2,107 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import math -import os -import tempfile from argparse import ArgumentParser, Namespace -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from glob import glob -from typing import Dict, Iterable, List, Tuple, cast -import numpy as np import psutil -from composer.utils import ( - ObjectStore, - maybe_create_object_store_from_uri, - parse_uri, -) -from numpy.typing import NDArray -from streaming import MDSWriter -from tqdm import tqdm -from transformers import AutoTokenizer, PreTrainedTokenizerBase -from llmfoundry.data.data import AbstractConcatTokensDataset -from llmfoundry.utils.data_prep_utils import ( - DownloadingIterable, - download_file, - merge_shard_groups, -) -from llmfoundry.utils.exceptions import ( - InputFolderMissingDataError, - OutputFolderNotEmptyError, -) +from llmfoundry.data_prep import convert_text_to_mds_from_args log = logging.getLogger(__name__) 
DONE_FILENAME = '.text_to_mds_conversion_done' -class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): - """An IterableDataset that returns token samples for MDSWriter from files. - - Returns dicts of {'tokens': ndarray:int32} - - Each file is considered a sequence. - """ - - def __init__( - self, - files: Iterable[str], - tokenizer: PreTrainedTokenizerBase, - max_length: int, - bos_text: str, - eos_text: str, - no_wrap: bool, - ): - self.files = files - super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - - def __iter__(self) -> Iterable[Dict[str, NDArray]]: - - buffer = [] - for file in self.files: - with open(file, 'r') as f: - buffer += self.bos_tokens - first_chunk = True - # Read the file in 1MB chunks to avoid memory issues - for chunk in iter(partial(f.read, 1000000), ''): - # Tokenize the chunk - encoded = self.tokenizer( - chunk, - truncation=False, - padding=False, - ) - iids = encoded['input_ids'] - - # If this is not the first chunk, remove the BOS token - if not first_chunk: - if iids[0] == self.tokenizer.bos_token_id: - iids = iids[1:] - - # Add the tokens to the buffer - buffer += iids - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self. - max_length:] if self.should_wrap else [] - yield { - 'tokens': np.asarray(concat_sample, dtype=np.int32), - } - - first_chunk = False - - # Add the EOS token to the buffer to separate files. - buffer += self.eos_tokens - - # Yield any remaining samples of size max_length. - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self.max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} - - def parse_args() -> Namespace: """Parse commandline arguments.""" parser = ArgumentParser( @@ -203,418 +113,9 @@ def parse_args() -> Namespace: help='Logging level for the script. Default is INFO.', ) parsed = parser.parse_args() - - # Set eos token. 
- if parsed.use_tokenizer_eos: - # Ensure that eos text is not specified twice. - if parsed.eos_text is not None: - parser.error( - 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.', - ) - tokenizer = AutoTokenizer.from_pretrained( - parsed.tokenizer, - trust_remote_code=parsed.trust_remote_code, - ) - parsed.eos_text = tokenizer.eos_token - - # now that we have validated them, change BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' return parsed -def get_object_names(input_folder: str) -> List[str]: - """Get object names from a local or remote folder. - - Args: - input_folder (str): local or remote folder path. - """ - object_store = maybe_create_object_store_from_uri(input_folder) - if object_store is not None: - _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.txt')) - ] - # return names, sizes - log.info(f'Found {len(names)} text files at {input_folder}') - - return names - - -def get_task_args( - object_names: List[str], - output_root: str, - input_folder: str, - n_groups: int, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - trust_remote_code: bool, -) -> Iterable: - """Get download_and_convert arguments split across n_groups. - - Each group handles a portion of object_names. 
- - Args: - object_names (List[str]): Names of objects to process - output_root (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - n_groups (int): Number of groups to split the object names into - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - num_objects = len(object_names) - objs_per_group = math.ceil(num_objects / n_groups) - for group, i in enumerate(range(0, num_objects, objs_per_group)): - output_subdir = os.path.join(output_root, str(group)) - yield ( - object_names[i:min(i + objs_per_group, num_objects)], - output_subdir, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - - -def download_and_convert_starargs(args: Tuple): - """Helper function to call download_and_convert with star args. - - This helps us use download_and_convert with multiprocessing. - """ - return download_and_convert(*args) - - -def download_and_convert( - file_names: List[str], - output_folder: str, - input_folder: str, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - trust_remote_code: bool, -): - """Downloads and converts text files to MDS format. 
- - Args: - file_names (List[str]): Files to process - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - object_store = maybe_create_object_store_from_uri(input_folder) - - # Download file_names - with tempfile.TemporaryDirectory() as tmp_dir: - downloading_iter = DownloadingIterable( - object_names=file_names, - output_folder=tmp_dir, - object_store=object_store, - ) - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - trust_remote_code=trust_remote_code, - ) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - - # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up - # to the maximum sequence length - dataset = ConcatTokensFromFilesDataset( - files=downloading_iter, - max_length=concat_tokens, - tokenizer=tokenizer, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - ) - - columns = {'tokens': 'ndarray:int32'} - - log.info('Converting to MDS format...') - with MDSWriter( - out=output_folder, - columns=columns, - compression=compression, - ) as out: - for sample in tqdm(dataset): - out.write(sample) - - -def is_remote_path(path: str) -> bool: - """Checks whether a path is a remote path. 
- - Args: - path (str): path to check - """ - backend, _, _ = parse_uri(path) - return backend != '' - - -def is_already_processed( - output_root: str, - args_str: str, - object_names: List[str], -) -> bool: - """Determines whether a group of text files has already been processed. - - Checks the done fie at output root to determine this. - - Args: - output_root (str): Output folder where a done file may exist - args_str (str): String representation of the arguments - object_names (List[str]): Names of objects to convert to MDS format - """ - # Retrieve the done file contents - output_object_store = maybe_create_object_store_from_uri(output_root) - if output_object_store is not None: - # Download and read the done file from the remote object store - _, _, output_folder_prefix = parse_uri(output_root) - try: - with tempfile.TemporaryDirectory() as tmp_dir: - done_file = os.path.join(tmp_dir, DONE_FILENAME) - download_file( - object_store=output_object_store, - object_name=os.path.join( - output_folder_prefix, - DONE_FILENAME, - ), - output_filename=done_file, - ) - with open(done_file) as df: - done_file_contents = df.read().splitlines() - except FileNotFoundError: - return False - else: - # Read the local done file - done_file = os.path.join(output_root, DONE_FILENAME) - if not os.path.isfile(done_file): - return False - with open(done_file) as df: - done_file_contents = df.read().splitlines() - # Compare the arguments - prev_args_str = done_file_contents[0] - if prev_args_str != args_str: - return False - - # Compare file names - prev_names = done_file_contents[1:] - if len(prev_names) != len(object_names): - return False - for idx, prev_name in enumerate(prev_names): - if object_names[idx] != prev_name: - return False - return True - - -def write_done_file(folder: str, args_str: str, object_names: List[str]): - """Write a file to signify completion. - - This the done file includes the arguments to processing and - a list of objects that were processed. 
- - Args: - folder (str): Folder to write the done file to - args_str (str): String representation of arguments - object_names (List[str]): List of objects to convert to MDS format - """ - with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: - done_file.write('\n'.join([args_str] + object_names) + '\n') - - -def convert_text_to_mds( - tokenizer_name: str, - output_folder: str, - input_folder: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - processes: int, - args_str: str, - reprocess: bool, - trust_remote_code: bool, -): - """Convert a folder of text files to MDS format. - - Args: - tokenizer_name (str): Name of tokenizer to use - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - processes (int): The number of processes to use. - args_str (str): String representation of the arguments - reprocess (bool): Whether to always reprocess the given folder of text files - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - is_remote_output = is_remote_path(output_folder) - - object_names = get_object_names(input_folder) - if len(object_names) == 0: - raise InputFolderMissingDataError(input_folder) - - # Check if the text files in the bucket have already been processed. - if not reprocess and is_already_processed( - output_folder, - args_str, - object_names, - ): - log.info( - f'Input folder {input_folder} is already processed at {output_folder} and ' - + - 'reprocess is set to False. 
Set reprocess to True if you would like to force reprocessing.', - ) - return - - # Use a temporary local directory if the output is remote and there are more than 1 processes - local_output_folder = tempfile.TemporaryDirectory( - ).name if is_remote_output else output_folder - - if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: - raise OutputFolderNotEmptyError(output_folder) - - if processes > 1: - # Download and convert the text files in parallel - args = get_task_args( - object_names, - local_output_folder, - input_folder, - processes, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_and_convert_starargs, args)) - - # Merge the mds shards from each of the processes into a single folder - merge_shard_groups(local_output_folder) - else: - download_and_convert( - object_names, - local_output_folder, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - - # Write a done file with the args and object names - write_done_file(local_output_folder, args_str, object_names) - - if is_remote_output: - # Upload the local output to the remote location - output_object_store = cast( - ObjectStore, - maybe_create_object_store_from_uri(output_folder), - ) - _, _, output_folder_prefix = parse_uri(output_folder) - files_to_upload = os.listdir(local_output_folder) - - for file in files_to_upload: - assert not os.path.isdir(file) - remote_path = os.path.join(output_folder_prefix, file) - output_object_store.upload_object( - remote_path, - os.path.join(local_output_folder, file), - ) - - -def _args_str(original_args: Namespace) -> str: - """Create a string from the args to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. - """ - # Take the arguments that influence the final result. 
- # reprocess and max_mds_writer_workers are not taken. - args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) - - -def _configure_logging(logging_level: str): - """Configure logging. - - Args: - logging_level (str): Logging level. - """ - logging.basicConfig( - format= - f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', - ) - logging_level = logging_level.upper() - logging.getLogger('llmfoundry').setLevel(logging_level) - logging.getLogger(__name__).setLevel(logging_level) - log.info(f'Logging level set to {logging_level}') - - if __name__ == '__main__': args = parse_args() - _configure_logging(args.logging_level) - convert_text_to_mds( - tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - trust_remote_code=args.trust_remote_code, - args_str=_args_str(args), - ) + convert_text_to_mds_from_args(args) diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index 8dac151f55..7cdaed51a8 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -13,11 +13,7 @@ from streaming import StreamingDataset from transformers import AutoTokenizer -from llmfoundry.utils.exceptions import ( - InputFolderMissingDataError, - OutputFolderNotEmptyError, -) -from scripts.data_prep.convert_text_to_mds import ( +from 
llmfoundry.data_prep import ( DONE_FILENAME, convert_text_to_mds, download_and_convert, @@ -25,6 +21,10 @@ merge_shard_groups, write_done_file, ) +from llmfoundry.utils.exceptions import ( + InputFolderMissingDataError, + OutputFolderNotEmptyError, +) class MockObjectStore(): @@ -83,15 +83,15 @@ def _assert_files_exist(prefix: str, files: List[str]): @pytest.mark.parametrize('processes', [1, 2, 3]) @patch.object(ProcessPoolExecutor, 'map', new=Mock(wraps=_mock_map)) @patch( - 'scripts.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri', + 'llmfoundry.data_prep.maybe_create_object_store_from_uri', ) -@patch('scripts.data_prep.convert_text_to_mds.parse_uri') +@patch('llmfoundry.data_prep.parse_uri') @patch( - 'scripts.data_prep.convert_text_to_mds.download_and_convert', + 'llmfoundry.data_prep.download_and_convert', wraps=download_and_convert, ) @patch( - 'scripts.data_prep.convert_text_to_mds.merge_shard_groups', + 'llmfoundry.data_prep..merge_shard_groups', wraps=merge_shard_groups, ) def test_single_and_multi_process( From f7b0084eed3641cab8b9026167ee38d0a1ae11dd Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Thu, 11 Jul 2024 18:50:18 -0700 Subject: [PATCH 02/18] cli --- llmfoundry/cli/cli.py | 69 ++++++++++++++++++++++++++++++++ llmfoundry/data_prep/__init__.py | 15 ++++--- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 8e86e76467..46c167b20d 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -1,11 +1,13 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from argparse import Namespace from typing import Optional import typer from llmfoundry.cli import registry_cli +from llmfoundry.data_prep import convert_text_to_mds_from_args from llmfoundry.train import train_from_yaml app = typer.Typer(pretty_exceptions_show_locals=False) @@ -25,5 +27,72 @@ def train( train_from_yaml(yaml_path, args_list) 
+@app.command(name='convert_text_to_mds') +def convert_text_to_mds_cli( + output_folder: str = typer. + Option(..., help='The folder to write output to'), # type: ignore + input_folder: str = typer.Option( + ..., help='The folder with text files to convert to MDS' + ), # type: ignore + compression: str = typer.Option( + 'zstd', help='The compression algorithm to use for MDS writing' + ), # type: ignore + concat_tokens: int = typer.Option( + ..., + help='Convert text to tokens and concatenate up to this many tokens' + ), # type: ignore + tokenizer: str = typer.Option(..., help='The name of the tokenizer to use' + ), # type: ignore + bos_text: Optional[str] = typer.Option( + None, + help= + 'The text to prepend to each example to separate concatenated examples' + ), # type: ignore + eos_text: Optional[str] = typer.Option( + None, + help= + 'The text to append to each example to separate concatenated examples' + ), # type: ignore + use_tokenizer_eos: bool = typer. + Option(False, help='Use the EOS text from the tokenizer.'), # type: ignore + no_wrap: bool = typer.Option( + False, + help='Whether to let text examples wrap across multiple training examples' + ), # type: ignore + processes: int = typer.Option( + min(max(psutil.cpu_count() - 2, 1), 32), + help='The number of processes to use to download and convert the dataset' + ), # type: ignore + reprocess: bool = typer.Option( + False, + help= + 'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.' + ), # type: ignore + trust_remote_code: bool = typer.Option( + False, + help='If true, allows custom code to be executed to load the tokenizer' + ), # type: ignore + logging_level: str = typer.Option( + 'INFO', help='Logging level for the script. Default is INFO.' 
+ ), # type: ignore +): + args = Namespace( + output_folder=output_folder, + input_folder=input_folder, + compression=compression, + concat_tokens=concat_tokens, + tokenizer=tokenizer, + bos_text=bos_text, + eos_text=eos_text, + use_tokenizer_eos=use_tokenizer_eos, + no_wrap=no_wrap, + processes=processes, + reprocess=reprocess, + trust_remote_code=trust_remote_code, + logging_level=logging_level, + ) + convert_text_to_mds_from_args(args) + + if __name__ == '__main__': app() diff --git a/llmfoundry/data_prep/__init__.py b/llmfoundry/data_prep/__init__.py index c959c0ddcd..55e17f9eaf 100644 --- a/llmfoundry/data_prep/__init__.py +++ b/llmfoundry/data_prep/__init__.py @@ -1,13 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + from llmfoundry.data_prep.convert_text_to_mds import ( + DONE_FILENAME, convert_text_to_mds, convert_text_to_mds_from_args, - maybe_create_object_store_from_uri, - parse_uri, download_and_convert, - merge_shard_groups, is_already_processed, + maybe_create_object_store_from_uri, + merge_shard_groups, + parse_uri, write_done_file, - DONE_FILENAME, ) __all__ = [ @@ -19,5 +22,5 @@ 'merge_shard_groups', 'is_already_processed', 'write_done_file', - 'DONE_FILENAME' -] \ No newline at end of file + 'DONE_FILENAME', +] From 20d7eb5b83b710f5b02a0e1f8f8346c4206559a6 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Thu, 11 Jul 2024 18:50:49 -0700 Subject: [PATCH 03/18] ignore --- llmfoundry/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 46c167b20d..c2cb504fd8 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -60,7 +60,7 @@ def convert_text_to_mds_cli( help='Whether to let text examples wrap across multiple training examples' ), # type: ignore processes: int = typer.Option( - min(max(psutil.cpu_count() - 2, 1), 32), + min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore help='The number of processes to use to 
download and convert the dataset' ), # type: ignore reprocess: bool = typer.Option( From 53cd776f5fdc0a210f24c83f3baea796ba18bf9e Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Thu, 11 Jul 2024 18:58:43 -0700 Subject: [PATCH 04/18] import --- llmfoundry/cli/cli.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index c2cb504fd8..c4897d935d 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -4,6 +4,7 @@ from argparse import Namespace from typing import Optional +import psutil import typer from llmfoundry.cli import registry_cli @@ -32,48 +33,48 @@ def convert_text_to_mds_cli( output_folder: str = typer. Option(..., help='The folder to write output to'), # type: ignore input_folder: str = typer.Option( - ..., help='The folder with text files to convert to MDS' + ..., help='The folder with text files to convert to MDS', ), # type: ignore compression: str = typer.Option( - 'zstd', help='The compression algorithm to use for MDS writing' + 'zstd', help='The compression algorithm to use for MDS writing', ), # type: ignore concat_tokens: int = typer.Option( ..., - help='Convert text to tokens and concatenate up to this many tokens' + help='Convert text to tokens and concatenate up to this many tokens', ), # type: ignore - tokenizer: str = typer.Option(..., help='The name of the tokenizer to use' + tokenizer: str = typer.Option(..., help='The name of the tokenizer to use', ), # type: ignore bos_text: Optional[str] = typer.Option( None, help= - 'The text to prepend to each example to separate concatenated examples' + 'The text to prepend to each example to separate concatenated examples', ), # type: ignore eos_text: Optional[str] = typer.Option( None, help= - 'The text to append to each example to separate concatenated examples' + 'The text to append to each example to separate concatenated examples', ), # type: ignore use_tokenizer_eos: bool = typer. 
Option(False, help='Use the EOS text from the tokenizer.'), # type: ignore no_wrap: bool = typer.Option( False, - help='Whether to let text examples wrap across multiple training examples' + help='Whether to let text examples wrap across multiple training examples', ), # type: ignore processes: int = typer.Option( min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore - help='The number of processes to use to download and convert the dataset' + help='The number of processes to use to download and convert the dataset', ), # type: ignore reprocess: bool = typer.Option( False, help= - 'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.' + 'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.', ), # type: ignore trust_remote_code: bool = typer.Option( False, - help='If true, allows custom code to be executed to load the tokenizer' + help='If true, allows custom code to be executed to load the tokenizer', ), # type: ignore logging_level: str = typer.Option( - 'INFO', help='Logging level for the script. Default is INFO.' + 'INFO', help='Logging level for the script. Default is INFO.', ), # type: ignore ): args = Namespace( From 3710a94ab087e2d2fb89d7d5f538f928d67c148f Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Thu, 11 Jul 2024 19:06:44 -0700 Subject: [PATCH 05/18] naming --- llmfoundry/cli/cli.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index c4897d935d..c145737cf8 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -31,50 +31,57 @@ def train( @app.command(name='convert_text_to_mds') def convert_text_to_mds_cli( output_folder: str = typer. 
- Option(..., help='The folder to write output to'), # type: ignore + Option(..., '--output_folder', help='The folder to write output to'), # type: ignore input_folder: str = typer.Option( - ..., help='The folder with text files to convert to MDS', + ..., '--input_folder', help='The folder with text files to convert to MDS', ), # type: ignore compression: str = typer.Option( - 'zstd', help='The compression algorithm to use for MDS writing', + 'zstd', '--compression', help='The compression algorithm to use for MDS writing', ), # type: ignore concat_tokens: int = typer.Option( ..., + '--concat_tokens', help='Convert text to tokens and concatenate up to this many tokens', ), # type: ignore - tokenizer: str = typer.Option(..., help='The name of the tokenizer to use', + tokenizer: str = typer.Option(..., '--tokenizer', help='The name of the tokenizer to use', ), # type: ignore bos_text: Optional[str] = typer.Option( None, + '--bos_text', help= 'The text to prepend to each example to separate concatenated examples', ), # type: ignore eos_text: Optional[str] = typer.Option( None, + '--eos_text', help= 'The text to append to each example to separate concatenated examples', ), # type: ignore use_tokenizer_eos: bool = typer. - Option(False, help='Use the EOS text from the tokenizer.'), # type: ignore + Option(False, '--use_tokenizer_eos', help='Use the EOS text from the tokenizer.'), # type: ignore no_wrap: bool = typer.Option( False, + '--no_wrap', help='Whether to let text examples wrap across multiple training examples', ), # type: ignore processes: int = typer.Option( min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore + '--processes', help='The number of processes to use to download and convert the dataset', ), # type: ignore reprocess: bool = typer.Option( False, + '--reprocess', help= 'If true, reprocess the input_folder to MDS format. 
Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.', ), # type: ignore trust_remote_code: bool = typer.Option( False, + '--trust_remote_code', help='If true, allows custom code to be executed to load the tokenizer', ), # type: ignore logging_level: str = typer.Option( - 'INFO', help='Logging level for the script. Default is INFO.', + 'INFO', '--logging_level', help='Logging level for the script. Default is INFO.', ), # type: ignore ): args = Namespace( From 5c5c7e639e873c983b971820e73335f9cc12a42b Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 12 Jul 2024 10:07:05 -0700 Subject: [PATCH 06/18] typo --- tests/a_scripts/data_prep/test_convert_text_to_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index 7cdaed51a8..4817144ee6 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -91,7 +91,7 @@ def _assert_files_exist(prefix: str, files: List[str]): wraps=download_and_convert, ) @patch( - 'llmfoundry.data_prep..merge_shard_groups', + 'llmfoundry.data_prep.merge_shard_groups', wraps=merge_shard_groups, ) def test_single_and_multi_process( From 9dfe42c4959d22e09b8277ec95f43ca243a971e5 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 12 Jul 2024 12:10:54 -0700 Subject: [PATCH 07/18] test --- tests/a_scripts/data_prep/test_convert_text_to_mds.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index 4817144ee6..c94477042d 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -13,7 +13,7 @@ from streaming import StreamingDataset from transformers import AutoTokenizer -from llmfoundry.data_prep import ( 
+from llmfoundry.data_prep.convert_text_to_mds import ( DONE_FILENAME, convert_text_to_mds, download_and_convert, @@ -83,15 +83,15 @@ def _assert_files_exist(prefix: str, files: List[str]): @pytest.mark.parametrize('processes', [1, 2, 3]) @patch.object(ProcessPoolExecutor, 'map', new=Mock(wraps=_mock_map)) @patch( - 'llmfoundry.data_prep.maybe_create_object_store_from_uri', + 'llmfoundry.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri', ) -@patch('llmfoundry.data_prep.parse_uri') +@patch('llmfoundry.data_prep.convert_text_to_mds.parse_uri') @patch( - 'llmfoundry.data_prep.download_and_convert', + 'llmfoundry.data_prep.convert_text_to_mds.download_and_convert', wraps=download_and_convert, ) @patch( - 'llmfoundry.data_prep.merge_shard_groups', + 'llmfoundry.data_prep.convert_text_to_mds.merge_shard_groups', wraps=merge_shard_groups, ) def test_single_and_multi_process( From e4e2b80c73e03aae657db7cfc2ea788ccc06e10e Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 12 Jul 2024 23:34:46 -0700 Subject: [PATCH 08/18] commit comments 1 --- llmfoundry/cli/cli.py | 92 +++++++-------------- llmfoundry/data_prep/convert_text_to_mds.py | 77 +++++++++++------ 2 files changed, 84 insertions(+), 85 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index c145737cf8..56a3df2204 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -1,28 +1,29 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from argparse import Namespace -from typing import Optional +from typing import Annotated, Optional import psutil -import typer +from typer import Argument, Option, Typer from llmfoundry.cli import registry_cli from llmfoundry.data_prep import convert_text_to_mds_from_args from llmfoundry.train import train_from_yaml -app = typer.Typer(pretty_exceptions_show_locals=False) +app = Typer(pretty_exceptions_show_locals=False) app.add_typer(registry_cli.app, name='registry') @app.command(name='train') def 
train( - yaml_path: str = typer.Argument( - ..., - help='Path to the YAML configuration file', - ), # type: ignore - args_list: Optional[list[str]] = typer. - Argument(None, help='Additional command line arguments'), # type: ignore + yaml_path: Annotated[str, + Argument( + ..., + help='Path to the YAML configuration file', + )], + args_list: Annotated[ + Optional[list[str]], + Argument(help='Additional command line arguments')] = None, ): """Run the training with optional overrides from CLI.""" train_from_yaml(yaml_path, args_list) @@ -30,61 +31,31 @@ def train( @app.command(name='convert_text_to_mds') def convert_text_to_mds_cli( - output_folder: str = typer. - Option(..., '--output_folder', help='The folder to write output to'), # type: ignore - input_folder: str = typer.Option( - ..., '--input_folder', help='The folder with text files to convert to MDS', - ), # type: ignore - compression: str = typer.Option( - 'zstd', '--compression', help='The compression algorithm to use for MDS writing', - ), # type: ignore - concat_tokens: int = typer.Option( - ..., - '--concat_tokens', - help='Convert text to tokens and concatenate up to this many tokens', - ), # type: ignore - tokenizer: str = typer.Option(..., '--tokenizer', help='The name of the tokenizer to use', - ), # type: ignore - bos_text: Optional[str] = typer.Option( - None, - '--bos_text', - help= - 'The text to prepend to each example to separate concatenated examples', - ), # type: ignore - eos_text: Optional[str] = typer.Option( - None, - '--eos_text', - help= - 'The text to append to each example to separate concatenated examples', - ), # type: ignore - use_tokenizer_eos: bool = typer. 
- Option(False, '--use_tokenizer_eos', help='Use the EOS text from the tokenizer.'), # type: ignore - no_wrap: bool = typer.Option( - False, - '--no_wrap', - help='Whether to let text examples wrap across multiple training examples', - ), # type: ignore - processes: int = typer.Option( - min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore - '--processes', + output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')], + input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')], + concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')], + tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')], + bos_text: Annotated[str, Option(help='The text to prepend to each example to separate concatenated examples')] = None, + eos_text: Annotated[str, Option(help='The text to append to each example to separate concatenated examples')] = None, + compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd', + use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False, + no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False, + processes: Annotated[int, Option( help='The number of processes to use to download and convert the dataset', - ), # type: ignore - reprocess: bool = typer.Option( - False, - '--reprocess', + )] = min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore + reprocess: Annotated[bool, Option( help= 'If true, reprocess the input_folder to MDS format. 
Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.', - ), # type: ignore - trust_remote_code: bool = typer.Option( - False, - '--trust_remote_code', + )] = False, + trust_remote_code: Annotated[bool, Option( help='If true, allows custom code to be executed to load the tokenizer', - ), # type: ignore - logging_level: str = typer.Option( - 'INFO', '--logging_level', help='Logging level for the script. Default is INFO.', - ), # type: ignore + )] = False, + logging_level: Annotated[str, Option( + help='Logging level for the script. Default is INFO.', + )] = 'INFO', + ): - args = Namespace( + convert_text_to_mds_from_args( output_folder=output_folder, input_folder=input_folder, compression=compression, @@ -99,7 +70,6 @@ def convert_text_to_mds_cli( trust_remote_code=trust_remote_code, logging_level=logging_level, ) - convert_text_to_mds_from_args(args) if __name__ == '__main__': diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index ce6a8f7f70..5a3a16443a 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -9,7 +9,7 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from glob import glob -from typing import Dict, Iterable, List, Tuple, cast +from typing import Dict, Iterable, List, Optional, Tuple, cast import numpy as np import psutil @@ -582,36 +582,65 @@ def _configure_logging(logging_level: str): log.info(f'Logging level set to {logging_level}') -def convert_text_to_mds_from_args(args: Namespace) -> None: - if args.use_tokenizer_eos: +def convert_text_to_mds_from_args( + output_folder: str, + input_folder: str, + compression: Optional[str], + concat_tokens: int, + tokenizer: str, + bos_text: Optional[str], + eos_text: Optional[str], + use_tokenizer_eos: bool, + no_wrap: bool, + processes: int, + reprocess: bool, + trust_remote_code: bool, + logging_level: str, +) -> None: + if 
use_tokenizer_eos: # Ensure that eos text is not specified twice. - if args.eos_text is not None: - args.error( + if eos_text is not None: + ValueError( 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.', ) tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, - trust_remote_code=args.trust_remote_code, + tokenizer, + trust_remote_code=trust_remote_code, ) - args.eos_text = tokenizer.eos_token + eos_text = tokenizer.eos_token # now that we have validated them, change BOS/EOS to strings - if args.bos_text is None: - args.bos_text = '' - if args.eos_text is None: - args.eos_text = '' - _configure_logging(args.logging_level) + if bos_text is None: + bos_text = '' + if eos_text is None: + eos_text = '' + _configure_logging(logging_level) + + # Define args for _args_str + args = Namespace( + tokenizer=tokenizer, + output_folder=output_folder, + input_folder=input_folder, + compression=compression, + concat_tokens=concat_tokens, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + processes=processes, + reprocess=reprocess, + trust_remote_code=trust_remote_code, + ) convert_text_to_mds( - tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - trust_remote_code=args.trust_remote_code, + tokenizer_name=tokenizer, + output_folder=output_folder, + input_folder=input_folder, + concat_tokens=concat_tokens, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + compression=compression, + processes=processes, + reprocess=reprocess, + trust_remote_code=trust_remote_code, args_str=_args_str(args), ) From 1873c2fd9d4fc58091daac2c79667c0aad77411b Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 13 Jul 2024 00:24:46 -0700 Subject: [PATCH 09/18] precommit --- llmfoundry/cli/cli.py | 6 +++--- 
llmfoundry/data_prep/convert_text_to_mds.py | 6 +++--- scripts/data_prep/convert_text_to_mds.py | 16 +++++++++++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 56a3df2204..0837d51028 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -30,13 +30,13 @@ def train( @app.command(name='convert_text_to_mds') -def convert_text_to_mds_cli( +def convert_text_to_mds( output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')], input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')], concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')], tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')], - bos_text: Annotated[str, Option(help='The text to prepend to each example to separate concatenated examples')] = None, - eos_text: Annotated[str, Option(help='The text to append to each example to separate concatenated examples')] = None, + bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None, + eos_text: Annotated[Optional[str], Option(help='The text to append to each example to separate concatenated examples')] = None, compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd', use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False, no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False, diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index 5a3a16443a..35bf9810bc 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -585,7 +585,7 @@ def _configure_logging(logging_level: str): def 
convert_text_to_mds_from_args( output_folder: str, input_folder: str, - compression: Optional[str], + compression: str, concat_tokens: int, tokenizer: str, bos_text: Optional[str], @@ -603,11 +603,11 @@ def convert_text_to_mds_from_args( ValueError( 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.', ) - tokenizer = AutoTokenizer.from_pretrained( + built_tokenizer = AutoTokenizer.from_pretrained( tokenizer, trust_remote_code=trust_remote_code, ) - eos_text = tokenizer.eos_token + eos_text = built_tokenizer.eos_token # now that we have validated them, change BOS/EOS to strings if bos_text is None: diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index f0ab28a90f..5d1a2b3da8 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -118,4 +118,18 @@ def parse_args() -> Namespace: if __name__ == '__main__': args = parse_args() - convert_text_to_mds_from_args(args) + convert_text_to_mds_from_args( + output_folder=args.output_folder, + input_folder=args.input_folder, + compression=args.compression, + concat_tokens=args.concat_tokens, + tokenizer=args.tokenizer, + bos_text=args.bos_text, + eos_text=args.eos_text, + use_tokenizer_eos=args.use_tokenizer_eos, + no_wrap=args.no_wrap, + processes=args.processes, + reprocess=args.reprocess, + trust_remote_code=args.trust_remote_code, + logging_level=args.logging, + ) From d063da7c65eef1dde060d199c61c702c777ee967 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 13 Jul 2024 03:37:23 -0700 Subject: [PATCH 10/18] typo --- scripts/data_prep/convert_text_to_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 5d1a2b3da8..52152207df 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -131,5 +131,5 @@ def parse_args() -> Namespace: processes=args.processes, 
reprocess=args.reprocess, trust_remote_code=args.trust_remote_code, - logging_level=args.logging, + logging_level=args.logging_level, ) From 3cfeeabd73ad93fb1b99d26a90911090f81d7e15 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 13 Jul 2024 03:41:46 -0700 Subject: [PATCH 11/18] typo --- llmfoundry/cli/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 0837d51028..4dc1f05c7d 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -31,8 +31,8 @@ def train( @app.command(name='convert_text_to_mds') def convert_text_to_mds( - output_folder: Annotated[str, Option(..., '--output_folder', help='The folder to write output to')], - input_folder: Annotated[str, Option(..., '--input_folder', help='The folder with text files to convert to MDS')], + output_folder: Annotated[str, Option(..., help='The folder to write output to')], + input_folder: Annotated[str, Option(..., help='The folder with text files to convert to MDS')], concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')], tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')], bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None, From f4022bcd234485c5f550853f4ceb8cec3a56d257 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 13 Jul 2024 17:38:48 -0700 Subject: [PATCH 12/18] arg_str --- llmfoundry/data_prep/convert_text_to_mds.py | 51 ++++++--------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index 35bf9810bc..3814787f25 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -543,29 +543,6 @@ def convert_text_to_mds( ) -def _args_str(original_args: Namespace) -> str: - """Create a string from the args 
to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. - """ - # Take the arguments that influence the final result. - # reprocess and max_mds_writer_workers are not taken. - args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) - - def _configure_logging(logging_level: str): """Configure logging. @@ -617,19 +594,19 @@ def convert_text_to_mds_from_args( _configure_logging(logging_level) # Define args for _args_str - args = Namespace( - tokenizer=tokenizer, - output_folder=output_folder, - input_folder=input_folder, - compression=compression, - concat_tokens=concat_tokens, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - processes=processes, - reprocess=reprocess, - trust_remote_code=trust_remote_code, - ) + args = { + 'tokenizer': tokenizer, + 'output_folder': output_folder, + 'input_folder': input_folder, + 'compression': compression, + 'concat_tokens': concat_tokens, + 'eos_text': eos_text, + 'bos_text': bos_text, + 'no_wrap': no_wrap, + 'processes': processes, + 'reprocess': reprocess, + 'trust_remote_code': trust_remote_code, + } convert_text_to_mds( tokenizer_name=tokenizer, output_folder=output_folder, @@ -642,5 +619,5 @@ def convert_text_to_mds_from_args( processes=processes, reprocess=reprocess, trust_remote_code=trust_remote_code, - args_str=_args_str(args), + args_str=str(args), ) From 9b1cc6f5635c0714f6fe98e5e5352f7610f58a16 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Mon, 15 Jul 2024 11:59:12 -0700 Subject: [PATCH 13/18] annotation + help --- llmfoundry/cli/cli.py | 1 + llmfoundry/data_prep/convert_text_to_mds.py | 20 ++++++++++++++++++++ 2 files 
changed, 21 insertions(+) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 4dc1f05c7d..5f5e37c5a9 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -55,6 +55,7 @@ def convert_text_to_mds( )] = 'INFO', ): + """Convert text files to MDS streaming format.""" convert_text_to_mds_from_args( output_folder=output_folder, input_folder=input_folder, diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index 3814787f25..0ee03a0d93 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -574,6 +574,26 @@ def convert_text_to_mds_from_args( trust_remote_code: bool, logging_level: str, ) -> None: + """A wrapper for `convert_finetuning_dataset` + + Args: + output_folder (str): The folder to write output to + input_folder (str): The folder with text files to convert to MDS + compression (str): The compression algorithm to use for MDS writing + concat_tokens (int): Convert text to tokens and concatenate up to this many tokens + tokenizer (str): The name of the tokenizer to use + bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples + eos_text (Optional[str]): The text to append to each example to separate concatenated examples + use_tokenizer_eos (bool): Use the EOS text from the tokenizer + no_wrap (bool): Whether to let text examples wrap across multiple training examples + processes (int): The number of processes to use to download and convert the dataset + reprocess (bool): If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters. + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + logging_level (str): Logging level for the script, e.g. INFO (required; this function defines no default).
+ + Raises: + ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None + """ if use_tokenizer_eos: # Ensure that eos text is not specified twice. if eos_text is not None: From 3a14cb65f8db1dd5101ce008f59495465c15dcdb Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Mon, 15 Jul 2024 12:06:42 -0700 Subject: [PATCH 14/18] update annotation --- llmfoundry/data_prep/convert_text_to_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index 0ee03a0d93..ba818fd83b 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -574,7 +574,7 @@ def convert_text_to_mds_from_args( trust_remote_code: bool, logging_level: str, ) -> None: - """A wrapper for `convert_finetuning_dataset` + """A wrapper for `convert_text_to_mds` Args: output_folder (str): The folder to write output to From ec73674a139091c39e8e52be7ace38e898e50f92 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Mon, 15 Jul 2024 12:12:48 -0700 Subject: [PATCH 15/18] typo --- llmfoundry/data_prep/convert_text_to_mds.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data_prep/convert_text_to_mds.py b/llmfoundry/data_prep/convert_text_to_mds.py index ba818fd83b..b9e48139f7 100644 --- a/llmfoundry/data_prep/convert_text_to_mds.py +++ b/llmfoundry/data_prep/convert_text_to_mds.py @@ -574,13 +574,13 @@ def convert_text_to_mds_from_args( trust_remote_code: bool, logging_level: str, ) -> None: - """A wrapper for `convert_text_to_mds` + """A wrapper for `convert_text_to_mds` to parse arguments. 
Args: - output_folder (str): The folder to write output to - input_folder (str): The folder with text files to convert to MDS + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process compression (str): The compression algorithm to use for MDS writing - concat_tokens (int): Convert text to tokens and concatenate up to this many tokens + concat_tokens (int): Concatenate up to this many tokens tokenizer (str): The name of the tokenizer to use bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples eos_text (Optional[str]): The text to append to each example to separate concatenated examples From b0dcceec2df12b78dcd4ad5db6fa307bcb8d8987 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Wed, 17 Jul 2024 13:35:08 -0700 Subject: [PATCH 16/18] precommit --- llmfoundry/command_utils/data_prep/convert_text_to_mds.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 36b88256f9..5c6256e05e 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -1,3 +1,6 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + import logging import math import os @@ -5,7 +8,7 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from glob import glob -from typing import Dict, Iterable, List, Tuple, cast, Optional +from typing import Dict, Iterable, List, Optional, Tuple, cast import numpy as np from composer.utils import ( From 5b4adf491dd0cbbaf9f73d6e99d53bdf0ff4287d Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Wed, 17 Jul 2024 14:07:30 -0700 Subject: [PATCH 17/18] precommit --- llmfoundry/cli/data_prep_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/cli/data_prep_cli.py 
b/llmfoundry/cli/data_prep_cli.py index b1fa8346ed..befded7278 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -3,6 +3,7 @@ from typing import Annotated, Optional +import psutil from typer import Option, Typer from llmfoundry.command_utils import ( From 61bb594c0705f230f08c96bb915b2f430864edf3 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Wed, 17 Jul 2024 17:30:43 -0700 Subject: [PATCH 18/18] pr comments --- llmfoundry/cli/data_prep_cli.py | 2 +- llmfoundry/command_utils/__init__.py | 14 -------------- .../command_utils/data_prep/convert_text_to_mds.py | 14 +++++++------- scripts/data_prep/convert_text_to_mds.py | 2 +- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py index befded7278..9cb7b0d240 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -95,7 +95,7 @@ def convert_text_to_mds( input_folder=input_folder, compression=compression, concat_tokens=concat_tokens, - tokenizer=tokenizer, + tokenizer_name=tokenizer, bos_text=bos_text, eos_text=eos_text, use_tokenizer_eos=use_tokenizer_eos, diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index fd9b866c1e..bdd9ad24f4 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -5,15 +5,8 @@ convert_dataset_hf_from_args, ) from llmfoundry.command_utils.data_prep.convert_text_to_mds import ( - DONE_FILENAME, convert_text_to_mds, convert_text_to_mds_from_args, - download_and_convert, - is_already_processed, - maybe_create_object_store_from_uri, - merge_shard_groups, - parse_uri, - write_done_file, ) from llmfoundry.command_utils.eval import ( eval_from_yaml, @@ -39,11 +32,4 @@ 'convert_dataset_hf_from_args', 'convert_text_to_mds', 'convert_text_to_mds_from_args', - 'maybe_create_object_store_from_uri', - 'parse_uri', - 'download_and_convert', - 'merge_shard_groups', - 'is_already_processed', - 
'write_done_file', - 'DONE_FILENAME', ] diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 5c6256e05e..14afe279fd 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -503,7 +503,7 @@ def convert_text_to_mds_from_args( input_folder: str, compression: str, concat_tokens: int, - tokenizer: str, + tokenizer_name: str, bos_text: Optional[str], eos_text: Optional[str], use_tokenizer_eos: bool, @@ -520,7 +520,7 @@ def convert_text_to_mds_from_args( input_folder (str): Folder of text files to process compression (str): The compression algorithm to use for MDS writing concat_tokens (int): Concatenate up to this many tokens - tokenizer (str): The name of the tokenizer to use + tokenizer_name (str): The name of the tokenizer to use bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples eos_text (Optional[str]): The text to append to each example to separate concatenated examples use_tokenizer_eos (bool): Use the EOS text from the tokenizer @@ -539,11 +539,11 @@ def convert_text_to_mds_from_args( ValueError( 'Cannot set --eos_text with --use_tokenizer_eos. 
Please specify one.', ) - built_tokenizer = AutoTokenizer.from_pretrained( - tokenizer, + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, trust_remote_code=trust_remote_code, ) - eos_text = built_tokenizer.eos_token + eos_text = tokenizer.eos_token # now that we have validated them, change BOS/EOS to strings if bos_text is None: @@ -554,7 +554,7 @@ def convert_text_to_mds_from_args( # Define args for _args_str args = { - 'tokenizer': tokenizer, + 'tokenizer': tokenizer_name, 'output_folder': output_folder, 'input_folder': input_folder, 'compression': compression, @@ -567,7 +567,7 @@ def convert_text_to_mds_from_args( 'trust_remote_code': trust_remote_code, } convert_text_to_mds( - tokenizer_name=tokenizer, + tokenizer_name=tokenizer_name, output_folder=output_folder, input_folder=input_folder, concat_tokens=concat_tokens, diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index b820d38b7d..c808fa871f 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -123,7 +123,7 @@ def parse_args() -> Namespace: input_folder=args.input_folder, compression=args.compression, concat_tokens=args.concat_tokens, - tokenizer=args.tokenizer, + tokenizer_name=args.tokenizer, bos_text=args.bos_text, eos_text=args.eos_text, use_tokenizer_eos=args.use_tokenizer_eos,