Merge branch 'main' into enhance/tolerance

modelscope · Sep 9, 2024 · cd51eec · cd51eec
2 parents bc7d55f + adb4ac9
commit cd51eec
Show file tree

Hide file tree

Showing 28 changed files with 893 additions and 55 deletions.
diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
@@ -1,7 +1,7 @@
 version: '3'
 services:
   ray-head:
-    image: data-juicer-unittest:0.2.1
+    image: data-juicer-unittest:0.2.2
     pull_policy: never
     command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
     environment:
@@ -30,7 +30,7 @@ services:
             capabilities: [gpu]
 
   ray-worker:
-    image: data-juicer-unittest:0.2.1
+    image: data-juicer-unittest:0.2.2
     pull_policy: never
     command: ray start --address=ray-head:6379 --block
     environment:

diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ dist
 .idea/
 wandb/
 __pycache__
+.vscode/
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -62,8 +62,29 @@ process:
   - clean_copyright_mapper:                                 # remove copyright comments.
   - expand_macro_mapper:                                    # expand macro definitions in Latex text.
   - extract_qa_mapper:                                      # mapper to extract question and answer pair from text.
-      hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
+      hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'           # model name on huggingface to extract question and answer pair.
+      pattern: null                                           # regular expression pattern to search for within text.
+      qa_format: 'chatml'                                     # Output format of question and answer pair.
+      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - fix_unicode_mapper:                                     # fix unicode errors in text.
+  - generate_instruction_mapper:                            # generate new instruction text data.
+      hf_model: 'Qwen/Qwen-7B-Chat'                           # model name on huggingface to generate instruction.
+      seed_file: 'demos/data/demo-dataset-chatml.jsonl'       # Seed file as instruction samples to generate new instructions, chatml format.
+      instruct_num: 3                                         # the number of generated samples.
+      similarity_threshold: 0.7                               # the similarity score threshold between the generated samples and the seed samples.Range from 0 to 1. Samples with similarity score less than this threshold will be kept.
+      prompt_template: null                                   # Prompt template for generate samples. Please make sure the template contains "{augmented_data}", which corresponds to the augmented samples.
+      qa_pair_template: null                                  # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'.
+      example_template: null                                  # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`.
+      qa_extraction_pattern: null                             # Regular expression pattern for parsing question and answer from model response.
+      enable_vllm: true                                       # Whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - image_blur_mapper:                                      # mapper to blur images.
       p: 0.2                                                  # probability of the image being blured
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
@@ -123,6 +144,13 @@ process:
       delete_random_char: false                               # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
       swap_random_char: false                                 # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
       replace_equivalent_num: false                           # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
+  - optimize_instruction_mapper:                            # optimize instruction.
+      hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine'        # model name on huggingface to optimize instruction
+      enable_vllm: true                                       # whether to use vllm for inference acceleration.
+      tensor_parallel_size: null                              # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
+      max_model_len: null                                     # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
+      max_num_seqs: 256                                       # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
+      sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
   - remove_bibliography_mapper:                             # remove bibliography from Latex text.
   - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.

diff --git a/configs/demo/analyser.yaml → configs/demo/analyzer.yaml b/configs/demo/analyser.yaml → configs/demo/analyzer.yaml
diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
@@ -86,6 +86,13 @@ def init_configs(args=None):
         help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
         'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
         '<w3> dataset3-path ...')
+    parser.add_argument(
+        '--generated_dataset_config',
+        type=Dict,
+        default=None,
+        help='Configuration used to create a dataset. '
+        'The dataset will be created from this configuration if provided. '
+        'It must contain the `type` field to specify the dataset name.')
     parser.add_argument(
         '--export_path',
         type=str,
@@ -377,7 +384,7 @@ def init_setup_from_cfg(cfg):
                  redirect=cfg.executor_type == 'default')
 
     # check and get dataset dir
-    if os.path.exists(cfg.dataset_path):
+    if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
         cfg.dataset_path = os.path.abspath(cfg.dataset_path)
         if os.path.isdir(cfg.dataset_path):
             cfg.dataset_dir = cfg.dataset_path

diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py
@@ -39,9 +39,12 @@ def __init__(self, cfg=None):
 
         # setup formatter
         logger.info('Setting up data formatter...')
-        self.formatter = load_formatter(self.cfg.dataset_path,
-                                        self.cfg.text_keys, self.cfg.suffixes,
-                                        self.cfg.add_suffix)
+        self.formatter = load_formatter(
+            dataset_path=self.cfg.dataset_path,
+            generated_dataset_config=self.cfg.generated_dataset_config,
+            text_keys=self.cfg.text_keys,
+            suffixes=self.cfg.suffixes,
+            add_suffix=self.cfg.add_suffix)
 
         # prepare exporter and check export path suffix
         # NOTICE: no need to export dataset texts for analyzer

diff --git a/data_juicer/core/data.py b/data_juicer/core/data.py
@@ -229,8 +229,9 @@ def map(self, *args, **kargs):
 
         if inspect.ismethod(called_func):
             # batched is required for fault-tolerant or batched OP
-            if not called_func.__self__.turbo or \
-                    called_func.__self__.is_batched_op():
+            if not called_func.__self__.turbo or hasattr(
+                    called_func.__self__,
+                    'is_batched_op') and called_func.__self__.is_batched_op():
                 kargs['batched'] = True
                 kargs['batch_size'] = kargs.pop('batch_size', 1)
             else:

diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py
@@ -48,9 +48,12 @@ def __init__(self, cfg=None):
 
         # setup formatter
         logger.info('Setting up data formatter...')
-        self.formatter = load_formatter(self.cfg.dataset_path,
-                                        self.cfg.text_keys, self.cfg.suffixes,
-                                        self.cfg.add_suffix)
+        self.formatter = load_formatter(
+            dataset_path=self.cfg.dataset_path,
+            generated_dataset_config=self.cfg.generated_dataset_config,
+            text_keys=self.cfg.text_keys,
+            suffixes=self.cfg.suffixes,
+            add_suffix=self.cfg.add_suffix)
 
         # whether to use checkpoint mechanism. If it's true, Executor will
         # check if there are existing checkpoints first and try to load the

diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py
@@ -47,7 +47,17 @@ def run(self, load_data_np=None):
         """
         # 1. load data
         logger.info('Loading dataset with Ray...')
-        dataset = rd.read_json(self.cfg.dataset_path)
+
+        if self.cfg.get('generated_dataset_config', None):
+            generated_dataset_config = self.cfg.generated_dataset_config
+            assert isinstance(generated_dataset_config,
+                              dict) and 'type' in generated_dataset_config
+            args = generated_dataset_config.copy()
+            obj_name = args.pop('type')
+            from data_juicer.format.formatter import FORMATTERS
+            dataset = FORMATTERS.modules[obj_name](**args).load_dataset()
+        else:
+            dataset = rd.read_json(self.cfg.dataset_path)
 
         # convert all the path in dataset to absolute path
         dataset = RayDataset(dataset, self.cfg.dataset_path, self.cfg)

diff --git a/data_juicer/format/__init__.py b/data_juicer/format/__init__.py
@@ -1,6 +1,8 @@
-from . import (csv_formatter, json_formatter, mixture_formatter,
-               parquet_formatter, text_formatter, tsv_formatter)
+from . import (csv_formatter, empty_formatter, json_formatter,
+               mixture_formatter, parquet_formatter, text_formatter,
+               tsv_formatter)
 from .csv_formatter import CsvFormatter
+from .empty_formatter import EmptyFormatter, RayEmptyFormatter
 from .formatter import LocalFormatter, RemoteFormatter
 from .json_formatter import JsonFormatter
 from .load import load_formatter
@@ -12,5 +14,5 @@
 __all__ = [
     'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
     'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
-    'MixtureFormatter'
+    'MixtureFormatter', 'EmptyFormatter', 'RayEmptyFormatter'
 ]
diff --git a/data_juicer/format/empty_formatter.py b/data_juicer/format/empty_formatter.py
@@ -0,0 +1,84 @@
+from typing import List
+
+import pandas as pd
+import ray
+from datasets import Dataset, Features, Value
+
+from .formatter import FORMATTERS, BaseFormatter
+
+
+@FORMATTERS.register_module()
+class EmptyFormatter(BaseFormatter):
+    """
+    The class is used to create empty data.
+    """
+    SUFFIXES = []
+
+    def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+        """
+        Initialization method.
+
+        :param length: The empty dataset length.
+        :param feature_keys: feature key name list.
+        """
+        self.length = length
+        self.feature_keys = feature_keys
+        if isinstance(self.feature_keys, str):
+            self.feature_keys = [self.feature_keys]
+
+    @property
+    def null_value(self):
+        return None
+
+    def load_dataset(self, *args, **kwargs):
+        data_dict = {}
+        features = Features()
+
+        for key in self.feature_keys:
+            features.update({key: Value('string')})
+            data_dict.update(
+                {key: [self.null_value for _ in range(self.length)]})
+
+        empty_dataset = Dataset.from_dict(data_dict, features=features)
+
+        from data_juicer.core.data import NestedDataset
+        empty_dataset = NestedDataset(empty_dataset)
+
+        return empty_dataset
+
+
+@FORMATTERS.register_module()
+class RayEmptyFormatter(BaseFormatter):
+    """
+    The class is used to create empty data for ray.
+    """
+    SUFFIXES = []
+
+    def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+        """
+        Initialization method.
+
+        :param length: The empty dataset length.
+        :param feature_keys: feature key name list.
+        """
+        self.length = length
+        self.feature_keys = feature_keys
+        if isinstance(self.feature_keys, str):
+            self.feature_keys = [self.feature_keys]
+
+    @property
+    def null_value(self):
+        return {}
+
+    def load_dataset(self, *args, **kwargs):
+        if len(self.feature_keys):
+            df = pd.DataFrame({
+                col: [self.null_value for _ in range(self.length)]
+                for col in self.feature_keys
+            })
+        else:
+            df = pd.DataFrame([self.null_value for _ in range(self.length)])
+
+        empty_dataset = ray.data.from_pandas(df)
+
+        return empty_dataset
diff --git a/data_juicer/format/load.py b/data_juicer/format/load.py
@@ -3,6 +3,7 @@
 
 
 def load_formatter(dataset_path,
+                   generated_dataset_config=None,
                    text_keys=None,
                    suffixes=[],
                    add_suffix=False,
@@ -12,13 +13,26 @@ def load_formatter(dataset_path,
     weight(default 1.0) according to their formats.
 
     :param dataset_path: path to a dataset file or a dataset directory
+    :param generated_dataset_config: Configuration used to create a dataset.
+        The dataset will be created from this configuration if provided.
+        It must contain the `type` field to specify the dataset name.
     :param text_keys: key names of field that stores sample text.
         Default: None
     :param suffixes: files with specified suffixes to be processed.
     :param add_suffix: whether to add the file suffix to dataset meta
         info
     :return: a dataset formatter.
     """
+    if generated_dataset_config:
+        assert isinstance(generated_dataset_config,
+                          dict) and 'type' in generated_dataset_config
+        args = generated_dataset_config.copy()
+        obj_name = args.pop('type')
+        args.update(kwargs)
+
+        from .formatter import FORMATTERS
+        return FORMATTERS.modules[obj_name](**args)
+
     formatter = MixtureFormatter(dataset_path=dataset_path,
                                  text_keys=text_keys,
                                  suffixes=suffixes,

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -2,10 +2,11 @@
 from . import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper,
                clean_copyright_mapper, clean_email_mapper, clean_html_mapper,
                clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
-               extract_qa_mapper, fix_unicode_mapper, image_blur_mapper,
+               extract_qa_mapper, fix_unicode_mapper,
+               generate_instruction_mapper, image_blur_mapper,
                image_captioning_from_gpt4v_mapper, image_captioning_mapper,
                image_diffusion_mapper, image_face_blur_mapper,
-               nlpaug_en_mapper, nlpcda_zh_mapper,
+               nlpaug_en_mapper, nlpcda_zh_mapper, optimize_instruction_mapper,
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
@@ -34,13 +35,15 @@
 from .expand_macro_mapper import ExpandMacroMapper
 from .extract_qa_mapper import ExtractQAMapper
 from .fix_unicode_mapper import FixUnicodeMapper
+from .generate_instruction_mapper import GenerateInstructionMapper
 from .image_blur_mapper import ImageBlurMapper
 from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
 from .image_captioning_mapper import ImageCaptioningMapper
 from .image_diffusion_mapper import ImageDiffusionMapper
 from .image_face_blur_mapper import ImageFaceBlurMapper
 from .nlpaug_en_mapper import NlpaugEnMapper
 from .nlpcda_zh_mapper import NlpcdaZhMapper
+from .optimize_instruction_mapper import OptimizeInstructionMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
@@ -92,6 +95,7 @@
     'VideoFFmpegWrappedMapper',
     'ChineseConvertMapper',
     'NlpcdaZhMapper',
+    'OptimizeInstructionMapper',
     'ImageBlurMapper',
     'CleanCopyrightMapper',
     'RemoveNonChineseCharacterlMapper',
@@ -108,6 +112,7 @@
     'RemoveWordsWithIncorrectSubstringsMapper',
     'VideoCaptioningFromVideoMapper',
     'VideoCaptioningFromSummarizerMapper',
+    'GenerateInstructionMapper',
     'FixUnicodeMapper',
     'NlpaugEnMapper',
     'VideoCaptioningFromFramesMapper',