From 573a7044c65fdf75b5a89ed76bf0168059a7dc86 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 16 Nov 2023 18:45:54 +0800
Subject: [PATCH 01/17] fix opencc serialization error

---
 data_juicer/ops/mapper/chinese_convert_mapper.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py
index 7d87a9165..8fc0a41c3 100644
--- a/data_juicer/ops/mapper/chinese_convert_mapper.py
+++ b/data_juicer/ops/mapper/chinese_convert_mapper.py
@@ -1,8 +1,12 @@
-import opencc
-
 from ..base_op import OPERATORS, Mapper
 
 
+def prepare_converter(mode):
+    global OPENCC_CONVERTER
+    import opencc
+    OPENCC_CONVERTER = opencc.OpenCC(mode + '.json')
+
+
 @OPERATORS.register_module('chinese_convert_mapper')
 class ChineseConvertMapper(Mapper):
     """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
@@ -39,9 +43,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs):
         ]
         assert mode in mode_list, 'Please make sure mode is one of {}'.format(
             mode_list)
-        self.converter = opencc.OpenCC(mode + '.json')
+        prepare_converter(mode)
 
     def process(self, sample):
 
-        sample[self.text_key] = self.converter.convert(sample[self.text_key])
+        sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key])
         return sample

From 4fee9a121e0e4825f749033e90886b357ab067d7 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Tue, 21 Nov 2023 19:19:53 +0800
Subject: [PATCH 02/17] support audio-text data reading

---
 configs/config_all.yaml                       |   2 +
 data_juicer/utils/mm_utils.py                 |  15 +-
 tools/multimodal/README.md                    |   1 +
 tools/multimodal/README_ZH.md                 |   1 +
 .../dj_to_llava.py                            |   4 +-
 .../dj_to_wavcaps.py                          | 122 +++++++++
 .../wavcaps_to_dj.py                          | 231 ++++++++++++++++++
 7 files changed, 372 insertions(+), 4 deletions(-)
 create mode 100644 tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
 create mode 100644 tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 58970a08b..95d9623fc 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -26,6 +26,8 @@ cache_compress: null                                        # The compression me
 # for multimodal data processing
 image_key: 'images'                                         # Key name of field to store the list of sample image paths.
 image_special_token: '<__dj__image>'                        # The special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
+audio_key: 'audios'                                         # Key name of field to store the list of sample audio paths.
+audio_special_token: '<__dj__audio>'                        # The special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
 
 eoc_special_token: '<|__dj__eoc|>'                          # The special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
 
diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
index ea6b2063f..8a3f8c67e 100644
--- a/data_juicer/utils/mm_utils.py
+++ b/data_juicer/utils/mm_utils.py
@@ -1,4 +1,4 @@
-from datasets import Image
+from datasets import Image, Audio
 
 from data_juicer.utils.constant import DEFAULT_PREFIX
 
@@ -8,6 +8,7 @@
 class SpecialTokens(object):
     # modality
     image = f'<{DEFAULT_PREFIX}image>'
+    audio = f'<{DEFAULT_PREFIX}audio>'
 
     # others
     eoc = f'<|{DEFAULT_PREFIX}eoc|>'
@@ -17,13 +18,23 @@ def load_images(paths):
     return [load_image(path) for path in paths]
 
 
+def load_audios(paths):
+    return [load_audio(path) for path in paths]
+
+
 def load_image(path):
     img_feature = Image()
     img = img_feature.decode_example(img_feature.encode_example(path))
     return img
 
 
-def get_image_size(path):
+def load_audio(path, sampling_rate=None):
+    aud_feature = Audio(sampling_rate)
+    aud = aud_feature.decode_example(aud_feature.encode_example(path))
+    return (aud['array'], aud['sampling_rate'])
+    
+
+def get_image_size(path, ):
     import os
     return os.path.getsize(path)
 
diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md
index b9175c27c..d6d62c62b 100644
--- a/tools/multimodal/README.md
+++ b/tools/multimodal/README.md
@@ -18,6 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol
 | Format     | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref.                                                                                                             |
 |------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------|
 | LLaVA-like | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
+| WavCaps-like  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | - |
 
 For all tools, you can run the following command to find out the usage of them:
 
diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md
index 9eb7757ce..af05a610d 100644
--- a/tools/multimodal/README_ZH.md
+++ b/tools/multimodal/README_ZH.md
@@ -15,6 +15,7 @@
 | 格式        | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考                                                                                               |
 |-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------|
 | 类LLaVA格式  | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
+| 类WavCaps格式  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | - |
 
 对于所有工具，您可以运行以下命令来了解它们的详细用法：
 
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
index b0c1495df..477d0c2e3 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
@@ -1,5 +1,5 @@
-# This tool is used to convert multimodal dataset in LLaVA format to a target
-# dataset in Data-Juicer format.
+# This tool is used to convert multimodal dataset in Data-Juicer format to a target
+# dataset in LLaVA format.
 #
 # Corresponding Data-Juicer format:
 #   - multi-chunk interleaved image-text sequence
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
new file mode 100644
index 000000000..d88937b14
--- /dev/null
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -0,0 +1,122 @@
+# This tool is used to convert multimodal dataset in Data-Juicer format to a target
+# dataset in WavCaps format.
+#
+# Data-Juicer format:
+# {'audios': ['./path/to/audio/2219.flac'],
+#  'text': '<audio>\n'
+#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
+#       'category': '',
+#       'tags': '' }}
+# {'audios': ['./path/to/audio/2218.flac'],
+#  'text': '<audio>\n'
+#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
+#       'category': '',
+#       'tags': '' }}
+#
+# Corresponding WavCps format:
+# { num_captions_per_audio: 1,
+#   data: [{
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'    
+#   },  {
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav'
+#   },
+#   ...]
+# }
+
+import json
+import os
+
+import fire
+import jsonlines as jl
+from loguru import logger
+from tqdm import tqdm
+
+from data_juicer.utils.constant import Fields
+
+
+@logger.catch
+def main(
+    dj_ds_path: str,
+    target_wavcaps_ds_path: str
+):
+    """
+    Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
+
+    :param dj_ds_path: path to the input dataset in Data-Juicer format.
+    :param target_wavcaps_ds_path: path to store the converted dataset in WavCaps
+        format.
+    """
+
+    if not os.path.exists(dj_ds_path):
+        raise FileNotFoundError(
+            f'Input dataset [{dj_ds_path}] can not be found.')
+    if not target_wavcaps_ds_path.endswith('.json'):
+        raise ValueError(
+            'Only support "json" target dataset file for WavCaps now.')
+    if os.path.dirname(target_wavcaps_ds_path) \
+            and not os.path.exists(os.path.dirname(target_wavcaps_ds_path)):
+        logger.info(
+            f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] for '
+            f'the target dataset.')
+        os.makedirs(os.path.dirname(target_wavcaps_ds_path))
+
+    logger.info('Start to convert.')
+    samples = {'num_captions_per_audio': 1, 
+               'data': []}
+    with jl.open(dj_ds_path, 'r') as reader:
+        for sample in tqdm(reader):
+            if not Fields.meta in sample:
+                logger.info(
+                    f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] for '
+                    f'the target dataset.')   
+                continue
+            else:
+                samples['num_captions_per_audio'] = sample[Fields.meta]['num_captions_per_audio']
+                del sample[Fields.meta]['num_captions_per_audio']
+                samples['data'].append(sample[Fields.meta])
+
+    logger.info(f'Start to write the converted dataset to '
+                f'[{target_wavcaps_ds_path}]...')
+    json.dump(samples, open(target_wavcaps_ds_path, 'w', encoding='utf-8'))
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
new file mode 100644
index 000000000..e84831fb0
--- /dev/null
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -0,0 +1,231 @@
+# This tool is used to convert multimodal dataset in WavCaps format to a target
+# dataset in Data-Juicer format.
+#
+# WavCps format:
+# { num_captions_per_audio: 1,
+#   data: [{
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'    
+#   },  {
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav'
+#   },
+#   ...]
+# }
+#
+# Corresponding Data-Juicer format:
+# {'audios': ['./path/to/audio/2219.flac'],
+#  'text': '<audio>\n'
+#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
+#       'category': '',
+#       'tags': '' }}
+# {'audios': ['./path/to/audio/2218.flac'],
+#  'text': '<audio>\n'
+#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
+#       'category': '',
+#       'tags': '' }}
+
+import json
+import os
+
+import fire
+import jsonlines as jl
+from loguru import logger
+from tqdm import tqdm
+from typing import List, Union
+
+from data_juicer.utils.mm_utils import SpecialTokens
+from data_juicer.utils.constant import Fields
+
+
+def creat_meta_filed(num_captions_per_audio, source_meta):
+    meta_dict = {
+      'num_captions_per_audio': num_captions_per_audio,
+      'title': '',
+      'description': '',
+      'author': '',
+      'href': '',
+      'caption': '',
+      'id': '',
+      'duration': '',
+      'audio': '',
+      'download_link': '',
+      'category': '',
+      'tags': ''
+    }
+    for key in source_meta:
+        meta_dict[key] = source_meta[key]
+    return meta_dict
+
+
+def get_all_files(dirname):
+    result = {}
+    for maindir, subdir, file_name_list in os.walk(dirname):
+        for filename in file_name_list:
+            filepath = os.path.join(maindir, filename)
+            result[filename] = filepath
+    return result
+
+
+@logger.catch
+def main(
+    wavcaps_json_path: str,
+    wavcaps_audio_path: str,
+    target_ds_path: str,
+    target_field: Union[str, List[str]] = 'caption',
+    eoc_special_token: str = SpecialTokens.eoc,
+    audio_special_token: str = '<audio>',
+    add_eoc_at_last: bool = True,
+    add_target_field_token: bool = True,
+    sent_seperator: str = '\n',
+):
+    """
+    Convert a WavCaps-like dataset to the Data-Juicer format.
+
+    :param wavcaps_json_path: path to the json files of WavCaps-like dataset.
+    :param wavcaps_audio_path: path to the audio files of WavCaps-like dataset.  
+    :param target_ds_path: path to store the converted dataset in Data-Juicer
+        format.
+    :param target_field: the field used to describe audio in the WavCaps-like 
+        dataset, which can be one or more of ['caption', 'title', 'description'].
+    :param eoc_special_token: the special token for "end of a chunk". It's used
+        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
+        Data-Juicer).
+    :param audio_special_token: the special token for audios. It's used to
+        locate the audios in the text. In typical WavCaps-like datasets,
+        this token always be "<audio>". You can change it to align with your
+        own WavCaps-like datasets but should be careful of possible compatibility
+        problems that come from this change. Default: <audio>.
+    :param add_eoc_at_last: whether to add an extra eoc_special_token at the
+        end of text. Default: True.
+    :param add_target_field_token: whether to add an extra target_field_token into
+        text.
+    :param sent_seperator: seperator to split different sentences. Default: \n.
+    """
+    # ----- Constant settings. Better not to change them. -----
+    text_key = 'text'  # default key of field to store the sample text
+    audio_key = 'audios'  # default key of field to store the audio list
+    from_format = '[[%s]]: '  # default handle method for the text label
+    # ----- Constant settings. Better not to change them. -----
+
+    # check arguments
+    # check paths
+    if not os.path.exists(wavcaps_json_path):
+        raise FileNotFoundError(f'Input WavCaps json path [{wavcaps_json_path}] can '
+                                f'not be found.')
+    if not os.path.exists(wavcaps_audio_path):
+        raise FileNotFoundError(f'Input WavCaps audio path [{wavcaps_audio_path}] can '
+                                f'not be found.')
+    if not target_ds_path.endswith('.jsonl'):
+        raise ValueError('Only support "jsonl" target dataset file now.')
+
+    if not isinstance(target_field, list):
+        target_field = [target_field]
+    for tag in target_field:
+        if tag not in ['caption', 'description', 'title']:
+            raise ValueError("target_filed must be in '['caption', 'description', 'title']'")
+
+    if os.path.dirname(target_ds_path) \
+            and not os.path.exists(os.path.dirname(target_ds_path)):
+        logger.info(f'Create directory [{os.path.dirname(target_ds_path)}] '
+                    f'for the target dataset.')
+        os.makedirs(os.path.dirname(target_ds_path))
+
+    # check if the default audio special token is changed
+    if audio_special_token != '<audio>':
+        logger.warning('The audio_special_token used in the original WavCaps '
+                       'dataset is "<audio>". It\'s better to align the this '
+                       'token. There might be some compatibility problem if '
+                       'you change it.')
+    # check whether to add the eoc special token at last
+    if not add_eoc_at_last:
+        logger.warning('You choose not to add special eoc token at the last, '
+                       'which might cause some compatibility problems for '
+                       'other type of datasets (e.g. OpenFlamingo).')
+    
+    if isinstance(target_field, str):
+        target_field = [target_field]
+
+    # load WavCaps dataset
+    logger.info('Loading original WavCaps dataset.')
+    wavcaps_ds = json.load(open(wavcaps_json_path, 'r', encoding='utf-8'))
+    num_captions_per_audio = wavcaps_ds['num_captions_per_audio']
+    wavcaps_ds = wavcaps_ds['data']
+    logger.info(f'Load [{len(wavcaps_ds)}] samples.')
+    all_audio_files = get_all_files(wavcaps_audio_path)
+
+    with jl.open(target_ds_path, 'w') as writer:
+        for sample in tqdm(wavcaps_ds):
+            # id
+            audio_name = sample['id'].strip().split('.')[0] + '.flac'
+            target_meta = creat_meta_filed(num_captions_per_audio, sample)
+           
+            # audio and text
+            if audio_name not in all_audio_files:
+                logger.warning(f'No audios in the sample with id [{audio_name}], '
+                               f'which means this sample is not a multimodal '
+                               f'sample. You\'d better remove this sample '
+                               f'before converting.')
+                continue
+            audio = [all_audio_files[audio_name]]
+            text = audio_special_token
+            for tag in target_field:
+                if tag not in sample.keys():
+                    logger.warning(f'{tag} does not exist in this sample.')
+                    continue
+                if add_target_field_token:
+                    text += sent_seperator + from_format % tag + sample[tag]
+                else:
+                    text += sent_seperator + sample[tag]
+
+            if add_eoc_at_last:
+                text += eoc_special_token
+            
+            # get the new sample with Data-Juicer format
+            new_sample = {
+                text_key: text,
+                audio_key: audio,
+                Fields.meta: target_meta
+            }
+            writer.write(new_sample)
+    logger.info(f'Store the target dataset into [{target_ds_path}].')
+
+
+if __name__ == '__main__':
+    fire.Fire(main)

From d856a8080b5a0e9e36ef411321827ae83d5b7355 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Wed, 22 Nov 2023 12:48:51 +0800
Subject: [PATCH 03/17] update multimodal_README

---
 tools/multimodal/README.md                    | 38 ++++++++++++++++++-
 tools/multimodal/README_ZH.md                 | 36 +++++++++++++++++-
 .../dj_to_wavcaps.py                          |  4 +-
 .../wavcaps_to_dj.py                          |  4 +-
 4 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md
index d6d62c62b..b950ea08b 100644
--- a/tools/multimodal/README.md
+++ b/tools/multimodal/README.md
@@ -18,7 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol
 | Format     | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref.                                                                                                             |
 |------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------|
 | LLaVA-like | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
-| WavCaps-like  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | - |
+| WavCaps-like  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | [Format Description](https://github.com/XinhaoMei/WavCaps#table-of-contents) |
 
 For all tools, you can run the following command to find out the usage of them:
 
@@ -92,3 +92,39 @@ and converted datasets, so we can regard this sample is aligned with the origina
     }
 ]
 ```
+
+### WavCaps-like
+
+The [WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) dataset is composed of four sub-datasets: [FreeSound](https://freesound.org/), [BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/), [SoundBible](https://soundbible.com/) and [AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html). Each sub-dataset has a different set of fields. For example, the 'description' field is included in SoundBible but does not exist in AudioSet. To ensure that the different sub-datasets can be properly merged after conversion, the union of all fields from the sub-datasets is used during the `wavcaps_to_dj` stage, and all fields are fully retained during the `dj_to_wavcaps` stage.
+
+```json
+# original dataset
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]    
+}
+
+# converted dataset
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
+        "category": "",
+        "tags": "" }]    
+}
+```
diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md
index af05a610d..be63af955 100644
--- a/tools/multimodal/README_ZH.md
+++ b/tools/multimodal/README_ZH.md
@@ -15,7 +15,7 @@
 | 格式        | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考                                                                                               |
 |-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------|
 | 类LLaVA格式  | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
-| 类WavCaps格式  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | - |
+| 类WavCaps格式  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | [格式描述](https://github.com/XinhaoMei/WavCaps#table-of-contents) |
 
 对于所有工具，您可以运行以下命令来了解它们的详细用法：
 
@@ -75,3 +75,37 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel
     }
 ]
 ```
+
+#### 类WavCaps格式
+[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) 数据集由 [FreeSound](https://freesound.org/)，[BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/)，[SoundBible](https://soundbible.com/)，[AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html) 四个子数据集组成，每个数据集里都有不同的字段。例如SoundBible里包含了‘description’字段，而该字段在AudioSet里并不存在。为了保证不同子数据集在转换后能够正常合并，在wavcaps_to_dj阶段使用了所有子数据集字段的并集，并在dj_to_wavcaps阶段完整保留了所有字段。
+```json
+# 原始数据集
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]    
+}
+
+# 转换后数据集
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
+        "category": "",
+        "tags": "" }]    
+}
+```
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index d88937b14..1f95b7a59 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -36,8 +36,8 @@
 #       'tags': '' }}
 #
 # Corresponding WavCps format:
-# { num_captions_per_audio: 1,
-#   data: [{
+# { 'num_captions_per_audio': 1,
+#   'data': [{
 #       'title': 'Airplane Landing Airport',
 #       'description': 'Large commercial airplane landing at an airport runway.',
 #       'author': 'Daniel Simion',
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index e84831fb0..61c4d74ca 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -2,8 +2,8 @@
 # dataset in Data-Juicer format.
 #
 # WavCps format:
-# { num_captions_per_audio: 1,
-#   data: [{
+# { 'num_captions_per_audio': 1,
+#   'data': [{
 #       'title': 'Airplane Landing Airport',
 #       'description': 'Large commercial airplane landing at an airport runway.',
 #       'author': 'Daniel Simion',

From fc98733483522a94caa55788fea70ba0cbf9ddf6 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 23 Nov 2023 11:31:06 +0800
Subject: [PATCH 04/17] fix pre-commit error

---
 data_juicer/utils/mm_utils.py                 |  4 +-
 .../dj_to_llava.py                            |  4 +-
 .../dj_to_wavcaps.py                          | 39 ++++-----
 .../wavcaps_to_dj.py                          | 81 ++++++++++---------
 4 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
index 8a3f8c67e..817f298bd 100644
--- a/data_juicer/utils/mm_utils.py
+++ b/data_juicer/utils/mm_utils.py
@@ -1,4 +1,4 @@
-from datasets import Image, Audio
+from datasets import Audio, Image
 
 from data_juicer.utils.constant import DEFAULT_PREFIX
 
@@ -32,7 +32,7 @@ def load_audio(path, sampling_rate=None):
     aud_feature = Audio(sampling_rate)
     aud = aud_feature.decode_example(aud_feature.encode_example(path))
     return (aud['array'], aud['sampling_rate'])
-    
+
 
 def get_image_size(path, ):
     import os
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
index 477d0c2e3..c58a06604 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
@@ -1,5 +1,5 @@
-# This tool is used to convert multimodal dataset in Data-Juicer format to a target
-# dataset in LLaVA format.
+# This tool is used to convert multimodal dataset in Data-Juicer format to a
+# target dataset in LLaVA format.
 #
 # Corresponding Data-Juicer format:
 #   - multi-chunk interleaved image-text sequence
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index 1f95b7a59..4f98166cf 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -1,5 +1,5 @@
-# This tool is used to convert multimodal dataset in Data-Juicer format to a target
-# dataset in WavCaps format.
+# This tool is used to convert multimodal dataset in Data-Juicer format to a
+# target dataset in WavCaps format.
 #
 # Data-Juicer format:
 # {'audios': ['./path/to/audio/2219.flac'],
@@ -8,7 +8,7 @@
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
-#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2219-Airplane-Landing-Airport.html',
 #       'caption': 'An airplane is landing.',
@@ -24,7 +24,7 @@
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',
-#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2218-Service-Bell-Help.html',
 #       'caption': 'Someone is ringing a bell.',
@@ -39,17 +39,17 @@
 # { 'num_captions_per_audio': 1,
 #   'data': [{
 #       'title': 'Airplane Landing Airport',
-#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2219-Airplane-Landing-Airport.html',
 #       'caption': 'An airplane is landing.',
 #       'id': '2219',
 #       'duration': 14.1424375,
 #       'audio': 'wav_path',
-#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'    
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
 #   },  {
 #       'title': 'Service Bell Help',
-#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2218-Service-Bell-Help.html',
 #       'caption': 'Someone is ringing a bell.',
@@ -73,16 +73,13 @@
 
 
 @logger.catch
-def main(
-    dj_ds_path: str,
-    target_wavcaps_ds_path: str
-):
+def main(dj_ds_path: str, target_wavcaps_ds_path: str):
     """
     Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
 
     :param dj_ds_path: path to the input dataset in Data-Juicer format.
-    :param target_wavcaps_ds_path: path to store the converted dataset in WavCaps
-        format.
+    :param target_wavcaps_ds_path: path to store the converted dataset in
+        WavCaps format.
     """
 
     if not os.path.exists(dj_ds_path):
@@ -94,22 +91,20 @@ def main(
     if os.path.dirname(target_wavcaps_ds_path) \
             and not os.path.exists(os.path.dirname(target_wavcaps_ds_path)):
         logger.info(
-            f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] for '
-            f'the target dataset.')
+            f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] '
+            f'for the target dataset.')
         os.makedirs(os.path.dirname(target_wavcaps_ds_path))
 
     logger.info('Start to convert.')
-    samples = {'num_captions_per_audio': 1, 
-               'data': []}
+    samples = {'num_captions_per_audio': 1, 'data': []}
     with jl.open(dj_ds_path, 'r') as reader:
         for sample in tqdm(reader):
-            if not Fields.meta in sample:
-                logger.info(
-                    f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] for '
-                    f'the target dataset.')   
+            if Fields.meta not in sample:
+                logger.warning(f'{Fields.meta} does not exist in this sample.')
                 continue
             else:
-                samples['num_captions_per_audio'] = sample[Fields.meta]['num_captions_per_audio']
+                samples['num_captions_per_audio'] = sample[
+                    Fields.meta]['num_captions_per_audio']
                 del sample[Fields.meta]['num_captions_per_audio']
                 samples['data'].append(sample[Fields.meta])
 
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index 61c4d74ca..d91989cbe 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -5,17 +5,17 @@
 # { 'num_captions_per_audio': 1,
 #   'data': [{
 #       'title': 'Airplane Landing Airport',
-#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2219-Airplane-Landing-Airport.html',
 #       'caption': 'An airplane is landing.',
 #       'id': '2219',
 #       'duration': 14.1424375,
 #       'audio': 'wav_path',
-#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'    
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
 #   },  {
 #       'title': 'Service Bell Help',
-#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2218-Service-Bell-Help.html',
 #       'caption': 'Someone is ringing a bell.',
@@ -34,7 +34,7 @@
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
-#       'description': 'Large commercial airplane landing at an airport runway.',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2219-Airplane-Landing-Airport.html',
 #       'caption': 'An airplane is landing.',
@@ -50,7 +50,7 @@
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',
-#       'description': 'Customer ringing service bell in need of help in a store.',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
 #       'author': 'Daniel Simion',
 #       'href': '2218-Service-Bell-Help.html',
 #       'caption': 'Someone is ringing a bell.',
@@ -63,31 +63,31 @@
 
 import json
 import os
+from typing import List, Union
 
 import fire
 import jsonlines as jl
 from loguru import logger
 from tqdm import tqdm
-from typing import List, Union
 
-from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields
+from data_juicer.utils.mm_utils import SpecialTokens
 
 
 def creat_meta_filed(num_captions_per_audio, source_meta):
     meta_dict = {
-      'num_captions_per_audio': num_captions_per_audio,
-      'title': '',
-      'description': '',
-      'author': '',
-      'href': '',
-      'caption': '',
-      'id': '',
-      'duration': '',
-      'audio': '',
-      'download_link': '',
-      'category': '',
-      'tags': ''
+        'num_captions_per_audio': num_captions_per_audio,
+        'title': '',
+        'description': '',
+        'author': '',
+        'href': '',
+        'caption': '',
+        'id': '',
+        'duration': '',
+        'audio': '',
+        'download_link': '',
+        'category': '',
+        'tags': ''
     }
     for key in source_meta:
         meta_dict[key] = source_meta[key]
@@ -119,23 +119,23 @@ def main(
     Convert a WavCaps-like dataset to the Data-Juicer format.
 
     :param wavcaps_json_path: path to the json files of WavCaps-like dataset.
-    :param wavcaps_audio_path: path to the audio files of WavCaps-like dataset.  
+    :param wavcaps_audio_path: path to the audio files of WavCaps-like dataset.
     :param target_ds_path: path to store the converted dataset in Data-Juicer
         format.
-    :param target_field: the field used to describe audio in the WavCaps-like 
-        dataset, which can be one or more of ['caption', 'title', 'description'].
+    :param target_field: the field used to describe audio in the WavCaps-like
+        dataset, which can be one or more of ['caption','title','description'].
     :param eoc_special_token: the special token for "end of a chunk". It's used
         to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
         Data-Juicer).
     :param audio_special_token: the special token for audios. It's used to
         locate the audios in the text. In typical WavCaps-like datasets,
         this token always be "<audio>". You can change it to align with your
-        own WavCaps-like datasets but should be careful of possible compatibility
-        problems that come from this change. Default: <audio>.
+        own WavCaps-like datasets but should be careful of possible
+        compatibility problems that come from this change. Default: <audio>.
     :param add_eoc_at_last: whether to add an extra eoc_special_token at the
         end of text. Default: True.
-    :param add_target_field_token: whether to add an extra target_field_token into
-        text.
+    :param add_target_field_token: whether to add an extra target_field_token
+        into text.
     :param sent_seperator: seperator to split different sentences. Default: \n.
     """
     # ----- Constant settings. Better not to change them. -----
@@ -147,11 +147,13 @@ def main(
     # check arguments
     # check paths
     if not os.path.exists(wavcaps_json_path):
-        raise FileNotFoundError(f'Input WavCaps json path [{wavcaps_json_path}] can '
-                                f'not be found.')
+        raise FileNotFoundError(
+            f'Input WavCaps json path [{wavcaps_json_path}] can '
+            f'not be found.')
     if not os.path.exists(wavcaps_audio_path):
-        raise FileNotFoundError(f'Input WavCaps audio path [{wavcaps_audio_path}] can '
-                                f'not be found.')
+        raise FileNotFoundError(
+            f'Input WavCaps audio path [{wavcaps_audio_path}] can '
+            f'not be found.')
     if not target_ds_path.endswith('.jsonl'):
         raise ValueError('Only support "jsonl" target dataset file now.')
 
@@ -159,7 +161,9 @@ def main(
         target_field = [target_field]
     for tag in target_field:
         if tag not in ['caption', 'description', 'title']:
-            raise ValueError("target_filed must be in '['caption', 'description', 'title']'")
+            raise ValueError(
+                "target_filed must be in '['caption', 'description', 'title']'"
+            )
 
     if os.path.dirname(target_ds_path) \
             and not os.path.exists(os.path.dirname(target_ds_path)):
@@ -178,7 +182,7 @@ def main(
         logger.warning('You choose not to add special eoc token at the last, '
                        'which might cause some compatibility problems for '
                        'other type of datasets (e.g. OpenFlamingo).')
-    
+
     if isinstance(target_field, str):
         target_field = [target_field]
 
@@ -195,13 +199,14 @@ def main(
             # id
             audio_name = sample['id'].strip().split('.')[0] + '.flac'
             target_meta = creat_meta_filed(num_captions_per_audio, sample)
-           
+
             # audio and text
             if audio_name not in all_audio_files:
-                logger.warning(f'No audios in the sample with id [{audio_name}], '
-                               f'which means this sample is not a multimodal '
-                               f'sample. You\'d better remove this sample '
-                               f'before converting.')
+                logger.warning(
+                    f'No audios in the sample with id [{audio_name}], '
+                    f'which means this sample is not a multimodal '
+                    f'sample. You\'d better remove this sample '
+                    f'before converting.')
                 continue
             audio = [all_audio_files[audio_name]]
             text = audio_special_token
@@ -216,7 +221,7 @@ def main(
 
             if add_eoc_at_last:
                 text += eoc_special_token
-            
+
             # get the new sample with Data-Juicer format
             new_sample = {
                 text_key: text,

From 539f09971dea8373db9214cc686d80cee579a472 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 23 Nov 2023 15:51:03 +0800
Subject: [PATCH 05/17] modify audio_special_token

---
 .../dj_to_wavcaps.py                          |  8 +++----
 .../wavcaps_to_dj.py                          | 23 ++++++-------------
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index 4f98166cf..4e0d13eff 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -3,8 +3,8 @@
 #
 # Data-Juicer format:
 # {'audios': ['./path/to/audio/2219.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
@@ -19,8 +19,8 @@
 #       'category': '',
 #       'tags': '' }}
 # {'audios': ['./path/to/audio/2218.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index d91989cbe..911d403eb 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -29,8 +29,8 @@
 #
 # Corresponding Data-Juicer format:
 # {'audios': ['./path/to/audio/2219.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
@@ -45,8 +45,8 @@
 #       'category': '',
 #       'tags': '' }}
 # {'audios': ['./path/to/audio/2218.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',
@@ -110,9 +110,9 @@ def main(
     target_ds_path: str,
     target_field: Union[str, List[str]] = 'caption',
     eoc_special_token: str = SpecialTokens.eoc,
-    audio_special_token: str = '<audio>',
+    audio_special_token: str = SpecialTokens.audio,
     add_eoc_at_last: bool = True,
-    add_target_field_token: bool = True,
+    add_target_field_token: bool = False,
     sent_seperator: str = '\n',
 ):
     """
@@ -128,10 +128,7 @@ def main(
         to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
         Data-Juicer).
     :param audio_special_token: the special token for audios. It's used to
-        locate the audios in the text. In typical WavCaps-like datasets,
-        this token always be "<audio>". You can change it to align with your
-        own WavCaps-like datasets but should be careful of possible
-        compatibility problems that come from this change. Default: <audio>.
+        locate the audios in the text.
     :param add_eoc_at_last: whether to add an extra eoc_special_token at the
         end of text. Default: True.
     :param add_target_field_token: whether to add an extra target_field_token
@@ -171,12 +168,6 @@ def main(
                     f'for the target dataset.')
         os.makedirs(os.path.dirname(target_ds_path))
 
-    # check if the default audio special token is changed
-    if audio_special_token != '<audio>':
-        logger.warning('The audio_special_token used in the original WavCaps '
-                       'dataset is "<audio>". It\'s better to align the this '
-                       'token. There might be some compatibility problem if '
-                       'you change it.')
     # check whether to add the eoc special token at last
     if not add_eoc_at_last:
         logger.warning('You choose not to add special eoc token at the last, '

From 4c27643f4631835b843f2e0ec191616fdc61beea Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 23 Nov 2023 17:33:18 +0800
Subject: [PATCH 06/17] support only one target_field

---
 .../dj_to_wavcaps.py                          | 50 ++++++++++++++++---
 .../wavcaps_to_dj.py                          | 34 +++++--------
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index 4e0d13eff..3b5fc63c0 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -70,17 +70,42 @@
 from tqdm import tqdm
 
 from data_juicer.utils.constant import Fields
+from data_juicer.utils.mm_utils import SpecialTokens
 
 
 @logger.catch
-def main(dj_ds_path: str, target_wavcaps_ds_path: str):
+def main(
+    dj_ds_path: str, 
+    target_wavcaps_ds_path: str,
+    target_field: str = 'caption',
+    eoc_special_token: str = SpecialTokens.eoc,
+    audio_special_token: str = SpecialTokens.audio,
+    remove_eoc_at_last: bool = True,
+    remove_target_field_token: bool = False,
+    sent_seperator: str = '\n',
+    ):
     """
     Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
 
     :param dj_ds_path: path to the input dataset in Data-Juicer format.
     :param target_wavcaps_ds_path: path to store the converted dataset in
         WavCaps format.
+    :param target_field: the field used to describe audio in the WavCaps-like
+        dataset, which can be one of ['caption','title','description'].
+    :param eoc_special_token: the special token for "end of a chunk". It's used
+        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
+        Data-Juicer).
+    :param audio_special_token: the special token for audios. It's used to
+        locate the audios in the text.
+    :param remove_eoc_at_last: whether to remove the extra eoc_special_token at the
+        end of text. Default: True.
+    :param remove_target_field_token: whether to remove the extra target_field_token
+        at text.
+    :param sent_seperator: seperator to split different sentences. Default: \n.
     """
+    # ----- Constant settings. Better not to change them. -----
+    from_format = '[[%s]]: '  # default handle method for the text label
+    # ----- Constant settings. Better not to change them. -----
 
     if not os.path.exists(dj_ds_path):
         raise FileNotFoundError(
@@ -95,6 +120,11 @@ def main(dj_ds_path: str, target_wavcaps_ds_path: str):
             f'for the target dataset.')
         os.makedirs(os.path.dirname(target_wavcaps_ds_path))
 
+    if target_field not in ['caption', 'description', 'title']:
+        raise ValueError(
+            "target_field must be in '['caption', 'description', 'title']'"
+        )
+
     logger.info('Start to convert.')
     samples = {'num_captions_per_audio': 1, 'data': []}
     with jl.open(dj_ds_path, 'r') as reader:
@@ -102,11 +132,19 @@ def main(dj_ds_path: str, target_wavcaps_ds_path: str):
             if Fields.meta not in sample:
                 logger.warning(f'{Fields.meta} does not exist in this sample.')
                 continue
-            else:
-                samples['num_captions_per_audio'] = sample[
-                    Fields.meta]['num_captions_per_audio']
-                del sample[Fields.meta]['num_captions_per_audio']
-                samples['data'].append(sample[Fields.meta])
+
+            if target_field not in sample[Fields.meta].keys():
+                logger.warning(f'{target_field} does not exist in this sample.')
+                continue
+            samples['num_captions_per_audio'] = sample[Fields.meta]['num_captions_per_audio']
+            del sample[Fields.meta]['num_captions_per_audio']
+
+            sample[Fields.meta][target_field] = sample['text'].replace(audio_special_token + sent_seperator, "")
+            if remove_eoc_at_last:
+                sample[Fields.meta][target_field] = sample[Fields.meta][target_field].replace(eoc_special_token, "")
+            if remove_target_field_token:
+                sample[Fields.meta][target_field] = sample[Fields.meta][target_field].replace(from_format % target_field, "")         
+            samples['data'].append(sample[Fields.meta])
 
     logger.info(f'Start to write the converted dataset to '
                 f'[{target_wavcaps_ds_path}]...')
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index 911d403eb..4ea228fca 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -108,7 +108,7 @@ def main(
     wavcaps_json_path: str,
     wavcaps_audio_path: str,
     target_ds_path: str,
-    target_field: Union[str, List[str]] = 'caption',
+    target_field: str = 'caption',
     eoc_special_token: str = SpecialTokens.eoc,
     audio_special_token: str = SpecialTokens.audio,
     add_eoc_at_last: bool = True,
@@ -123,7 +123,7 @@ def main(
     :param target_ds_path: path to store the converted dataset in Data-Juicer
         format.
     :param target_field: the field used to describe audio in the WavCaps-like
-        dataset, which can be one or more of ['caption','title','description'].
+        dataset, which can be one of ['caption','title','description'].
     :param eoc_special_token: the special token for "end of a chunk". It's used
         to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
         Data-Juicer).
@@ -154,13 +154,10 @@ def main(
     if not target_ds_path.endswith('.jsonl'):
         raise ValueError('Only support "jsonl" target dataset file now.')
 
-    if not isinstance(target_field, list):
-        target_field = [target_field]
-    for tag in target_field:
-        if tag not in ['caption', 'description', 'title']:
-            raise ValueError(
-                "target_filed must be in '['caption', 'description', 'title']'"
-            )
+    if target_field not in ['caption', 'description', 'title']:
+        raise ValueError(
+            "target_field must be in '['caption', 'description', 'title']'"
+        )
 
     if os.path.dirname(target_ds_path) \
             and not os.path.exists(os.path.dirname(target_ds_path)):
@@ -174,9 +171,6 @@ def main(
                        'which might cause some compatibility problems for '
                        'other type of datasets (e.g. OpenFlamingo).')
 
-    if isinstance(target_field, str):
-        target_field = [target_field]
-
     # load WavCaps dataset
     logger.info('Loading original WavCaps dataset.')
     wavcaps_ds = json.load(open(wavcaps_json_path, 'r', encoding='utf-8'))
@@ -200,16 +194,14 @@ def main(
                     f'before converting.')
                 continue
             audio = [all_audio_files[audio_name]]
-            text = audio_special_token
-            for tag in target_field:
-                if tag not in sample.keys():
-                    logger.warning(f'{tag} does not exist in this sample.')
-                    continue
-                if add_target_field_token:
-                    text += sent_seperator + from_format % tag + sample[tag]
-                else:
-                    text += sent_seperator + sample[tag]
+            text = audio_special_token + sent_seperator
+            if target_field not in sample.keys():
+                logger.warning(f'{target_field} does not exist in this sample.')
+                continue
 
+            if add_target_field_token:
+                text += from_format % target_field
+            text += sample[target_field]
             if add_eoc_at_last:
                 text += eoc_special_token
 

From e54d1976a97d75f82f50f82613a428d15cf089ad Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 23 Nov 2023 17:48:30 +0800
Subject: [PATCH 07/17] fix pre-commit

---
 .../dj_to_wavcaps.py                          | 31 +++++++++++--------
 .../wavcaps_to_dj.py                          |  7 ++---
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index 3b5fc63c0..a1c561e19 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -75,7 +75,7 @@
 
 @logger.catch
 def main(
-    dj_ds_path: str, 
+    dj_ds_path: str,
     target_wavcaps_ds_path: str,
     target_field: str = 'caption',
     eoc_special_token: str = SpecialTokens.eoc,
@@ -83,7 +83,7 @@ def main(
     remove_eoc_at_last: bool = True,
     remove_target_field_token: bool = False,
     sent_seperator: str = '\n',
-    ):
+):
     """
     Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
 
@@ -97,10 +97,10 @@ def main(
         Data-Juicer).
     :param audio_special_token: the special token for audios. It's used to
         locate the audios in the text.
-    :param remove_eoc_at_last: whether to remove the extra eoc_special_token at the
-        end of text. Default: True.
-    :param remove_target_field_token: whether to remove the extra target_field_token
-        at text.
+    :param remove_eoc_at_last: whether to remove the extra eoc_special_token at
+        the end of text. Default: True.
+    :param remove_target_field_token: whether to remove the extra
+        target_field_token at text.
     :param sent_seperator: seperator to split different sentences. Default: \n.
     """
     # ----- Constant settings. Better not to change them. -----
@@ -122,8 +122,7 @@ def main(
 
     if target_field not in ['caption', 'description', 'title']:
         raise ValueError(
-            "target_field must be in '['caption', 'description', 'title']'"
-        )
+            "target_field must be in '['caption', 'description', 'title']'")
 
     logger.info('Start to convert.')
     samples = {'num_captions_per_audio': 1, 'data': []}
@@ -134,16 +133,22 @@ def main(
                 continue
 
             if target_field not in sample[Fields.meta].keys():
-                logger.warning(f'{target_field} does not exist in this sample.')
+                logger.warning(
+                    f'{target_field} does not exist in this sample.')
                 continue
-            samples['num_captions_per_audio'] = sample[Fields.meta]['num_captions_per_audio']
+            samples['num_captions_per_audio'] = sample[
+                Fields.meta]['num_captions_per_audio']
             del sample[Fields.meta]['num_captions_per_audio']
 
-            sample[Fields.meta][target_field] = sample['text'].replace(audio_special_token + sent_seperator, "")
+            sample[Fields.meta][target_field] = sample['text'].replace(
+                audio_special_token + sent_seperator, '')
             if remove_eoc_at_last:
-                sample[Fields.meta][target_field] = sample[Fields.meta][target_field].replace(eoc_special_token, "")
+                sample[Fields.meta][target_field] = sample[
+                    Fields.meta][target_field].replace(eoc_special_token, '')
             if remove_target_field_token:
-                sample[Fields.meta][target_field] = sample[Fields.meta][target_field].replace(from_format % target_field, "")         
+                sample[Fields.meta][target_field] = sample[
+                    Fields.meta][target_field].replace(
+                        from_format % target_field, '')
             samples['data'].append(sample[Fields.meta])
 
     logger.info(f'Start to write the converted dataset to '
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index 4ea228fca..de8b720a9 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -63,7 +63,6 @@
 
 import json
 import os
-from typing import List, Union
 
 import fire
 import jsonlines as jl
@@ -156,8 +155,7 @@ def main(
 
     if target_field not in ['caption', 'description', 'title']:
         raise ValueError(
-            "target_field must be in '['caption', 'description', 'title']'"
-        )
+            "target_field must be in '['caption', 'description', 'title']'")
 
     if os.path.dirname(target_ds_path) \
             and not os.path.exists(os.path.dirname(target_ds_path)):
@@ -196,7 +194,8 @@ def main(
             audio = [all_audio_files[audio_name]]
             text = audio_special_token + sent_seperator
             if target_field not in sample.keys():
-                logger.warning(f'{target_field} does not exist in this sample.')
+                logger.warning(
+                    f'{target_field} does not exist in this sample.')
                 continue
 
             if add_target_field_token:

From 4743e622010e7290888fee0ee1bad92e62d6cd69 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Fri, 24 Nov 2023 11:17:38 +0800
Subject: [PATCH 08/17] add id for log

---
 .../dj_to_wavcaps.py                          | 14 +++++++---
 .../wavcaps_to_dj.py                          | 26 ++++++++++++-------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
index a1c561e19..b7cf268e1 100644
--- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
+++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -2,7 +2,8 @@
 # target dataset in WavCaps format.
 #
 # Data-Juicer format:
-# {'audios': ['./path/to/audio/2219.flac'],
+# {'id': 2219,
+#  'audios': ['./path/to/audio/2219.flac'],
 #  'text': '<__dj__audio>\n'
 #          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
@@ -18,7 +19,8 @@
 #       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
 #       'category': '',
 #       'tags': '' }}
-# {'audios': ['./path/to/audio/2218.flac'],
+# {'id': 2218,
+#  'audios': ['./path/to/audio/2218.flac'],
 #  'text': '<__dj__audio>\n'
 #          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
@@ -128,13 +130,17 @@ def main(
     samples = {'num_captions_per_audio': 1, 'data': []}
     with jl.open(dj_ds_path, 'r') as reader:
         for sample in tqdm(reader):
+            id = sample['id']
             if Fields.meta not in sample:
-                logger.warning(f'{Fields.meta} does not exist in this sample.')
+                logger.warning(
+                    f'{Fields.meta} does not exist in this sample with '
+                    f'id [{id}].')
                 continue
 
             if target_field not in sample[Fields.meta].keys():
                 logger.warning(
-                    f'{target_field} does not exist in this sample.')
+                    f'{target_field} does not exist in this sample with '
+                    f'id [{id}].')
                 continue
             samples['num_captions_per_audio'] = sample[
                 Fields.meta]['num_captions_per_audio']
diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
index de8b720a9..7cb9470a2 100644
--- a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
+++ b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -28,7 +28,8 @@
 # }
 #
 # Corresponding Data-Juicer format:
-# {'audios': ['./path/to/audio/2219.flac'],
+# {'id': 2219,
+#  'audios': ['./path/to/audio/2219.flac'],
 #  'text': '<__dj__audio>\n'
 #          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
@@ -44,7 +45,8 @@
 #       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
 #       'category': '',
 #       'tags': '' }}
-# {'audios': ['./path/to/audio/2218.flac'],
+# {'id': 2218,
+#  'audios': ['./path/to/audio/2218.flac'],
 #  'text': '<__dj__audio>\n'
 #          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
@@ -107,6 +109,7 @@ def main(
     wavcaps_json_path: str,
     wavcaps_audio_path: str,
     target_ds_path: str,
+    str_id: bool = True,
     target_field: str = 'caption',
     eoc_special_token: str = SpecialTokens.eoc,
     audio_special_token: str = SpecialTokens.audio,
@@ -180,22 +183,26 @@ def main(
     with jl.open(target_ds_path, 'w') as writer:
         for sample in tqdm(wavcaps_ds):
             # id
-            audio_name = sample['id'].strip().split('.')[0] + '.flac'
+            id = sample['id']
+            if str_id:
+                id = str(id)
+
+            audio_name = id.strip().split('.')[0] + '.flac'
             target_meta = creat_meta_filed(num_captions_per_audio, sample)
 
             # audio and text
             if audio_name not in all_audio_files:
-                logger.warning(
-                    f'No audios in the sample with id [{audio_name}], '
-                    f'which means this sample is not a multimodal '
-                    f'sample. You\'d better remove this sample '
-                    f'before converting.')
+                logger.warning(f'No audios in the sample with id [{id}], '
+                               f'which means this sample is not a multimodal '
+                               f'sample. You\'d better remove this sample '
+                               f'before converting.')
                 continue
             audio = [all_audio_files[audio_name]]
             text = audio_special_token + sent_seperator
             if target_field not in sample.keys():
                 logger.warning(
-                    f'{target_field} does not exist in this sample.')
+                    f'{target_field} does not exist in this sample with '
+                    f'id [{id}].')
                 continue
 
             if add_target_field_token:
@@ -206,6 +213,7 @@ def main(
 
             # get the new sample with Data-Juicer format
             new_sample = {
+                'id': id,
                 text_key: text,
                 audio_key: audio,
                 Fields.meta: target_meta

From 8a4a0e736ac2c8e1a2f7ea8359ddefb3b3e67e26 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Thu, 21 Dec 2023 14:27:43 +0800
Subject: [PATCH 09/17] add remove_repeat_sentences_mapper

---
 configs/config_all.yaml                       |  4 ++
 data_juicer/ops/mapper/__init__.py            |  3 +-
 .../mapper/remove_repeat_sentences_mapper.py  | 71 +++++++++++++++++++
 docs/Operators.md                             |  3 +-
 docs/Operators_ZH.md                          |  5 +-
 .../test_remove_repeat_sentences_mapper.py    | 57 +++++++++++++++
 6 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
 create mode 100644 tests/ops/mapper/test_remove_repeat_sentences_mapper.py

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 269df841b..3c38e19be 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -85,6 +85,10 @@ process:
       keep_alphabet: true                                     # whether to keep alpabet
       keep_number: true                                       # whether to keep number
       keep_punc: true                                         # whether to keep punctuation
+  - remove_repeat_sentences_mapper:                         # remove repeated sentences in text samples.
+      lowercase: false                                        # whether to lowercase sentences before comparing them; the output text keeps its original case
+      ignore_special_character: true                          # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, ASCII letters, digits and whitespace
+      min_repeat_sentence_length: 2                           # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
   - remove_specific_chars_mapper:                           # remove characters specified by users
       chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□'                        # a string or a list including those characters that need to be removed
   - remove_table_text_mapper:                               # remove possible table texts from text.
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index 52d60c3e7..626290d07 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -5,6 +5,7 @@
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
-               remove_specific_chars_mapper, remove_table_text_mapper,
+               remove_repeat_sentences_mapper, remove_specific_chars_mapper,
+               remove_table_text_mapper,
                remove_words_with_incorrect_substrings_mapper,
                sentence_split_mapper, whitespace_normalization_mapper)
diff --git a/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py b/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
new file mode 100644
index 000000000..a844dccca
--- /dev/null
+++ b/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
@@ -0,0 +1,71 @@
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+def split_sentence(text):
+    text = re.sub('([.。！!？\?])([^’”])',r'\1\n\2',text)                # noqa
+    text = re.sub('(\.{6})([^’”])',r'\1\n\2',text)                      # noqa
+    text = re.sub('(\…{2})([^’”])',r'\1\n\2',text)                      # noqa
+    text = re.sub('([.。!！？\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text)  # noqa
+    return text.split('\n')
+
+
+@OPERATORS.register_module('remove_repeat_sentences_mapper')
+class RemoveRepeatSentencesMapper(Mapper):
+    """Mapper to remove repeated sentences in text samples."""
+
+    def __init__(self,
+                 lowercase: bool = False,
+                 ignore_special_character: bool = True,
+                 min_repeat_sentence_length: int = 2,
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param lowercase: Whether to compare sentences case-insensitively.
+        :param ignore_special_character: Whether to ignore special
+            characters when judging repeated sentences. Special characters
+            are all characters except Chinese characters, ASCII letters,
+            digits and whitespace.
+        :param min_repeat_sentence_length: Sentences shorter than this
+            length will not be deduplicated. If ignore_special_character is
+            set to True, then special characters are not included in this
+            length.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+
+        super().__init__(*args, **kwargs)
+        self.lowercase = lowercase
+        self.min_repeat_sentence_length = min_repeat_sentence_length
+        self.remove_regex = re.compile(
+            r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
+        ) if ignore_special_character else None
+
+    def process(self, sample):
+
+        lines = [e for e in sample[self.text_key].split('\n')]
+        new_lines = []
+        hash_set = set([])
+        for line in lines:
+            new_sent = ''
+            if line:
+                sentences = split_sentence(line)
+                for sentence in sentences:
+                    copy = sentence.strip()
+                    if self.lowercase:
+                        copy = copy.lower()
+                    if self.remove_regex:
+                        copy = self.remove_regex.sub('', copy)
+
+                    if len(copy) < self.min_repeat_sentence_length:
+                        new_sent += sentence
+                    elif copy not in hash_set:
+                        new_sent += sentence
+                        hash_set.add(copy)
+            new_lines.append(new_sent)
+
+        sample[self.text_key] = '\n'.join(new_lines)
+        return sample
diff --git a/docs/Operators.md b/docs/Operators.md
index fde4dccac..646e91664 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -10,7 +10,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   21   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   22   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   24   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
@@ -63,6 +63,7 @@ All the specific operators are listed below, each featured with several capabili
 | remove_header_mapper                                | LaTeX              | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names                   |
 | remove_long_words_mapper                            | General            | en, zh | Removes words with length outside the specified range                                                          |
 | remove_non_chinese_character_mapper                 | General            | en, zh | Remove non Chinese character in text samples. |
+| remove_repeat_sentences_mapper                      | General            | en, zh | Remove repeat sentences in text samples. |
 | remove_specific_chars_mapper                        | General            | en, zh | Removes any user-specified characters or substrings                                                            |
 | remove_table_text_mapper                            | General, Financial | en     | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) |
 | remove_words_with_incorrect_<br />substrings_mapper | General            | en, zh | Removes words containing specified substrings                                                                  |
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 23d577259..a1dac2d76 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -9,7 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                 | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 21 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 22 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 24 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
@@ -60,7 +60,8 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | remove_comments_mapper                              | LaTeX                 | en, zh    | 删除 TeX 文档中的注释                                          |
 | remove_header_mapper                                | LaTeX                 | en, zh    | 删除 TeX 文档头，例如标题、章节数字/名称等                               |
 | remove_long_words_mapper                            | General               | en, zh    | 删除长度超出指定范围的单词                                          |
-| remove_non_chinese_character_mapper                 | General               | en, zh    | 删除样本中的非中文字符
+| remove_non_chinese_character_mapper                 | General               | en, zh    | 删除样本中的非中文字符                                              |
+| remove_repeat_sentences_mapper                      | General               | en, zh    | 删除样本中的重复句子                                                |
 | remove_specific_chars_mapper                        | General               | en, zh    | 删除任何用户指定的字符或子字符串                                       |
 | remove_table_text_mapper                            | General, Financial    | en        | 检测并删除可能的表格内容（:warning: 依赖正则表达式匹配，因此很脆弱）                |
 | remove_words_with_incorrect_<br />substrings_mapper | General               | en, zh    | 删除包含指定子字符串的单词                                          |
diff --git a/tests/ops/mapper/test_remove_repeat_sentences_mapper.py b/tests/ops/mapper/test_remove_repeat_sentences_mapper.py
new file mode 100644
index 000000000..923ac5824
--- /dev/null
+++ b/tests/ops/mapper/test_remove_repeat_sentences_mapper.py
@@ -0,0 +1,57 @@
+import unittest
+
+from data_juicer.ops.mapper.remove_repeat_sentences_mapper import RemoveRepeatSentencesMapper
+
+
+class RemoveRepeatSentencesMapperTest(unittest.TestCase):
+
+    def _run_helper(self, samples, op):
+        for sample in samples:
+            result = op.process(sample)
+            self.assertEqual(result['text'], result['target'])
+
+    def test_text(self):
+
+        samples = [
+            {
+                'text': '今天天气真不错，阳光明媚，适合出去散步。小明说：“今天天气真不错，我们去海边吧。” 小红回答说：“好主意！” 但是，小李觉得：“今天天气真不错，我们去爬山吧。” 今天天气真不错，阳光明媚，适合出去散步。昨天下了一整天的雨，今天终于放晴了。昨天下了一整天的雨，今天终于放晴了。',
+                'target': '今天天气真不错，阳光明媚，适合出去散步。小明说：“今天天气真不错，我们去海边吧。” 小红回答说：“好主意！” 但是，小李觉得：“今天天气真不错，我们去爬山吧。”昨天下了一整天的雨，今天终于放晴了。',
+            }, {
+                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
+                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
+            }, {
+                'text': '''我很开心 。但是你不开心  。我很开心 。\n你好呀！我很开心 。我好的。你好呀！''',
+                'target': '''我很开心 。但是你不开心  。\n你好呀！我好的。'''
+            }, {
+                'text': '默认配置下，长度低于2的句子不会被去重。去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3',
+                'target': '默认配置下，长度低于2的句子不会被去重。去重？重。重...... 重! 1234？3215. 3. 3. 3'
+            }
+        ]
+
+        op = RemoveRepeatSentencesMapper()
+        self._run_helper(samples, op)
+
+    def test_text2(self):
+
+        samples = [
+            {
+                'text': 'Life is what happens when you\'re busy making other plans. John Lennon once said. Life is what happens when you\'re busy making other plans. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说，这句话引起了共鸣。',
+                'target': 'Life is what happens when you\'re busy making other plans. John Lennon once said. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说，这句话引起了共鸣。',
+            }, {
+                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
+                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
+            }, {
+                'text': '''我很开心 。但是你不开心  。我很开心 。\n你好呀！我很开心 。我好的。你好呀！''',
+                'target': '''我很开心 。但是你不开心  。\n你好呀！我好的。你好呀！'''
+            }, {
+                'text': '去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3',
+                'target': '去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3'
+            }
+        ]
+
+        op = RemoveRepeatSentencesMapper(lowercase=True, ignore_special_character=False, min_repeat_sentence_length=5)
+        self._run_helper(samples, op)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 021d14bf5f48ea08c5bbf4fc698eec913c1b56a0 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Wed, 27 Dec 2023 10:35:57 +0800
Subject: [PATCH 10/17] modify mapper op number

---
 docs/Operators.md    | 2 +-
 docs/Operators_ZH.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/Operators.md b/docs/Operators.md
index 2802f8aa5..119111ec7 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -10,7 +10,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   22   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   23   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   24   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 2702f59f8..e684ca950 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -9,7 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                 | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 22 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 23 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 24 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |

From 1688099bbbba10f0ce05ac65edc092155a72b5a6 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Fri, 5 Jan 2024 10:31:37 +0800
Subject: [PATCH 11/17] update image_blur

---
 data_juicer/ops/mapper/__init__.py          |   2 +-
 data_juicer/ops/mapper/image_blur_mapper.py |  91 ++++++++++++++
 tests/ops/mapper/test_image_blur_mapper.py  | 125 ++++++++++++++++++++
 3 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 data_juicer/ops/mapper/image_blur_mapper.py
 create mode 100644 tests/ops/mapper/test_image_blur_mapper.py

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index b78be1884..06a8d994f 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -1,7 +1,7 @@
 from . import (chinese_convert_mapper, clean_copyright_mapper,
                clean_email_mapper, clean_html_mapper, clean_ip_mapper,
                clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
-               nlpaug_en_mapper, nlpcda_zh_mapper,
+               image_blur_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
new file mode 100644
index 000000000..dac732880
--- /dev/null
+++ b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -0,0 +1,91 @@
+import os
+
+import numpy as np
+
+from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.mm_utils import load_image
+
+from ..base_op import OPERATORS, Mapper
+from ..op_fusion import LOADED_IMAGES
+
+BLUR_KERNEL = {'MEAN'}
+
+@OPERATORS.register_module('image_blur_mapper')
+@LOADED_IMAGES.register_module('image_blur_mapper')
+class ImageBlurMapper(Mapper):
+    """Mapper to blur images.
+    """
+
+    def __init__(self,
+                 p: float = 0.2,
+                 blur_type: str = 'gaussian',
+                 radius: float = 2,
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+        
+        :param p: Probability of the image being blured.
+        :param blur_type: Type of blur kernel, including ['mean', 'box', 'gaussian'].
+        :param radius: Radius of blur kernel.
+        :param cover: Whether the blurred image covers the original image. If set to
+             false, the blurred image will be added with the suffix '_blur' and then
+             saved in the same directory.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+        super().__init__(*args, **kwargs)
+        if blur_type not in ['mean', 'box', 'gaussian']:
+            raise ValueError(f'Blur_type [{blur_type}] is not supported. '
+                             f'Can only be one of ["mean", "box", "gaussian"].')
+        if radius < 0:
+            raise ValueError(f'Radius must be >= 0.')
+        
+        self.p = p   
+    
+        from PIL import ImageFilter
+        if blur_type == 'mean':
+            self.blur = ImageFilter.BLUR
+        elif blur_type == 'box':
+            self.blur = ImageFilter.BoxBlur(radius)
+        else:
+            self.blur = ImageFilter.GaussianBlur(radius)
+
+    def process(self, sample, context=False):
+        # there is no image in this sample
+        if self.image_key not in sample or not sample[self.image_key]:
+            return sample
+
+        # load images
+        loaded_image_keys = sample[self.image_key]
+        images = {}
+        for loaded_image_key in loaded_image_keys:
+            if context and loaded_image_key in sample[Fields.context]:
+                # load from context
+                images[loaded_image_key] = sample[
+                    Fields.context][loaded_image_key]
+            else:
+                if loaded_image_key not in images:
+                    # avoid loading the same image more than once
+                    image = load_image(loaded_image_key)
+                    images[loaded_image_key] = image
+                    if context:
+                        # store the image data into context
+                        sample[Fields.context][loaded_image_key] = image
+
+        for index, value in enumerate(loaded_image_keys):
+            if self.p < np.random.rand():
+                continue
+            else:
+                blured_image_key = os.path.join(os.path.dirname(value), '_blured.'.join(os.path.basename(value).split('.')))
+                if not os.path.exists(blured_image_key):
+                    img_mode = images[value].mode
+                    blured_image = images[value].convert('RGB').filter(self.blur)
+                    blured_image = blured_image.convert(img_mode)
+                    blured_image.save(blured_image_key)
+                    if context:
+                        sample[Fields.context][blured_image_key] = blured_image
+                loaded_image_keys[index] = blured_image_key
+
+        sample[self.image_key] = loaded_image_keys
+        return sample
\ No newline at end of file
diff --git a/tests/ops/mapper/test_image_blur_mapper.py b/tests/ops/mapper/test_image_blur_mapper.py
new file mode 100644
index 000000000..803d7353d
--- /dev/null
+++ b/tests/ops/mapper/test_image_blur_mapper.py
@@ -0,0 +1,125 @@
+import os
+import unittest
+
+from datasets import Dataset
+
+from data_juicer.ops.mapper.image_blur_mapper import ImageBlurMapper
+from data_juicer.utils.constant import Fields
+
+
+class ImageBlurMapperTest(unittest.TestCase):
+
+    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                             '..', 'data')
+    img1_path = os.path.join(data_path, 'img1.png')
+    img2_path = os.path.join(data_path, 'img2.jpg')
+    img3_path = os.path.join(data_path, 'img3.jpg')
+
+    def _run_image_blur_mapper(self,
+                                dataset: Dataset,
+                                target_list,
+                                op):
+        if Fields.stats not in dataset.features:
+            dataset = dataset.add_column(name=Fields.stats,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.compute_stats)
+        dataset = dataset.filter(op.process)
+        dataset = dataset.select_columns(column_names=[op.image_key])
+        res_list = dataset.to_list()
+        self.assertEqual(res_list, target_list)
+
+    def test_filter1(self):
+
+        ds_list = [{
+            'images': [self.img1_path]
+        }, {
+            'images': [self.img2_path]
+        }, {
+            'images': [self.img3_path]
+        }]
+        tgt_list = [{
+            'images': [self.img2_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = ImageShapeFilter(min_width=400,
+                              min_height=400)
+        self._run_image_shape_filter(dataset, tgt_list, op)
+
+    def test_filter2(self):
+
+        ds_list = [{
+            'images': [self.img1_path]
+        }, {
+            'images': [self.img2_path]
+        }, {
+            'images': [self.img3_path]
+        }]
+        tgt_list = [{
+            'images': [self.img1_path]
+        }, {
+            'images': [self.img3_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = ImageShapeFilter(max_width=500,
+                              max_height=500)
+        self._run_image_shape_filter(dataset, tgt_list, op)
+
+    def test_filter3(self):
+
+        ds_list = [{
+            'images': [self.img1_path]
+        }, {
+            'images': [self.img2_path]
+        }, {
+            'images': [self.img3_path]
+        }]
+        tgt_list = [{
+            'images': [self.img1_path]
+        }, {
+            'images': [self.img2_path]
+        }, {
+            'images': [self.img3_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = ImageShapeFilter()
+        self._run_image_shape_filter(dataset, tgt_list, op)
+
+    def test_any(self):
+
+        ds_list = [{
+            'images': [self.img1_path, self.img2_path]
+        }, {
+            'images': [self.img2_path, self.img3_path]
+        }, {
+            'images': [self.img1_path, self.img3_path]
+        }]
+        tgt_list = [{
+            'images': [self.img1_path, self.img2_path]
+        }, {
+            'images': [self.img2_path, self.img3_path]
+        }]
+        dataset = Dataset.from_list(ds_list)
+        op = ImageShapeFilter(min_width=400,
+                              min_height=400,
+                              any_or_all='any')
+        self._run_image_shape_filter(dataset, tgt_list, op)
+
+    def test_all(self):
+
+        ds_list = [{
+            'images': [self.img1_path, self.img2_path]
+        }, {
+            'images': [self.img2_path, self.img3_path]
+        }, {
+            'images': [self.img1_path, self.img3_path]
+        }]
+        tgt_list = []
+        dataset = Dataset.from_list(ds_list)
+        op = ImageShapeFilter(min_width=400,
+                              min_height=400,
+                              any_or_all='all')
+        self._run_image_shape_filter(dataset, tgt_list, op)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 8b3d87d9ff8d2dca885351db089e3201b7687b54 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Wed, 17 Jan 2024 15:34:36 +0800
Subject: [PATCH 12/17] add image_blur_mapper

---
 configs/config_all.yaml                     |   4 +
 data_juicer/ops/mapper/__init__.py          |   4 -
 data_juicer/ops/mapper/image_blur_mapper.py |   5 +-
 docs/Operators.md                           |   5 +-
 docs/Operators_ZH.md                        |   3 +-
 tests/ops/mapper/test_image_blur_mapper.py  | 131 ++++++++++----------
 6 files changed, 75 insertions(+), 77 deletions(-)

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index f5deb0326..047662840 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -54,6 +54,10 @@ process:
       hf_blip2: 'Salesforce/blip2-opt-2.7b'                 # blip2 model name on huggingface to generate caption
       caption_num: 1                                        # how many candidate captions to generate for each image
       keep_candidate_mode: 'random_any'                     # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
+  - image_blur_mapper:                                      # mapper to blur images.
+      p: 0.2                                                # probability of the image being blurred
+      blur_type: 'gaussian'                                 # type of blur kernel, including ['mean', 'box', 'gaussian']
+      radius: 2                                             # radius of blur kernel
   - nlpaug_en_mapper:                                       # simply augment texts in English based on the nlpaug library
       sequential: false                                       # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
       aug_num: 1                                              # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index 19b0dbbda..6d2bd46df 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -2,11 +2,7 @@
 from . import (chinese_convert_mapper, clean_copyright_mapper,
                clean_email_mapper, clean_html_mapper, clean_ip_mapper,
                clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
-<<<<<<< HEAD
-               image_blur_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
-=======
                generate_caption_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
->>>>>>> 2588004f47c54dfccff37a51820a38404a6eaf17
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
index e225b45e1..969e37180 100644
--- a/data_juicer/ops/mapper/image_blur_mapper.py
+++ b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.constant import Fields
 from data_juicer.utils.mm_utils import load_image
 
 from ..base_op import OPERATORS, Mapper
@@ -27,9 +27,6 @@ def __init__(self,
         :param p: Probability of the image being blured.
         :param blur_type: Type of blur kernel, including ['mean', 'box', 'gaussian'].
         :param radius: Radius of blur kernel.
-        :param cover: Whether the blurred image covers the original image. If set to
-             false, the blurred image will be added with the suffix '_blur' and then
-             saved in the same directory.
         :param args: extra args
         :param kwargs: extra args
         """
diff --git a/docs/Operators.md b/docs/Operators.md
index 0c8dc5601..7da8a353d 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -10,7 +10,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   24   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   25   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   25   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
@@ -55,7 +55,8 @@ All the specific operators are listed below, each featured with several capabili
 | clean_links_mapper                                  | General, Code      | en, zh | Removes links, such as those starting with http or ftp                                                         |
 | expand_macro_mapper                                 | LaTeX              | en, zh | Expands macros usually defined at the top of TeX documents                                                     |
 | fix_unicode_mapper                                  | General            | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/))                                                |
-| generate_caption_mapper                             | Multimodal         |  -     | generate samples whose captions are generated based on another model (such as blip2) and the figure within the original sample. | 
+| generate_caption_mapper                             | Multimodal         |  -     | generate samples whose captions are generated based on another model (such as blip2) and the figure within the original sample |
+| image_blur_mapper                                   | Multimodal         |  -     | Blur images |
 | nlpaug_en_mapper                                    | General            | en     | Simply augments texts in English based on the `nlpaug` library                                                 | 
 | nlpcda_zh_mapper                                    | General            | zh     | Simply augments texts in Chinese based on the `nlpcda` library                                                 | 
 | punctuation_normalization_mapper                    | General            | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents                                             |
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 911e655db..c73c38ea6 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -9,7 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 24 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 25 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 25 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
@@ -54,6 +54,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | expand_macro_mapper                                 | LaTeX                 | en, zh    | 扩展通常在 TeX 文档顶部定义的宏                                     |
 | fix_unicode_mapper                                  | General               | en, zh    | 修复损坏的 Unicode（借助 [ftfy](https://ftfy.readthedocs.io/)） |
 | generate_caption_mapper                             | Multimodal            |  -    | 生成样本，其标题是根据另一个辅助模型（例如 blip2）和原始样本中的图形生成的。                                             |
+| image_blur_mapper                                   | Multimodal            |  -        | 对图像进行模糊处理                                              |
 | nlpaug_en_mapper                                    | General               | en        | 使用`nlpaug`库对英语文本进行简单增强                                 | 
 | nlpcda_zh_mapper                                    | General               | zh        | 使用`nlpcda`库对中文文本进行简单增强                                 | 
 | punctuation_normalization_mapper                    | General               | en, zh    | 将各种 Unicode 标点符号标准化为其 ASCII 等效项                        |
diff --git a/tests/ops/mapper/test_image_blur_mapper.py b/tests/ops/mapper/test_image_blur_mapper.py
index 803d7353d..c0885e295 100644
--- a/tests/ops/mapper/test_image_blur_mapper.py
+++ b/tests/ops/mapper/test_image_blur_mapper.py
@@ -1,10 +1,11 @@
 import os
 import unittest
+import numpy as np
 
 from datasets import Dataset
+from data_juicer.utils.mm_utils import load_image
 
 from data_juicer.ops.mapper.image_blur_mapper import ImageBlurMapper
-from data_juicer.utils.constant import Fields
 
 
 class ImageBlurMapperTest(unittest.TestCase):
@@ -15,21 +16,34 @@ class ImageBlurMapperTest(unittest.TestCase):
     img2_path = os.path.join(data_path, 'img2.jpg')
     img3_path = os.path.join(data_path, 'img3.jpg')
 
-    def _run_image_blur_mapper(self,
-                                dataset: Dataset,
-                                target_list,
-                                op):
-        if Fields.stats not in dataset.features:
-            dataset = dataset.add_column(name=Fields.stats,
-                                         column=[{}] * dataset.num_rows)
-        dataset = dataset.map(op.compute_stats)
-        dataset = dataset.filter(op.process)
-        dataset = dataset.select_columns(column_names=[op.image_key])
+    def _get_blured_img_path(self, path):
+        return os.path.join(os.path.dirname(path), '_blured.'.join(os.path.basename(path).split('.')))
+    
+    def _get_blur_kernel(self, blur_type = 'gaussian', radius = 2):
+        from PIL import ImageFilter
+        if blur_type == 'mean':
+            return ImageFilter.BLUR
+        elif blur_type == 'box':
+            return ImageFilter.BoxBlur(radius)
+        else:
+            return ImageFilter.GaussianBlur(radius)
+
+    def _run_image_blur_mapper(self, op, source_list, target_list, blur_kernel):
+        dataset = Dataset.from_list(source_list)
+        dataset = dataset.map(op.process)
         res_list = dataset.to_list()
         self.assertEqual(res_list, target_list)
-
-    def test_filter1(self):
-
+        for source, res in zip(source_list, res_list):
+            for s_path, r_path in zip(source[op.image_key], res[op.image_key]):
+                s_img = load_image(s_path).convert('RGB').filter(blur_kernel)
+                t_path = 'temp4test' + os.path.splitext(s_path)[-1]
+                s_img.save(t_path)
+                t_img = np.array(load_image(t_path))
+                r_img = np.array(load_image(r_path))
+                os.remove(t_path)
+                np.testing.assert_array_equal(t_img, r_img)
+
+    def test(self):
         ds_list = [{
             'images': [self.img1_path]
         }, {
@@ -38,87 +52,72 @@ def test_filter1(self):
             'images': [self.img3_path]
         }]
         tgt_list = [{
-            'images': [self.img2_path]
+            'images': [self._get_blured_img_path(self.img1_path)]
+        }, {
+            'images': [self._get_blured_img_path(self.img2_path)]
+        }, {
+            'images': [self._get_blured_img_path(self.img3_path)]
         }]
-        dataset = Dataset.from_list(ds_list)
-        op = ImageShapeFilter(min_width=400,
-                              min_height=400)
-        self._run_image_shape_filter(dataset, tgt_list, op)
-
-    def test_filter2(self):
+        op = ImageBlurMapper(p = 1, blur_type = 'gaussian', radius = 2)
+        blur_kernel = self._get_blur_kernel('gaussian', 2)
+        self._run_image_blur_mapper(op, ds_list, tgt_list, blur_kernel)
 
+    def test_blur_type(self):
         ds_list = [{
-            'images': [self.img1_path]
-        }, {
             'images': [self.img2_path]
         }, {
             'images': [self.img3_path]
+        }, {
+            'images': [self.img1_path]
         }]
         tgt_list = [{
-            'images': [self.img1_path]
+            'images': [self._get_blured_img_path(self.img2_path)]
         }, {
-            'images': [self.img3_path]
+            'images': [self._get_blured_img_path(self.img3_path)]
+        }, {
+            'images': [self._get_blured_img_path(self.img1_path)]
         }]
-        dataset = Dataset.from_list(ds_list)
-        op = ImageShapeFilter(max_width=500,
-                              max_height=500)
-        self._run_image_shape_filter(dataset, tgt_list, op)
-
-    def test_filter3(self):
+        op = ImageBlurMapper(p = 1, blur_type = 'box', radius = 2)
+        blur_kernel = self._get_blur_kernel('box', 2)
+        self._run_image_blur_mapper(op, ds_list, tgt_list, blur_kernel)
 
+    def test_radius(self):
         ds_list = [{
-            'images': [self.img1_path]
+            'images': [self.img3_path]
         }, {
             'images': [self.img2_path]
         }, {
-            'images': [self.img3_path]
+            'images': [self.img1_path]
         }]
         tgt_list = [{
-            'images': [self.img1_path]
+            'images': [self._get_blured_img_path(self.img3_path)]
         }, {
-            'images': [self.img2_path]
+            'images': [self._get_blured_img_path(self.img2_path)]
         }, {
-            'images': [self.img3_path]
+            'images': [self._get_blured_img_path(self.img1_path)]
         }]
-        dataset = Dataset.from_list(ds_list)
-        op = ImageShapeFilter()
-        self._run_image_shape_filter(dataset, tgt_list, op)
-
-    def test_any(self):
+        op = ImageBlurMapper(p = 1, blur_type = 'gaussian', radius = 5)
+        blur_kernel = self._get_blur_kernel('gaussian', 5)
+        self._run_image_blur_mapper(op, ds_list, tgt_list, blur_kernel)
 
+    def test_multi_img(self):
         ds_list = [{
-            'images': [self.img1_path, self.img2_path]
+            'images': [self.img1_path, self.img2_path, self.img3_path]
         }, {
-            'images': [self.img2_path, self.img3_path]
+            'images': [self.img2_path]
         }, {
-            'images': [self.img1_path, self.img3_path]
+            'images': [self.img3_path, self.img1_path]
         }]
         tgt_list = [{
-            'images': [self.img1_path, self.img2_path]
-        }, {
-            'images': [self.img2_path, self.img3_path]
-        }]
-        dataset = Dataset.from_list(ds_list)
-        op = ImageShapeFilter(min_width=400,
-                              min_height=400,
-                              any_or_all='any')
-        self._run_image_shape_filter(dataset, tgt_list, op)
-
-    def test_all(self):
-
-        ds_list = [{
-            'images': [self.img1_path, self.img2_path]
+            'images': [self._get_blured_img_path(self.img1_path), self._get_blured_img_path(self.img2_path), self._get_blured_img_path(self.img3_path)]
         }, {
-            'images': [self.img2_path, self.img3_path]
+            'images': [self._get_blured_img_path(self.img2_path)]
         }, {
-            'images': [self.img1_path, self.img3_path]
+            'images': [self._get_blured_img_path(self.img3_path), self._get_blured_img_path(self.img1_path)]
         }]
-        tgt_list = []
-        dataset = Dataset.from_list(ds_list)
-        op = ImageShapeFilter(min_width=400,
-                              min_height=400,
-                              any_or_all='all')
-        self._run_image_shape_filter(dataset, tgt_list, op)
+        op = ImageBlurMapper(p = 1, blur_type = 'gaussian', radius = 2)
+        blur_kernel = self._get_blur_kernel('gaussian', 2)
+        self._run_image_blur_mapper(op, ds_list, tgt_list, blur_kernel)
 
 
 if __name__ == '__main__':

From d80d8683f65c9ff46ff98f7949cba3298c3afd69 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Wed, 17 Jan 2024 15:38:45 +0800
Subject: [PATCH 13/17] precommit

---
 data_juicer/ops/mapper/image_blur_mapper.py | 30 ++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
index 969e37180..5b03d8eb7 100644
--- a/data_juicer/ops/mapper/image_blur_mapper.py
+++ b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -23,22 +23,24 @@ def __init__(self,
                  **kwargs):
         """
         Initialization method.
-        
+
         :param p: Probability of the image being blured.
-        :param blur_type: Type of blur kernel, including ['mean', 'box', 'gaussian'].
+        :param blur_type: Type of blur kernel, including
+        ['mean', 'box', 'gaussian'].
         :param radius: Radius of blur kernel.
         :param args: extra args
         :param kwargs: extra args
         """
         super().__init__(*args, **kwargs)
         if blur_type not in ['mean', 'box', 'gaussian']:
-            raise ValueError(f'Blur_type [{blur_type}] is not supported. '
-                             f'Can only be one of ["mean", "box", "gaussian"].')
+            raise ValueError(
+                f'Blur_type [{blur_type}] is not supported. '
+                f'Can only be one of ["mean", "box", "gaussian"]. ')
         if radius < 0:
-            raise ValueError(f'Radius must be >= 0.')
-        
-        self.p = p   
-    
+            raise ValueError('Radius must be >= 0. ')
+
+        self.p = p
+
         from PIL import ImageFilter
         if blur_type == 'mean':
             self.blur = ImageFilter.BLUR
@@ -73,9 +75,13 @@ def process(self, sample, context=False):
             if self.p < np.random.rand():
                 continue
             else:
-                blured_image_key = os.path.join(os.path.dirname(value), '_blured.'.join(os.path.basename(value).split('.')))
-                if not os.path.exists(blured_image_key) or blured_image_key not in images:
-                    blured_image = images[value].convert('RGB').filter(self.blur)
+                blured_image_key = os.path.join(
+                    os.path.dirname(value),
+                    '_blured.'.join(os.path.basename(value).split('.')))
+                if not os.path.exists(
+                        blured_image_key) or blured_image_key not in images:
+                    blured_image = images[value].convert('RGB').filter(
+                        self.blur)
                     images[blured_image_key] = blured_image
                     blured_image.save(blured_image_key)
                     if context:
@@ -83,4 +89,4 @@ def process(self, sample, context=False):
                 loaded_image_keys[index] = blured_image_key
 
         sample[self.image_key] = loaded_image_keys
-        return sample
\ No newline at end of file
+        return sample

From 3e264b603e8987405ffa1a0be6399796e185301e Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Wed, 17 Jan 2024 16:54:49 +0800
Subject: [PATCH 14/17] update __init__

---
 data_juicer/ops/mapper/__init__.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index 6d2bd46df..ae554d52f 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -2,10 +2,11 @@
 from . import (chinese_convert_mapper, clean_copyright_mapper,
                clean_email_mapper, clean_html_mapper, clean_ip_mapper,
                clean_links_mapper, expand_macro_mapper, fix_unicode_mapper,
-               generate_caption_mapper, nlpaug_en_mapper, nlpcda_zh_mapper,
-               punctuation_normalization_mapper, remove_bibliography_mapper,
-               remove_comments_mapper, remove_header_mapper,
-               remove_long_words_mapper, remove_non_chinese_character_mapper,
+               generate_caption_mapper, image_blur_mapper, nlpaug_en_mapper,
+               nlpcda_zh_mapper, punctuation_normalization_mapper,
+               remove_bibliography_mapper, remove_comments_mapper,
+               remove_header_mapper, remove_long_words_mapper,
+               remove_non_chinese_character_mapper,
                remove_repeat_sentences_mapper, remove_specific_chars_mapper,
                remove_table_text_mapper,
                remove_words_with_incorrect_substrings_mapper,

From 344e240ea96310b2b149823d898ada67f19c533c Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Fri, 19 Jan 2024 10:46:23 +0800
Subject: [PATCH 15/17] replaced by the latest load_data_with_context

---
 configs/config_all.yaml                     |  6 +++---
 data_juicer/ops/mapper/image_blur_mapper.py | 18 +++---------------
 docs/Operators.md                           |  2 +-
 docs/Operators_ZH.md                        |  2 +-
 4 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 739762311..f429c1607 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -56,9 +56,9 @@ process:
       keep_candidate_mode: 'random_any'                       # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
   - image_blur_mapper:                                      # mapper to blur images.
-      p: 0.2                                                # probability of the image being blured
-      blur_type: 'gaussian'                                 # type of blur kernel, including ['mean', 'box', 'gaussian']
-      radius: 2                                             # radius of blur kernel
+      p: 0.2                                                  # probability of the image being blurred
+      blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
+      radius: 2                                               # radius of blur kernel
   - nlpaug_en_mapper:                                       # simply augment texts in English based on the nlpaug library
       sequential: false                                       # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
       aug_num: 1                                              # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
diff --git a/data_juicer/ops/mapper/image_blur_mapper.py b/data_juicer/ops/mapper/image_blur_mapper.py
index 5b03d8eb7..3edb22c20 100644
--- a/data_juicer/ops/mapper/image_blur_mapper.py
+++ b/data_juicer/ops/mapper/image_blur_mapper.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.mm_utils import load_image
+from data_juicer.utils.mm_utils import load_data_with_context, load_image
 
 from ..base_op import OPERATORS, Mapper
 from ..op_fusion import LOADED_IMAGES
@@ -56,20 +56,8 @@ def process(self, sample, context=False):
 
         # load images
         loaded_image_keys = sample[self.image_key]
-        images = {}
-        for loaded_image_key in loaded_image_keys:
-            if context and loaded_image_key in sample[Fields.context]:
-                # load from context
-                images[loaded_image_key] = sample[
-                    Fields.context][loaded_image_key]
-            else:
-                if loaded_image_key not in images:
-                    # avoid load the same images
-                    image = load_image(loaded_image_key)
-                    images[loaded_image_key] = image
-                    if context:
-                        # store the image data into context
-                        sample[Fields.context][loaded_image_key] = image
+        sample, images = load_data_with_context(sample, context,
+                                                loaded_image_keys, load_image)
 
         for index, value in enumerate(loaded_image_keys):
             if self.p < np.random.rand():
diff --git a/docs/Operators.md b/docs/Operators.md
index c932880ee..504719304 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
 | [ Mapper ]( #mapper )             |   25   | Edits and transforms samples                    |
-| [ Filter ]( #filter )             |   26   | Filters out low-quality samples                 |
+| [ Filter ]( #filter )             |   27   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
 
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 4092be081..97d9ccdae 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -10,7 +10,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
 | [ Mapper ]( #mapper )              | 25 | 对数据样本进行编辑和转换  |
-| [ Filter ]( #filter )              | 26 | 过滤低质量样本       |
+| [ Filter ]( #filter )              | 27 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
 

From 5a6b552340f6f26c9928b341b358a42eb6fb1309 Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Fri, 19 Jan 2024 11:00:43 +0800
Subject: [PATCH 16/17] fix Operators_ZH

---
 docs/Operators_ZH.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index e0142130c..766a64a31 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -9,11 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
-<<<<<<< HEAD
 | [ Mapper ]( #mapper )              | 25 | 对数据样本进行编辑和转换  |
-=======
-| [ Mapper ]( #mapper )              | 24 | 对数据样本进行编辑和转换  |
->>>>>>> 33d82feb5fff0112c28283acb07f55ae45b1ff20
 | [ Filter ]( #filter )              | 27 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |

From 402f4bf54468aff47b02c68d59c72fb91b68a9db Mon Sep 17 00:00:00 2001
From: "hesen.chs" <hesen.chs@alibaba-inc.com>
Date: Tue, 30 Jan 2024 10:38:25 +0800
Subject: [PATCH 17/17] fix redpajama link

---
 LICENSE                                              |  2 +-
 README.md                                            |  2 +-
 README_ZH.md                                         |  2 +-
 configs/reproduced_redpajama/README.md               | 10 +++++-----
 configs/reproduced_redpajama/README_ZH.md            | 10 +++++-----
 data_juicer/ops/mapper/clean_copyright_mapper.py     |  2 +-
 data_juicer/ops/mapper/clean_html_mapper.py          |  2 +-
 data_juicer/ops/mapper/expand_macro_mapper.py        |  2 +-
 data_juicer/ops/mapper/remove_bibliography_mapper.py |  2 +-
 data_juicer/ops/mapper/remove_comments_mapper.py     |  2 +-
 data_juicer/ops/mapper/remove_header_mapper.py       |  2 +-
 tools/preprocess/README.md                           |  4 ++--
 tools/preprocess/README_ZH.md                        |  4 ++--
 tools/preprocess/raw_arxiv_to_jsonl.py               |  4 ++--
 tools/preprocess/raw_stackexchange_to_jsonl.py       |  4 ++--
 15 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/LICENSE b/LICENSE
index bc0945c47..5033caeee 100644
--- a/LICENSE
+++ b/LICENSE
@@ -251,7 +251,7 @@ Code in data_juicer/ops/mapper/clean_copyright_mapper.py, data_juicer/ops/mapper
 data_juicer/ops/mapper/expand_macro_mapper.py, data_juicer/ops/mapper/remove_bibliography_mapper.py,
 data_juicer/ops/mapper/remove_comments_mapper.py, data_juicer/ops/mapper/remove_header_mapper.py,
 is adapted from
-https://github.com/togethercomputer/RedPajama-Data
+https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 
    Copyright 2023 RedPajama authors.
 
diff --git a/README.md b/README.md
index ae131073a..43105b5b6 100644
--- a/README.md
+++ b/README.md
@@ -350,7 +350,7 @@ Cloud's platform for AI (PAI).
 We look forward to more of your experience, suggestions and discussions for collaboration!
 
 Data-Juicer thanks and refers to several community projects, such as 
-[Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data), [Pile](https://huggingface.co/datasets/EleutherAI/pile), [Alpaca-Cot](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT), [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), [DeepSpeed](https://www.deepspeed.ai/), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), [Beam](https://github.com/apache/beam),  [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness), [HELM](https://github.com/stanford-crfm/helm), ....
+[Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1), [Pile](https://huggingface.co/datasets/EleutherAI/pile), [Alpaca-Cot](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT), [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), [DeepSpeed](https://www.deepspeed.ai/), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), [Beam](https://github.com/apache/beam),  [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness), [HELM](https://github.com/stanford-crfm/helm), ....
 
 
 
diff --git a/README_ZH.md b/README_ZH.md
index 1b5b9f50e..e22e506b2 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -328,7 +328,7 @@ Data-Juicer 被各种 LLM产品和研究工作使用，包括来自阿里云-通
 
 
 Data-Juicer 感谢并参考了社区开源项目：
-[Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data), [Pile](https://huggingface.co/datasets/EleutherAI/pile), [Alpaca-Cot](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT), [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), [DeepSpeed](https://www.deepspeed.ai/), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), [Beam](https://github.com/apache/beam),  [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness), [HELM](https://github.com/stanford-crfm/helm), ....
+[Huggingface-Datasets](https://github.com/huggingface/datasets), [Bloom](https://huggingface.co/bigscience/bloom), [RedPajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1), [Pile](https://huggingface.co/datasets/EleutherAI/pile), [Alpaca-Cot](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT), [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), [DeepSpeed](https://www.deepspeed.ai/), [Arrow](https://github.com/apache/arrow), [Ray](https://github.com/ray-project/ray), [Beam](https://github.com/apache/beam),  [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness), [HELM](https://github.com/stanford-crfm/helm), ....
 
 
 
diff --git a/configs/reproduced_redpajama/README.md b/configs/reproduced_redpajama/README.md
index e17703425..b6a0b12b1 100644
--- a/configs/reproduced_redpajama/README.md
+++ b/configs/reproduced_redpajama/README.md
@@ -1,9 +1,9 @@
 # Redpajama Config Files
 
-This folder contains example configuration files to easily and quickly reproduce the processing flow of [Redpajama](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep).
+This folder contains example configuration files to easily and quickly reproduce the processing flow of [Redpajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep).
 
 ## arXiv
-The raw data files can be downloaded from the same AWS link as in [Redpajama/arXiv](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv).
+The raw data files can be downloaded from the same AWS link as in [Redpajama/arXiv](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/arxiv).
 
 Once downloaded, use [raw_arxiv_to_jsonl.py](../../tools/preprocess/raw_arxiv_to_jsonl.py) to convert from the original format to `jsonl` that Data-Juicer can handle easily:
 
@@ -30,7 +30,7 @@ python tools/process_data.py --config configs/reproduced_redpajama/redpajama-arx
 
 ## Books
 
-The raw data files can be downloaded from the same HuggingFace datasets as in [Redpajama/Books](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/book).
+The raw data files can be downloaded from the same HuggingFace datasets as in [Redpajama/Books](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/book).
 
 Once downloaded, modify the path configurations in [redpajama-books.yaml](redpajama-books.yaml) and execute the following command to reproduce the processing flow of RedPajama.
 
@@ -47,7 +47,7 @@ python tools/process_data.py --config configs/reproduced_redpajama/redpajama-boo
 
 ## Code
 
-The raw data files can be downloaded from Google BigQuery as in [Redpajama/Code](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/github).
+The raw data files can be downloaded from Google BigQuery as in [Redpajama/Code](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/github).
 
 Once downloaded, unzip and delete files whose extensions are not in the following whitelist:
 
@@ -70,7 +70,7 @@ python tools/process_data.py --config configs/redpajama/redpajama-code.yaml
 
 ## StackExchange
 
-The raw data files can be downloaded from the same Archive link as in [Redpajama/Stack_exchange](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange).
+The raw data files can be downloaded from the same Archive link as in [Redpajama/Stack_exchange](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange).
 
 Once downloaded, use [raw_stackexchange_to_jsonl.py](../../tools/preprocess/raw_stackexchange_to_jsonl.py) to convert from the original format to `jsonl` that Data-Juicer can handle easily:
 
diff --git a/configs/reproduced_redpajama/README_ZH.md b/configs/reproduced_redpajama/README_ZH.md
index 9a527c093..41c487f61 100644
--- a/configs/reproduced_redpajama/README_ZH.md
+++ b/configs/reproduced_redpajama/README_ZH.md
@@ -1,10 +1,10 @@
 # Redpajama 配置文件
 
-此文件夹包含的配置文件用于轻松复现 [Redpajama](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep) 的处理流程。
+此文件夹包含的配置文件用于轻松复现 [Redpajama](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep) 的处理流程。
 
 ## arXiv
 
-原始数据文件从 [Redpajama/arXiv](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv) 中相同的 AWS 链接下载。
+原始数据文件从 [Redpajama/arXiv](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/arxiv) 中相同的 AWS 链接下载。
 
 下载完成后，使用 [raw_arxiv_to_jsonl.py](../../tools/preprocess/raw_arxiv_to_jsonl.py) 将原始格式转换为 Data-Juicer 易于处理的格式：
 
@@ -31,7 +31,7 @@ python tools/process_data.py --config configs/reproduced_redpajama/redpajama-arx
 
 ## Books
 
-原始数据文件从 [Redpajama/Books](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/book) 中相同的 HuggingFace 链接下载。
+原始数据文件从 [Redpajama/Books](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/book) 中相同的 HuggingFace 链接下载。
 
 下载完成后，修改 [redpajama-books.yaml](redpajama-books.yaml) 中的数据路径，执行以下命令复现 RedPajama 的处理流程：
 
@@ -48,7 +48,7 @@ python tools/process_data.py --config configs/reproduced_redpajama/redpajama-boo
 
 ## Code
 
-原始数据文件从 [Redpajama/Code](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/github) 中相同的 Google BigQuery 获取。
+原始数据文件从 [Redpajama/Code](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/github) 中相同的 Google BigQuery 获取。
 
 下载完成后，解压缩并删除扩展名不在以下白名单中的其他文件：
 
@@ -71,7 +71,7 @@ python tools/process_data.py --config configs/redpajama/redpajama-code.yaml
 
 ## StackExchange
 
-原始数据文件从 [Redpajama/Stack_exchange](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange) 中相同的 Archive 链接获取。
+原始数据文件从 [Redpajama/Stack_exchange](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange) 中相同的 Archive 链接获取。
 
 下载完成后，使用 [raw_stackexchange_to_jsonl.py](../../tools/preprocess/raw_stackexchange_to_jsonl.py) 将原始格式转换为 Data-Juicer 易于处理的格式：
 
diff --git a/data_juicer/ops/mapper/clean_copyright_mapper.py b/data_juicer/ops/mapper/clean_copyright_mapper.py
index c5b046d0e..dabb0cd40 100644
--- a/data_juicer/ops/mapper/clean_copyright_mapper.py
+++ b/data_juicer/ops/mapper/clean_copyright_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 # --------------------------------------------------------
 
 import regex as re
diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py
index dc45754fa..5c2c30c57 100644
--- a/data_juicer/ops/mapper/clean_html_mapper.py
+++ b/data_juicer/ops/mapper/clean_html_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 # --------------------------------------------------------
 
 from data_juicer.utils.availability_utils import AvailabilityChecking
diff --git a/data_juicer/ops/mapper/expand_macro_mapper.py b/data_juicer/ops/mapper/expand_macro_mapper.py
index 1792796ca..2f5d7fe83 100644
--- a/data_juicer/ops/mapper/expand_macro_mapper.py
+++ b/data_juicer/ops/mapper/expand_macro_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/blob/main/data_prep/arxiv/arxiv_cleaner.py
+# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py
 # --------------------------------------------------------
 
 import regex as re
diff --git a/data_juicer/ops/mapper/remove_bibliography_mapper.py b/data_juicer/ops/mapper/remove_bibliography_mapper.py
index 7a5c815ca..2ce852d66 100644
--- a/data_juicer/ops/mapper/remove_bibliography_mapper.py
+++ b/data_juicer/ops/mapper/remove_bibliography_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 # --------------------------------------------------------
 
 import regex as re
diff --git a/data_juicer/ops/mapper/remove_comments_mapper.py b/data_juicer/ops/mapper/remove_comments_mapper.py
index b3533dd2b..c5f083c14 100644
--- a/data_juicer/ops/mapper/remove_comments_mapper.py
+++ b/data_juicer/ops/mapper/remove_comments_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 # --------------------------------------------------------
 
 from typing import List, Union
diff --git a/data_juicer/ops/mapper/remove_header_mapper.py b/data_juicer/ops/mapper/remove_header_mapper.py
index 4c36bde64..45af546e5 100644
--- a/data_juicer/ops/mapper/remove_header_mapper.py
+++ b/data_juicer/ops/mapper/remove_header_mapper.py
@@ -1,5 +1,5 @@
 # Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
 # --------------------------------------------------------
 
 import regex as re
diff --git a/tools/preprocess/README.md b/tools/preprocess/README.md
index 6a33910ed..b0bf5c3ae 100644
--- a/tools/preprocess/README.md
+++ b/tools/preprocess/README.md
@@ -49,7 +49,7 @@ python tools/preprocess/raw_arxiv_to_jsonl.py  --help
 
 **Note:**
 
-* For downloading process, please refer to [here](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv).
+* For downloading process, please refer to [here](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/arxiv).
 
 * Before you downloading, converting or processing, you might make sure that your drive space is large enough to store the raw data (over 3TB), converted data (over 3TB), at least processed data (about 500-600GB), and even more cache data during processing.
 
@@ -71,7 +71,7 @@ python tools/preprocess/raw_arxiv_stackexchange_to_jsonl.py           \
 # get help
 python tools/preprocess/raw_stackexchange_to_jsonl.py  --help
 ```
-- `src_dir`: if you download raw Stack Exchange data as Redpajama did, you will get a directory src which includes hundreds of 7z files whose filenames are like `*.*.com.7z `. You need to unzip these files and rename the POSTs.xml to the corresponding compressed package name and place it in that dir. For more details, please refer to [here](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange).
+- `src_dir`: if you download the raw Stack Exchange data as Redpajama did, you will get a directory `src` that contains hundreds of 7z files whose filenames look like `*.*.com.7z`. You need to unzip these files, rename each extracted POSTs.xml to the name of its corresponding compressed package, and place it in that directory. For more details, please refer to [here](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange).
 - `target_dir`: result directory to store the converted jsonl files.
 - `topk` (optional): select the topk sites with the most content. Default it's 28.
 - `num_proc` (optional): number of process workers. Default it's 1.
diff --git a/tools/preprocess/README_ZH.md b/tools/preprocess/README_ZH.md
index f715a50df..8f2799ed2 100644
--- a/tools/preprocess/README_ZH.md
+++ b/tools/preprocess/README_ZH.md
@@ -48,7 +48,7 @@ python tools/preprocess/raw_arxiv_to_jsonl.py  --help
 
 **注意事项：**
 
-* 下载过程请参考[这里](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv)。
+* 下载过程请参考[这里](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/arxiv)。
 
 * 在下载、转换或处理之前，您需要确保您的硬盘空间足够大，可以存储原始数据（超过 3TB）、转换后的数据（超过 3TB）、最小处理后的数据（大约 500-600GB），以及处理期间的缓存数据。
 
@@ -69,7 +69,7 @@ python tools/preprocess/raw_arxiv_stackexchange_to_jsonl.py           \
 python tools/preprocess/raw_stackexchange_to_jsonl.py  --help
 ```
 
-- `src_dir`: 如果像 Redpajama 一样下载原始 Stack Exchange 数据，你将得到一个目录 src，其中包含数百个 7z 文件，其文件名类似于 `*.*.com.7z`。 您需要解压这些文件并将 POSTs.xml 重命名为相应的压缩包名称并将其放在该目录中。更多详情请参考[这里](https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange)。
+- `src_dir`: 如果像 Redpajama 一样下载原始 Stack Exchange 数据，你将得到一个目录 src，其中包含数百个 7z 文件，其文件名类似于 `*.*.com.7z`。 您需要解压这些文件并将 POSTs.xml 重命名为相应的压缩包名称并将其放在该目录中。更多详情请参考[这里](https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange)。
 - `target_dir`: 用于存储转换后的 jsonl 文件的结果目录。
 - `topk` (可选): 选择内容最多的 k 个站点，默认为 28.
 - `num_proc` (可选): worker 进程数量，默认为 1。
diff --git a/tools/preprocess/raw_arxiv_to_jsonl.py b/tools/preprocess/raw_arxiv_to_jsonl.py
index 1b1637cae..d92efd235 100644
--- a/tools/preprocess/raw_arxiv_to_jsonl.py
+++ b/tools/preprocess/raw_arxiv_to_jsonl.py
@@ -1,12 +1,12 @@
 # Part of the code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/blob/main/data_prep/arxiv/arxiv_cleaner.py
+# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py
 # --------------------------------------------------------
 #
 # This tool is used for converting the raw arxiv data downloaded from S3
 # (ref: https://info.arxiv.org/help/bulk_data_s3.html) to several jsonl files.
 #
 # For downloading process, please refer to:
-# https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/arxiv
 #
 # Notice: before you downloading, converting or processing, you might make sure
 # that your drive space is large enough to store the raw data (over 3TB),
diff --git a/tools/preprocess/raw_stackexchange_to_jsonl.py b/tools/preprocess/raw_stackexchange_to_jsonl.py
index a9f267211..ad1a0bfe4 100644
--- a/tools/preprocess/raw_stackexchange_to_jsonl.py
+++ b/tools/preprocess/raw_stackexchange_to_jsonl.py
@@ -1,5 +1,5 @@
 # Part of the code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange
 # --------------------------------------------------------
 #
 # This tool is used for converting the raw Stack Exchange data downloaded from
@@ -7,7 +7,7 @@
 # jsonl files.
 #
 # For downloading process, please refer to:
-# https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/stack_exchange
+# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/data_prep/stack_exchange
 #
 # Notice: before you downloading, converting or processing, you might make sure
 # that your drive space is large enough to store the raw data (over 100GB),