From 573a7044c65fdf75b5a89ed76bf0168059a7dc86 Mon Sep 17 00:00:00 2001 From: "hesen.chs" Date: Thu, 16 Nov 2023 18:45:54 +0800 Subject: [PATCH 01/17] fix opencc serialization error --- data_juicer/ops/mapper/chinese_convert_mapper.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 7d87a9165..8fc0a41c3 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,8 +1,12 @@ -import opencc - from ..base_op import OPERATORS, Mapper +def prepare_converter(mode): + global OPENCC_CONVERTER + import opencc + OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') + + @OPERATORS.register_module('chinese_convert_mapper') class ChineseConvertMapper(Mapper): """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese @@ -39,9 +43,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): ] assert mode in mode_list, 'Please make sure mode is one of {}'.format( mode_list) - self.converter = opencc.OpenCC(mode + '.json') + prepare_converter(mode) def process(self, sample): - sample[self.text_key] = self.converter.convert(sample[self.text_key]) + sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key]) return sample From 4fee9a121e0e4825f749033e90886b357ab067d7 Mon Sep 17 00:00:00 2001 From: "hesen.chs" Date: Tue, 21 Nov 2023 19:19:53 +0800 Subject: [PATCH 02/17] support audio-text data reading --- configs/config_all.yaml | 2 + data_juicer/utils/mm_utils.py | 15 +- tools/multimodal/README.md | 1 + tools/multimodal/README_ZH.md | 1 + .../dj_to_llava.py | 4 +- .../dj_to_wavcaps.py | 122 +++++++++ .../wavcaps_to_dj.py | 231 ++++++++++++++++++ 7 files changed, 372 insertions(+), 4 deletions(-) create mode 100644 tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py create mode 100644 tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py diff --git a/configs/config_all.yaml b/configs/config_all.yaml index 58970a08b..95d9623fc 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -26,6 +26,8 @@ cache_compress: null # The compression me # for multimodal data processing image_key: 'images' # Key name of field to store the list of sample image paths. image_special_token: '<__dj__image>' # The special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset. +audio_key: 'audios' # Key name of field to store the list of sample audio paths. +audio_special_token: '<__dj__audio>' # The special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset. eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset. diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index ea6b2063f..8a3f8c67e 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -1,4 +1,4 @@ -from datasets import Image +from datasets import Image, Audio from data_juicer.utils.constant import DEFAULT_PREFIX @@ -8,6 +8,7 @@ class SpecialTokens(object): # modality image = f'<{DEFAULT_PREFIX}image>' + audio = f'<{DEFAULT_PREFIX}audio>' # others eoc = f'<|{DEFAULT_PREFIX}eoc|>' @@ -17,13 +18,23 @@ def load_images(paths): return [load_image(path) for path in paths] +def load_audios(paths): + return [load_audio(path) for path in paths] + + def load_image(path): img_feature = Image() img = img_feature.decode_example(img_feature.encode_example(path)) return img -def get_image_size(path): +def load_audio(path, sampling_rate=None): + aud_feature = Audio(sampling_rate) + aud = aud_feature.decode_example(aud_feature.encode_example(path)) + return (aud['array'], aud['sampling_rate']) + + +def get_image_size(path, ): import os return os.path.getsize(path) diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md index b9175c27c..d6d62c62b 100644 --- a/tools/multimodal/README.md +++ b/tools/multimodal/README.md @@ -18,6 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol | Format | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref. | |------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------| | LLaVA-like | `llava_to_dj.py` | `dj_to_llava.py` | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) | +| WavCaps-like | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | - | For all tools, you can run the following command to find out the usage of them: diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md index 9eb7757ce..af05a610d 100644 --- a/tools/multimodal/README_ZH.md +++ b/tools/multimodal/README_ZH.md @@ -15,6 +15,7 @@ | 格式 | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考 | |-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------| | 类LLaVA格式 | `llava_to_dj.py` | `dj_to_llava.py` | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) | +| 类WavCaps格式 | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | - | 对于所有工具,您可以运行以下命令来了解它们的详细用法: diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py index b0c1495df..477d0c2e3 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py @@ -1,5 +1,5 @@ -# This tool is used to convert multimodal dataset in LLaVA format to a target -# dataset in Data-Juicer format. +# This tool is used to convert multimodal dataset in Data-Juicer format to a target +# dataset in LLaVA format. # # Corresponding Data-Juicer format: # - multi-chunk interleaved image-text sequence diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py new file mode 100644 index 000000000..d88937b14 --- /dev/null +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py @@ -0,0 +1,122 @@ +# This tool is used to convert multimodal dataset in Data-Juicer format to a target +# dataset in WavCaps format. +# +# Data-Juicer format: +# {'audios': ['./path/to/audio/2219.flac'], +# 'text': '