Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Op/video face blur mapper #253

Merged
merged 7 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ process:
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True in default.
caption_key: null # the key name of fields in samples to store captions for each images, the caption guide the diffusion model to produce what the image is
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # model name on huggingface to generate caption if caption_key is null
- image_face_blur_mapper: # mapper to blur faces detected in images.
- image_face_blur_mapper: # blur faces detected in images
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
Expand Down Expand Up @@ -151,6 +151,9 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
- video_face_blur_mapper: # blur faces detected in videos
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_remove_watermark_mapper: # Remove the watermarks in videos given regions
roi_strings: ['0,0,0.1,0.1'] # a given list of regions the watermarks locate. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
remove_words_with_incorrect_substrings_mapper,
replace_content_mapper, sentence_split_mapper,
video_captioning_from_audio_mapper,
video_captioning_from_video_mapper, video_ffmpeg_wrapped_mapper,
video_remove_watermark_mapper, video_resize_aspect_ratio_mapper,
video_captioning_from_video_mapper, video_face_blur_mapper,
video_ffmpeg_wrapped_mapper, video_remove_watermark_mapper,
video_resize_aspect_ratio_mapper,
video_resize_resolution_mapper, video_split_by_duration_mapper,
video_split_by_key_frame_mapper, video_split_by_scene_mapper,
video_tagging_from_audio_mapper,
Expand Down
111 changes: 111 additions & 0 deletions data_juicer/ops/mapper/video_face_blur_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import av

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
pil_to_opencv, process_each_frame)

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = 'video_face_blur_mapper'

with AvailabilityChecking(['dlib', 'Pillow'], OP_NAME):
import dlib
from PIL import ImageFilter


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoFaceBlurMapper(Mapper):
    """Mapper to blur faces detected in videos.

    Faces are located frame by frame with dlib's frontal face detector
    and each detected region is blurred in place with a Pillow filter.
    A video is re-encoded to a new file only when at least one of its
    frames was actually modified; otherwise the original path is kept.
    """

    # default keyword arguments forwarded to the dlib detector call;
    # callers may override them via **kwargs
    _default_kwargs = {'upsample_num_times': 0}

    def __init__(self,
                 blur_type: str = 'gaussian',
                 radius: float = 2,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param blur_type: Type of blur kernel, including
            ['mean', 'box', 'gaussian']. Note that 'mean' uses Pillow's
            fixed-size ``ImageFilter.BLUR`` kernel, so ``radius`` is
            ignored for that type.
        :param radius: Radius of blur kernel. Must be >= 0.
        :param args: extra args
        :param kwargs: extra args
        :raises ValueError: if ``blur_type`` is unsupported or
            ``radius`` is negative.
        """
        super().__init__(*args, **kwargs)
        self._init_parameters = self.remove_extra_parameters(locals())

        if blur_type not in ['mean', 'box', 'gaussian']:
            raise ValueError(
                f'Blur_type [{blur_type}] is not supported. '
                f'Can only be one of ["mean", "box", "gaussian"]. ')
        if radius < 0:
            raise ValueError('Radius must be >= 0. ')

        if blur_type == 'mean':
            self.blur = ImageFilter.BLUR
        elif blur_type == 'box':
            self.blur = ImageFilter.BoxBlur(radius)
        else:
            self.blur = ImageFilter.GaussianBlur(radius)

        self.blur_type = blur_type
        self.radius = radius

        # keep only the detector kwargs we recognize, falling back to
        # the documented defaults for any the caller did not provide
        self.extra_kwargs = {
            k: kwargs.get(k, v)
            for k, v in self._default_kwargs.items()
        }

        # dlib's HOG-based frontal face detector (CPU only)
        self.detector = dlib.get_frontal_face_detector()

    def process(self, sample, context=False):
        """Blur faces in every video of the sample and rewrite the
        sample's video keys to point at the processed files.

        :param sample: the sample dict to process.
        :param context: whether loaded video containers are cached in
            the sample context (if so, they are not closed here).
        """
        # there is no video in this sample
        if self.video_key not in sample or not sample[self.video_key]:
            return sample

        loaded_video_keys = sample[self.video_key]
        sample, videos = load_data_with_context(sample, context,
                                                loaded_video_keys, load_video)

        processed_video_keys = {}
        for video_key in loaded_video_keys:
            # skip duplicate
            if video_key in processed_video_keys:
                continue

            video = videos[video_key]
            blurred_video_key = transfer_filename(video_key, OP_NAME,
                                                  **self._init_parameters)
            # process_each_frame returns the original path when no frame
            # was modified, so face-free videos are not re-encoded
            output_video_key = process_each_frame(video, blurred_video_key,
                                                  self._blur_face)
            processed_video_keys[video_key] = output_video_key

            if not context:
                video.close()

        sample[self.video_key] = [
            processed_video_keys[key] for key in loaded_video_keys
        ]
        return sample

    def _blur_face(self, frame):
        """Blur all faces detected in a single ``av.VideoFrame``.

        Returns the *original* frame object unchanged when no face is
        found. This matters: ``process_each_frame`` detects modification
        via object identity (``new_frame != frame``), so always building
        a fresh frame would mark every video as modified and force a
        needless re-encode.
        """
        image = frame.to_image()
        # NOTE(review): assumes pil_to_opencv yields the channel order
        # dlib expects — confirm against the helper's implementation
        img = pil_to_opencv(image)
        dets = self.detector(img, **self.extra_kwargs)
        if len(dets) == 0:
            # no face detected: hand back the untouched input frame
            return frame
        for det in dets:
            # clamp the detection box to the image bounds
            x1 = max(det.left(), 0)
            y1 = max(det.top(), 0)
            x2 = min(det.right(), image.width)
            y2 = min(det.bottom(), image.height)
            blurred_roi = image.crop((x1, y1, x2, y2)).filter(self.blur)
            image.paste(blurred_roi, (x1, y1, x2, y2))
        return av.VideoFrame.from_image(image)
5 changes: 3 additions & 2 deletions data_juicer/ops/mapper/video_remove_watermark_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,9 @@ def process(self, sample, context=False):
def process_frame_func(frame):
return self._clean_watermark(frame, watermark_mask)

process_each_frame(video, cleaned_video_key,
process_frame_func)
cleaned_video_key = process_each_frame(video,
cleaned_video_key,
process_frame_func)

loaded_video_keys[index] = cleaned_video_key

Expand Down
12 changes: 12 additions & 0 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import datetime
import os
import re
import shutil
from typing import List, Union

import av
Expand Down Expand Up @@ -335,6 +336,8 @@ def process_each_frame(input_video: Union[str, av.container.InputContainer],
:param frame_func: a function which inputs a frame and outputs another
frame.
"""
frame_modified = False

# open the original video
if isinstance(input_video, str):
container = av.open(input_video)
Expand Down Expand Up @@ -364,6 +367,8 @@ def process_each_frame(input_video: Union[str, av.container.InputContainer],
for packet in container.demux(input_video_stream):
for frame in packet.decode():
new_frame = frame_func(frame)
if new_frame != frame:
HYLcool marked this conversation as resolved.
Show resolved Hide resolved
frame_modified = True
# for resize cases
output_video_stream.width = new_frame.width
output_video_stream.height = new_frame.height
Expand All @@ -379,6 +384,13 @@ def process_each_frame(input_video: Union[str, av.container.InputContainer],
container.close()
output_container.close()

if frame_modified:
return output_video
else:
shutil.rmtree(output_video, ignore_errors=True)
return (input_video
if isinstance(input_video, str) else input_video.name)


def extract_key_frames(input_video: Union[str, av.container.InputContainer]):
"""
Expand Down
3 changes: 2 additions & 1 deletion docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
| Type | Number | Description |
|-----------------------------------|:------:|-------------------------------------------------|
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 40 | Edits and transforms samples |
| [ Mapper ]( #mapper ) | 41 | Edits and transforms samples |
| [ Filter ]( #filter ) | 36 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |
Expand Down Expand Up @@ -80,6 +80,7 @@ All the specific operators are listed below, each featured with several capabili
| sentence_split_mapper | General | en | Splits and reorganizes sentences according to semantics |
| video_captioning_from_audio_mapper | Multimodal | - | Caption a video according to its audio streams based on Qwen-Audio model |
| video_captioning_from_video_mapper | Multimodal | - | generate samples whose captions are generated based on another model (video-blip) and sampled video frame within the original sample |
| video_face_blur_mapper | Video | - | Blur faces detected in videos |
| video_ffmpeg_wrapped_mapper | Video | - | Simple wrapper to run a FFmpeg video filter |
| video_remove_watermark_mapper | Video | - | Remove the watermarks in videos given regions |
| video_resize_aspect_ratio_mapper | Video | - | Resize video aspect ratio to a specified range |
Expand Down
3 changes: 2 additions & 1 deletion docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| 类型 | 数量 | 描述 |
|------------------------------------|:--:|---------------|
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 40 | 对数据样本进行编辑和转换 |
| [ Mapper ]( #mapper ) | 41 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 36 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |
Expand Down Expand Up @@ -79,6 +79,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| sentence_split_mapper | General | en | 根据语义拆分和重组句子 |
| video_captioning_from_audio_mapper | Multimodal | - | 基于 Qwen-Audio 模型根据视频的音频流为视频生成新的标题描述 |
| video_captioning_from_video_mapper | Multimodal | - | 生成样本,其标题是根据另一个辅助模型(video-blip)和原始样本中的视频中指定帧的图像。 |
| video_face_blur_mapper | Video | - | 对视频中的人脸进行模糊处理 |
| video_ffmpeg_wrapped_mapper | Video | - | 运行 FFmpeg 视频过滤器的简单封装 |
| video_remove_watermark_mapper | Video | - | 去除视频中给定区域的水印 |
| video_resize_aspect_ratio_mapper | Video | - | 将视频的宽高比调整到指定范围内 |
Expand Down
8 changes: 8 additions & 0 deletions docs/sphinx_doc/source/data_juicer.ops.mapper.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ data\_juicer.ops.mapper.image\_diffusion\_mapper
:undoc-members:
:show-inheritance:

data\_juicer.ops.mapper.image\_face\_blur\_mapper
-------------------------------------------------------------

.. automodule:: data_juicer.ops.mapper.image_face_blur_mapper
:members:
:undoc-members:
:show-inheritance:

data\_juicer.ops.mapper.nlpaug\_en\_mapper
-------------------------------------------------

Expand Down
85 changes: 85 additions & 0 deletions tests/ops/mapper/test_video_face_blur_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import shutil
import unittest

from datasets import Dataset

from data_juicer.ops.mapper.video_face_blur_mapper import VideoFaceBlurMapper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase


class VideoFaceBlurMapperTest(DataJuicerTestCaseBase):
    """Tests for VideoFaceBlurMapper: one case per blur type plus a
    multi-process run. Output videos are copied aside for manual
    inspection, since blur quality cannot be asserted automatically."""

    maxDiff = None

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                             'data')
    vid1_path = os.path.join(data_path, 'video1.mp4')
    vid4_path = os.path.join(data_path, 'video4.mp4')
    vid5_path = os.path.join(data_path, 'video5.mp4')

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # fresh directory that collects the processed videos of this run
        cls.chk_path = os.path.join(cls.data_path, cls.__name__)
        shutil.rmtree(cls.chk_path, ignore_errors=True)
        os.makedirs(cls.chk_path)

    def _video_samples(self):
        # one single-video sample per test clip
        return [{'videos': [p]}
                for p in (self.vid1_path, self.vid4_path, self.vid5_path)]

    def _run_helper(self, op, source_list, np=1):
        dataset = Dataset.from_list(source_list)
        dataset = dataset.map(op.process, num_proc=np)
        res_list = dataset.to_list()
        for source, res in zip(source_list, res_list):
            self.assertEqual(len(source[op.video_key]), len(res[op.video_key]))
            # for manual check
            for path in res[op.video_key]:
                basename = os.path.basename(path)
                dst = f'{self.chk_path}/{op.blur_type}:{op.radius}_np:{np}_{basename}'
                shutil.copy(path, dst)

    def test_gaussian_radius(self):
        op = VideoFaceBlurMapper(blur_type='gaussian', radius=10)
        self._run_helper(op, self._video_samples())

    def test_box_radius(self):
        op = VideoFaceBlurMapper(blur_type='box', radius=10)
        self._run_helper(op, self._video_samples())

    def test_mean(self):
        op = VideoFaceBlurMapper(blur_type='mean')
        self._run_helper(op, self._video_samples())

    def test_gaussian_radius_parallel(self):
        op = VideoFaceBlurMapper(blur_type='gaussian', radius=10)
        self._run_helper(op, self._video_samples(), np=3)

# allow running this test module directly, e.g. `python test_video_face_blur_mapper.py`
if __name__ == '__main__':
    unittest.main()
Loading