Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove watermark mapper #236

Merged
merged 8 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,14 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_remove_watermark_mapper: # Remove the watermarks in videos given regions
roi_strings: ['0,0,0.1,0.1'] # a given list of regions the watermarks locate. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
roi_type: ratio # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by widths and heights.
roi_key: null # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
frame_num: 10 # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
min_frame_threshold: 7 # a coordinate is considered as the location of a watermark pixel when it is a watermark pixel in no less than min_frame_threshold frames.
detection_method: pixel_value # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
- video_resize_aspect_ratio_mapper: # resize videos aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
min_ratio: 9/21 # the minimum aspect ratio to enforce videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
max_ratio: 21/9 # the maximum aspect ratio to enforce videos with an aspect ratio above `max_ratio` will be resized to match this maximum ratio. The ratio should be provided as a string in the format "21:9" or "21/9".
Expand All @@ -164,13 +167,17 @@ process:
max_height: 1080 # the max vertical resolution (unit p), videos with height more than 'max_height' will be mapped to videos with equal or smaller height
force_original_aspect_ratio: 'increase' # Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio
force_divisible_by: 4 # Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_split_by_duration_mapper: # Mapper to split video by duration.
split_duration: 10 # duration of each video split in seconds.
min_last_split_duration: 0.1 # the minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_key_frame_mapper: # Mapper to split video by key frame.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
replace_content_mapper, sentence_split_mapper,
video_captioning_from_audio_mapper,
video_captioning_from_video_mapper, video_ffmpeg_wrapped_mapper,
video_resize_aspect_ratio_mapper,
video_remove_watermark_mapper, video_resize_aspect_ratio_mapper,
video_resize_resolution_mapper, video_split_by_duration_mapper,
video_split_by_key_frame_mapper, video_split_by_scene_mapper,
video_tagging_from_audio_mapper,
Expand Down
224 changes: 224 additions & 0 deletions data_juicer/ops/mapper/video_remove_watermark_mapper.py
BeachWang marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import os

import av
import numpy as np
from jsonargparse.typing import List, PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
load_data_with_context, load_video,
parse_string_to_roi,
process_each_frame)

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = 'video_remove_watermark_mapper'

with AvailabilityChecking(['opencv-python'], OP_NAME), HiddenPrints():
import cv2 as cv


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoRemoveWatermarkMapper(Mapper):
    """
    Remove the watermarks in videos given regions.

    Watermark pixels are detected from a set of frames sampled uniformly
    from each video, and then removed from every frame via OpenCV
    inpainting.
    """

    def __init__(self,
                 roi_strings: List[str] = ['0,0,0.1,0.1'],
                 roi_type: str = 'ratio',
                 roi_key: str = None,
                 frame_num: PositiveInt = 10,
                 min_frame_threshold: PositiveInt = 7,
                 detection_method: str = 'pixel_value',
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param roi_strings: a given list of regions the watermarks locate.
            The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)",
            or "[x1, y1, x2, y2]".
        :param roi_type: the roi string type. When the type is 'pixel', (x1,
            y1), (x2, y2) are the locations of pixels in the top left corner
            and the bottom right corner respectively. If the roi_type is
            'ratio', the coordinates are normalized by widths and heights.
        :param roi_key: the key name of fields in samples to store
            roi_strings for each sample. It's used to set different rois for
            different samples. If it's None, use the rois in parameter
            "roi_strings". It's None in default.
        :param frame_num: the number of frames to be extracted uniformly from
            the video to detect the pixels of watermark.
        :param min_frame_threshold: a coordinate is considered as the
            location of a watermark pixel when it is a watermark pixel in no
            less than min_frame_threshold frames.
        :param detection_method: the method to detect the pixels of
            watermark. If it is 'pixel_value', we consider the distribution
            of pixel value in each frame. If it is 'pixel_diversity', we will
            consider the pixel diversity in different frames. The
            min_frame_threshold is useless and frame_num must be greater
            than 1 in 'pixel_diversity' mode.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self._init_parameters = self.remove_extra_parameters(locals())

        if roi_type not in ['ratio', 'pixel']:
            raise ValueError(f'roi_type [{roi_type}]'
                             f' is not supported. '
                             f"Can only be one of ['ratio', 'pixel']. ")

        if detection_method not in ['pixel_value', 'pixel_diversity']:
            raise ValueError(
                f'detection_method [{detection_method}]'
                f' is not supported. '
                f"Can only be one of ['pixel_value', 'pixel_diversity']. ")

        if detection_method == 'pixel_diversity' and frame_num < 2:
            raise ValueError(
                "frame_num must be greater than 1 in 'pixel_diversity' mode.")

        # parse the static rois once at init time; when roi_key is set,
        # rois are parsed per-sample in _generate_watermark_mask instead
        rois = []
        if roi_key is None:
            for roi_string in roi_strings:
                roi = parse_string_to_roi(roi_string, roi_type)
                if roi is None:
                    raise ValueError(
                        'The roi in roi_strings must be four non-negative'
                        ' numbers in the format of "x1, y1, x2, y2", '
                        '"(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".')
                rois.append(roi)

        self.roi_type = roi_type
        self.rois = rois
        self.roi_key = roi_key
        self.frame_num = frame_num
        self.min_frame_threshold = min_frame_threshold
        self.detection_method = detection_method

    def _detect_watermark_via_pixel_value(self, frames, rois):
        """
        Detect watermark pixels from the value distribution in each frame.

        Each roi of each frame is binarized with Otsu's threshold; a pixel
        is kept in the final mask only if it is marked as watermark in at
        least ``min_frame_threshold`` frames.

        :param frames: a list of av video frames sampled from the video.
        :param rois: a list of (x1, y1, x2, y2) pixel-coordinate tuples.
        :return: a uint8 mask of the full frame size with watermark pixels
            set to 255.
        """
        masks = []
        for frame in frames:
            frame = frame.to_ndarray(format='bgr24')
            mask = np.zeros_like(frame[:, :, 0], dtype=np.uint8)
            for roi in rois:
                # dimension of ndarray frame: height x width x channel
                roi_frame = frame[roi[1]:roi[3], roi[0]:roi[2]]
                gray_frame = cv.cvtColor(roi_frame, cv.COLOR_BGR2GRAY)
                _, binary_frame = cv.threshold(
                    gray_frame, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)

                # assume the watermark is located inside the box, so the
                # pixels on the edge must be 0; if most of them are not,
                # Otsu assigned the classes the other way round, so
                # reverse binary_frame
                edge_positive_num = (binary_frame[0] >
                                     0).sum() + (binary_frame[:, 0] > 0).sum()
                total = binary_frame.shape[0] + binary_frame.shape[1]
                if edge_positive_num * 2 > total:
                    binary_frame = ~binary_frame

                mask[roi[1]:roi[3],
                     roi[0]:roi[2]] = mask[roi[1]:roi[3],
                                           roi[0]:roi[2]] | binary_frame
            masks.append(mask)
        # vote across frames: keep pixels flagged in enough frames
        final_mask = sum((mask == 255).astype(np.uint8) for mask in masks)
        final_mask = np.where(final_mask >= self.min_frame_threshold, 255, 0)
        final_mask = final_mask.astype(np.uint8)
        return final_mask

    def _detect_watermark_via_pixel_diversity(self, frames, rois):
        """
        Detect watermark pixels from their diversity across frames.

        Watermark pixels stay (nearly) constant over time, so pixels whose
        standard deviation across the sampled frames is low are flagged.

        :param frames: a list of av video frames sampled from the video.
        :param rois: a list of (x1, y1, x2, y2) pixel-coordinate tuples.
        :return: a uint8 mask of the full frame size with watermark pixels
            set to 255.
        """
        mask = np.zeros((frames[0].height, frames[0].width), dtype=np.uint8)
        frames = [frame.to_ndarray(format='bgr24') for frame in frames]

        for roi in rois:
            roi_frames = [
                frame[roi[1]:roi[3], roi[0]:roi[2]] for frame in frames
            ]
            roi_frames = np.stack(roi_frames, axis=0)
            pixel_diversity = roi_frames.std(axis=0)
            pixel_diversity = pixel_diversity.sum(-1)
            max_diversity = np.max(pixel_diversity)
            min_diversity = np.min(pixel_diversity)
            if max_diversity <= min_diversity:
                # every pixel has the same diversity, so there is no signal
                # to separate watermark pixels in this roi; skip it to avoid
                # a division by zero below
                continue
            scaled_diversity = 255 * (pixel_diversity - min_diversity) / (
                max_diversity - min_diversity)
            scaled_diversity = scaled_diversity.astype(np.uint8)
            _, binary_frame = cv.threshold(scaled_diversity, 0, 255,
                                           cv.THRESH_BINARY + cv.THRESH_OTSU)
            # the watermark pixels have less diversity
            binary_frame = ~binary_frame
            mask[roi[1]:roi[3],
                 roi[0]:roi[2]] = mask[roi[1]:roi[3],
                                       roi[0]:roi[2]] | binary_frame

        return mask

    def _generate_watermark_mask(self, video, sample):
        """
        Build the watermark mask for one video.

        Samples ``frame_num`` frames uniformly, resolves the rois (either
        per-sample from ``roi_key`` or the static ones, scaling ratio rois
        to pixel coordinates), runs the configured detection method, and
        dilates the result so inpainting covers the watermark edges.

        :param video: an opened av container for the video.
        :param sample: the sample dict, used when ``roi_key`` is set.
        :return: a uint8 watermark mask of the frame size.
        """
        frames = extract_video_frames_uniformly(video, self.frame_num)

        if self.roi_key is not None:
            roi_strings = sample[self.roi_key]
            if isinstance(roi_strings, str):
                roi_strings = [roi_strings]
            rois = [
                parse_string_to_roi(roi_string, self.roi_type)
                for roi_string in roi_strings
            ]
            # silently drop malformed roi strings for per-sample rois
            rois = [roi for roi in rois if roi is not None]
        else:
            rois = self.rois
        if self.roi_type == 'ratio':
            # scale normalized rois to pixel coordinates of this video
            rois = [
                tuple([
                    int(roi[0] * frames[0].width),
                    int(roi[1] * frames[0].height),
                    int(roi[2] * frames[0].width),
                    int(roi[3] * frames[0].height)
                ]) for roi in rois
            ]

        if self.detection_method == 'pixel_value':
            mask = self._detect_watermark_via_pixel_value(frames, rois)
        else:
            mask = self._detect_watermark_via_pixel_diversity(frames, rois)

        # dilate the mask so inpainting also covers the watermark border
        kernel = np.ones((5, 5), np.uint8)
        return cv.dilate(mask, kernel)

    def _clean_watermark(self, frame, watermark_mask):
        """
        Remove the masked watermark pixels from one frame via inpainting.

        :param frame: an av video frame.
        :param watermark_mask: uint8 mask with watermark pixels set to 255.
        :return: a new av video frame with the watermark inpainted.
        """
        np_frame = frame.to_ndarray(format='bgr24')
        new_np_frame = cv.inpaint(np_frame, watermark_mask, 3, cv.INPAINT_NS)
        return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24')

    def process(self, sample, context=False):
        """
        Remove watermarks from every video in the sample.

        Each cleaned video is written to a new file derived from the
        original name; the sample's video keys are updated to point to the
        cleaned files.

        :param sample: the sample to process.
        :param context: whether to cache loaded videos in the sample
            context.
        :return: the processed sample.
        """
        # there is no video in this sample
        if self.video_key not in sample or not sample[self.video_key]:
            return sample

        loaded_video_keys = sample[self.video_key]
        sample, videos = load_data_with_context(sample, context,
                                                loaded_video_keys, load_video)

        for index, video_key in enumerate(loaded_video_keys):
            video = videos[video_key]
            cleaned_video_key = transfer_filename(video_key, OP_NAME,
                                                  **self._init_parameters)

            # only regenerate when the cleaned file is absent or the sample
            # does not already reference it
            if (not os.path.exists(cleaned_video_key)
                    or cleaned_video_key not in loaded_video_keys):
                watermark_mask = self._generate_watermark_mask(video, sample)

                def process_frame_func(frame):
                    return self._clean_watermark(frame, watermark_mask)

                process_each_frame(video, cleaned_video_key,
                                   process_frame_func)

            loaded_video_keys[index] = cleaned_video_key

        sample[self.video_key] = loaded_video_keys
        return sample
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/video_split_by_duration_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,8 @@ def split_videos_by_duration(self, video_key, container):

def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample \
or sample[self.video_key] is None \
or len(sample[self.video_key]) == 0:
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
return []

# the split results
Expand Down
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/video_split_by_key_frame_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,8 @@ def get_split_key_frame(self, video_key, container):

def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample \
or sample[self.video_key] is None \
or len(sample[self.video_key]) == 0:
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
return []

# the split results
Expand Down
Loading
Loading