modelscope · drcege · Dec 6, 2023 · Nov 28, 2023 · Dec 4, 2023 · Dec 6, 2023
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -110,6 +110,10 @@ process:
       rep_len: 10                                             # repetition length for char-level n-gram
       min_ratio: 0.0                                          # the min ratio of filter range
       max_ratio: 0.5                                          # the max ratio of filter range
+  - face_area_filter:                                       # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
+      min_ratio: 0.0                                          # the min face area ratio of filter range
+      max_ratio: 0.4                                          # the max face area ratio of filter range
+      upsample_num_times: 0                                   # optional argument passed to the underlying dlib face detector
   - flagged_words_filter:                                   # filter text with the flagged-word ratio larger than a specific max value
       lang: en                                                # consider flagged words in what language
       tokenization: false                                     # whether to use model to tokenize documents

diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
@@ -1,10 +1,10 @@
 from . import (alphanumeric_filter, average_line_length_filter,
-               character_repetition_filter, flagged_words_filter,
-               image_aspect_ratio_filter, image_shape_filter,
-               image_size_filter, image_text_matching_filter,
-               image_text_similarity_filter, language_id_score_filter,
-               maximum_line_length_filter, perplexity_filter,
-               special_characters_filter, specified_field_filter,
-               specified_numeric_field_filter, stopwords_filter, suffix_filter,
-               text_length_filter, token_num_filter, word_num_filter,
-               word_repetition_filter)
+               character_repetition_filter, face_area_filter,
+               flagged_words_filter, image_aspect_ratio_filter,
+               image_shape_filter, image_size_filter,
+               image_text_matching_filter, image_text_similarity_filter,
+               language_id_score_filter, maximum_line_length_filter,
+               perplexity_filter, special_characters_filter,
+               specified_field_filter, specified_numeric_field_filter,
+               stopwords_filter, suffix_filter, text_length_filter,
+               token_num_filter, word_num_filter, word_repetition_filter)
diff --git a/data_juicer/ops/filter/face_area_filter.py b/data_juicer/ops/filter/face_area_filter.py
@@ -0,0 +1,125 @@
+import numpy as np
+from jsonargparse.typing import ClosedUnitInterval
+
+from data_juicer.utils.availability_utils import AvailabilityChecking
+from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.mm_utils import load_image, pil_to_opencv
+
+from ..base_op import OPERATORS, Filter
+from ..op_fusion import LOADED_IMAGES
+
+OP_NAME = 'face_area_filter'
+
+with AvailabilityChecking(['dlib'], OP_NAME):
+    import dlib
+
+
+@OPERATORS.register_module(OP_NAME)
+@LOADED_IMAGES.register_module(OP_NAME)
+class FaceAreaFilter(Filter):
+    """Filter to keep samples with face area ratio within a specific range.
+    """
+
+    def __init__(self,
+                 min_ratio: ClosedUnitInterval = 0.0,
+                 max_ratio: ClosedUnitInterval = 0.4,
+                 any_or_all: str = 'any',
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param min_ratio: Min ratio for the largest face area in an image.
+        :param max_ratio: Max ratio for the largest face area in an image.
+        :param any_or_all: Keep this sample with 'any' or 'all' strategy of
+            all images. 'any': keep this sample if any images meet the
+            condition. 'all': keep this sample only if all images meet the
+            condition.
+        :param args: Extra positional arguments.
+        :param kwargs: Extra keyword arguments.
+        """
+
+        # Extract face detector arguments from kwargs
+        detector_keys = ['upsample_num_times']
+        self.detector_kwargs = {
+            key: kwargs.pop(key)
+            for key in detector_keys if key in kwargs
+        }
+
+        super().__init__(*args, **kwargs)
+        self.min_ratio = min_ratio
+        self.max_ratio = max_ratio
+
+        if any_or_all not in ['any', 'all']:
+            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
+                             f'Can only be one of ["any", "all"].')
+        self.any = (any_or_all == 'any')
+
+        # Initialize face detector
+        self.detector = dlib.get_frontal_face_detector()
+
+    def compute_stats(self, sample, context=False):
+        # check if it's computed already
+        if StatsKeys.face_ratios in sample[Fields.stats]:
+            return sample
+
+        # there is no image in this sample
+        if self.image_key not in sample or not sample[self.image_key]:
+            sample[Fields.stats][StatsKeys.face_ratios] = np.array(
+                [], dtype=np.float64)
+            return sample
+
+        # load images
+        loaded_image_keys = sample[self.image_key]
+        images = {}
+        for loaded_image_key in loaded_image_keys:
+            if context and loaded_image_key in sample[Fields.context]:
+                # load from context
+                images[loaded_image_key] = sample[
+                    Fields.context][loaded_image_key]
+            else:
+                if loaded_image_key not in images:
+                    # avoid load the same images
+                    image = load_image(loaded_image_key)
+                    images[loaded_image_key] = image
+                if context:
+                    # store the image data into context
+                    sample[Fields.context][loaded_image_key] = image
+
+        # detect faces
+        face_detections = {}
+        for key, image in images.items():
+            img = pil_to_opencv(image)
+            dets = self.detector(img, **self.detector_kwargs)
+            face_detections[key] = [[
+                det.left(), det.top(),
+                det.width(), det.height()
+            ] for det in dets]
+
+        # compute face area ratios for each image considering the largest face
+        face_area_ratios = {}
+        for key, dets in face_detections.items():
+            image_area = images[key].width * images[key].height
+            face_area_ratios[key] = max(
+                [w * h / image_area for _, _, w, h in dets], default=0.0)
+
+        sample[Fields.stats][StatsKeys.face_ratios] = [
+            face_area_ratios[key] for key in loaded_image_keys
+        ]
+        return sample
+
+    def process(self, sample):
+        face_ratios = sample[Fields.stats][StatsKeys.face_ratios]
+        if len(face_ratios) <= 0:
+            return True
+
+        keep_bools = np.array([
+            self.min_ratio <= face_ratio <= self.max_ratio
+            for face_ratio in face_ratios
+        ])
+
+        # different strategies
+        if self.any:
+            return keep_bools.any()
+        else:
+            return keep_bools.all()
diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py
@@ -31,6 +31,8 @@ class StatsKeys(object):
     image_width = 'image_width'
     image_height = 'image_height'
     image_sizes = 'image_sizes'
+    face_ratios = 'face_ratios'
+    face_detections = 'face_detections'
 
     # multimodal
     image_text_similarity = 'image_text_similarity'

diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
@@ -1,3 +1,4 @@
+import numpy as np
 from datasets import Audio, Image
 
 from data_juicer.utils.constant import DEFAULT_PREFIX
@@ -49,6 +50,15 @@ def load_audio(path, sampling_rate=None):
     return (aud['array'], aud['sampling_rate'])
 
 
+def pil_to_opencv(pil_image):
+    if pil_image.mode != 'RGB':
+        pil_image = pil_image.convert('RGB')
+    numpy_image = np.array(pil_image)
+    # RGB to BGR
+    opencv_image = numpy_image[:, :, ::-1]
+    return opencv_image
+
+
 def get_image_size(path, ):
     import os
     return os.path.getsize(path)

diff --git a/docs/Operators.md b/docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
 | [ Mapper ]( #mapper )             |   21   | Edits and transforms samples                    |
-| [ Filter ]( #filter )             |   21   | Filters out low-quality samples                 |
+| [ Filter ]( #filter )             |   22   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
 
@@ -47,16 +47,16 @@ All the specific operators are listed below, each featured with several capabili
 
 | Operator                                            | Domain             | Lang   | Description                                                                                                    |
 |-----------------------------------------------------|--------------------|--------|----------------------------------------------------------------------------------------------------------------|
-| chinese_convert_mapper                              | General            | zh     | Convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji (by [opencc](https://github.com/BYVoid/OpenCC))                 |
+| chinese_convert_mapper                              | General            | zh     | Converts Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji (by [opencc](https://github.com/BYVoid/OpenCC))                 |
 | clean_copyright_mapper                              | Code               | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*)          |
 | clean_email_mapper                                  | General            | en, zh | Removes email information                                                                                      |
 | clean_html_mapper                                   | General            | en, zh | Removes HTML tags and returns plain text of all the nodes                                                      |
 | clean_ip_mapper                                     | General            | en, zh | Removes IP addresses                                                                                           |
 | clean_links_mapper                                  | General, Code      | en, zh | Removes links, such as those starting with http or ftp                                                         |
 | expand_macro_mapper                                 | LaTeX              | en, zh | Expands macros usually defined at the top of TeX documents                                                     |
 | fix_unicode_mapper                                  | General            | en, zh | Fixes broken Unicodes (by [ftfy](https://ftfy.readthedocs.io/))                                                |
-| nlpaug_en_mapper                                    | General            | en     | Simply augment texts in English based on the `nlpaug` library                                                  | 
-| nlpcda_zh_mapper                                    | General            | zh     | Simply augment texts in Chinese based on the `nlpcda` library                                                  | 
+| nlpaug_en_mapper                                    | General            | en     | Simply augments texts in English based on the `nlpaug` library                                                 | 
+| nlpcda_zh_mapper                                    | General            | zh     | Simply augments texts in Chinese based on the `nlpcda` library                                                 | 
 | punctuation_normalization_mapper                    | General            | en, zh | Normalizes various Unicode punctuations to their ASCII equivalents                                             |
 | remove_bibliography_mapper                          | LaTeX              | en, zh | Removes the bibliography of TeX documents                                                                      |
 | remove_comments_mapper                              | LaTeX              | en, zh | Removes the comments of TeX documents                                                                          |
@@ -77,10 +77,11 @@ All the specific operators are listed below, each featured with several capabili
 | alphanumeric_filter            | General | en, zh | Keeps samples with alphanumeric ratio within the specified range                           |
 | average_line_length_filter     | Code    | en, zh | Keeps samples with average line length within the specified range                          |
 | character_repetition_filter    | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range           |
+| face_area_filter               | Image   |   -    | Keeps samples containing images with face area ratios within the specified range           |
 | flagged_words_filter           | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold                        |
-| image_aspect_ratio_filter      | Image   |   -    | Keeps samples contains images with aspect ratios within specific range                     |
-| image_shape_filter             | Image   |   -    | Keeps samples contains images with widths and heights within specific ranges               |
-| image_size_filter              | Image   |   -    | Keeps samples contains images whose size in bytes are within specific range                     |
+| image_aspect_ratio_filter      | Image   |   -    | Keeps samples containing images with aspect ratios within the specified range              |
+| image_shape_filter             | Image   |   -    | Keeps samples containing images with widths and heights within the specified range         |
+| image_size_filter              | Image   |   -    | Keeps samples containing images whose sizes in bytes are within the specified range        |
 | image_text_matching_filter     | Multimodal |   -    |  Keeps samples with image-text classification matching score within the specified range based on a BLIP model          |
 | image_text_similarity_filter   | Multimodal |   -    |  Keeps samples with image-text feature cosine similarity within the specified range based on a CLIP model          |
 | language_id_score_filter       | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score            |
@@ -99,12 +100,12 @@ All the specific operators are listed below, each featured with several capabili
 
 ## Deduplicator <a name="deduplicator"/>
 
-| Operator                      | Domain  | Lang   | Description                                                 |
-|-------------------------------|---------|--------|-------------------------------------------------------------|
-| document_deduplicator         | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
-| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH      |
-| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash         |
-| image_deduplicator            | Image   |   -    | Deduplicate samples at document-level using exact matching of images between documents |
+| Operator                      | Domain  | Lang   | Description                                                  |
+|-------------------------------|---------|--------|--------------------------------------------------------------|
+| document_deduplicator         | General | en, zh | Deduplicates samples at document-level by comparing MD5 hash |
+| document_minhash_deduplicator | General | en, zh | Deduplicates samples at document-level using MinHashLSH      |
+| document_simhash_deduplicator | General | en, zh | Deduplicates samples at document-level using SimHash         |
+| image_deduplicator            | Image   |   -    | Deduplicates samples at document-level using exact matching of images between documents |
 
 
 ## Selector <a name="selector"/>

diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
@@ -10,7 +10,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
 | [ Mapper ]( #mapper )              | 21 | 对数据样本进行编辑和转换  |
-| [ Filter ]( #filter )              | 21  | 过滤低质量样本       |
+| [ Filter ]( #filter )              | 22 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
 
@@ -74,6 +74,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | alphanumeric_filter            | General | en, zh | 保留字母数字比例在指定范围内的样本                  |
 | average_line_length_filter     | Code    | en, zh | 保留平均行长度在指定范围内的样本                   |
 | character_repetition_filter    | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 |
+| face_area_filter               | Image   |   -    | 保留样本中包含的图片的最大脸部区域面积占比在指定范围内的样本 |
 | flagged_words_filter           | General | en, zh | 保留使标记字比率保持在指定阈值以下的样本               |
 | image_aspect_ratio_filter      | Image   | -      | 保留样本中包含的图片的宽高比在指定范围内的样本            |
 | image_shape_filter             | Image   |   -    | 保留样本中包含的图片的形状（即宽和高）在指定范围内的样本       |

diff --git a/environments/science_requires.txt b/environments/science_requires.txt
@@ -12,3 +12,4 @@ transformers
 opencc==1.1.6
 imagededup
 torch
+dlib
diff --git a/tests/ops/data/lena-face.jpg b/tests/ops/data/lena-face.jpg
diff --git a/tests/ops/data/lena.jpg b/tests/ops/data/lena.jpg
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,3 +12,4 @@ transformers @@
     opencc==1.1.6
     imagededup
     torch
+    dlib