modelscope · chenhesen · Dec 27, 2023 · Nov 16, 2023 · Nov 20, 2023 · Nov 21, 2023
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -85,6 +85,10 @@ process:
       keep_alphabet: true                                     # whether to keep alphabet
       keep_number: true                                       # whether to keep number
       keep_punc: true                                         # whether to keep punctuation
+  - remove_repeat_sentences_mapper:                         # remove repeated sentences in text samples.
+      lowercase: false                                        # whether to convert sample text to lower case
+      ignore_special_character: true                          # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, letters and numbers
+      min_repeat_sentence_length: 2                           # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
   - remove_specific_chars_mapper:                           # remove characters specified by users
       chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□'                        # a string or a list including those characters that need to be removed
   - remove_table_text_mapper:                               # remove possible table texts from text.

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -5,6 +5,7 @@
                punctuation_normalization_mapper, remove_bibliography_mapper,
                remove_comments_mapper, remove_header_mapper,
                remove_long_words_mapper, remove_non_chinese_character_mapper,
-               remove_specific_chars_mapper, remove_table_text_mapper,
+               remove_repeat_sentences_mapper, remove_specific_chars_mapper,
+               remove_table_text_mapper,
                remove_words_with_incorrect_substrings_mapper,
                sentence_split_mapper, whitespace_normalization_mapper)
diff --git a/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py b/data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
@@ -0,0 +1,71 @@
+import regex as re
+
+from ..base_op import OPERATORS, Mapper
+
+
+def split_sentence(text):
+    text = re.sub('([.。！!？\?])([^’”])',r'\1\n\2',text)                # noqa
+    text = re.sub('(\.{6})([^’”])',r'\1\n\2',text)                      # noqa
+    text = re.sub('(\…{2})([^’”])',r'\1\n\2',text)                      # noqa
+    text = re.sub('([.。!！？\?\.{6}\…{2}][’”])([^’”])',r'\1\n\2',text)  # noqa
+    return text.split('\n')
+
+
+@OPERATORS.register_module('remove_repeat_sentences_mapper')
+class RemoveRepeatSentencesMapper(Mapper):
+    """Mapper to remove repeat sentences in text samples."""
+
+    def __init__(self,
+                 lowercase: bool = False,
+                 ignore_special_character: bool = True,
+                 min_repeat_sentence_length: int = 2,
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param lowercase: Whether to convert sample text to lower case
+        :param ignore_special_character: Whether to ignore special
+            characters when judging repeated sentences. Special characters
+            are all characters except Chinese characters, letters and
+            numbers.
+        :param min_repeat_sentence_length: Sentences shorter than this
+            length will not be deduplicated. If ignore_special_character is
+            set to True, then special characters are not included in this
+            length.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+
+        super().__init__(*args, **kwargs)
+        self.lowercase = lowercase
+        self.min_repeat_sentence_length = min_repeat_sentence_length
+        self.remove_regex = re.compile(
+            r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
+        ) if ignore_special_character else None
+
+    def process(self, sample):
+
+        lines = [e for e in sample[self.text_key].split('\n')]
+        new_lines = []
+        hash_set = set([])
+        for line in lines:
+            new_sent = ''
+            if line:
+                sentences = split_sentence(line)
+                for sentence in sentences:
+                    copy = sentence.strip()
+                    if self.lowercase:
+                        copy = copy.lower()
+                    if self.remove_regex:
+                        copy = self.remove_regex.sub('', copy)
+
+                    if len(copy) < self.min_repeat_sentence_length:
+                        new_sent += sentence
+                    elif copy not in hash_set:
+                        new_sent += sentence
+                        hash_set.add(copy)
+            new_lines.append(new_sent)
+
+        sample[self.text_key] = '\n'.join(new_lines)
+        return sample
diff --git a/docs/Operators.md b/docs/Operators.md
@@ -10,7 +10,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   7    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   21   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   22   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   24   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   4    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   2    | Selects top samples based on ranking            |
@@ -63,6 +63,7 @@ All the specific operators are listed below, each featured with several capabili
 | remove_header_mapper                                | LaTeX              | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names                   |
 | remove_long_words_mapper                            | General            | en, zh | Removes words with length outside the specified range                                                          |
 | remove_non_chinese_character_mapper                 | General            | en, zh | Remove non Chinese character in text samples. |
+| remove_repeat_sentences_mapper                      | General            | en, zh | Removes repeated sentences in text samples |
 | remove_specific_chars_mapper                        | General            | en, zh | Removes any user-specified characters or substrings                                                            |
 | remove_table_text_mapper                            | General, Financial | en     | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) |
 | remove_words_with_incorrect_<br />substrings_mapper | General            | en, zh | Removes words containing specified substrings                                                                  |

diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
@@ -9,7 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                 | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  7 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 21 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 22 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 24 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  4 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  2 | 基于排序选取高质量样本   |
@@ -60,7 +60,8 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | remove_comments_mapper                              | LaTeX                 | en, zh    | 删除 TeX 文档中的注释                                          |
 | remove_header_mapper                                | LaTeX                 | en, zh    | 删除 TeX 文档头，例如标题、章节数字/名称等                               |
 | remove_long_words_mapper                            | General               | en, zh    | 删除长度超出指定范围的单词                                          |
-| remove_non_chinese_character_mapper                 | General               | en, zh    | 删除样本中的非中文字符
+| remove_non_chinese_character_mapper                 | General               | en, zh    | 删除样本中的非中文字符                                              |
+| remove_repeat_sentences_mapper                      | General               | en, zh    | 删除样本中的重复句子                                                |
 | remove_specific_chars_mapper                        | General               | en, zh    | 删除任何用户指定的字符或子字符串                                       |
 | remove_table_text_mapper                            | General, Financial    | en        | 检测并删除可能的表格内容（:warning: 依赖正则表达式匹配，因此很脆弱）                |
 | remove_words_with_incorrect_<br />substrings_mapper | General               | en, zh    | 删除包含指定子字符串的单词                                          |

diff --git a/tests/ops/mapper/test_remove_repeat_sentences_mapper.py b/tests/ops/mapper/test_remove_repeat_sentences_mapper.py
@@ -0,0 +1,57 @@
+import unittest
+
+from data_juicer.ops.mapper.remove_repeat_sentences_mapper import RemoveRepeatSentencesMapper
+
+
+class RemoveRepeatSentencesMapperTest(unittest.TestCase):
+
+    def _run_helper(self, samples, op):
+        for sample in samples:
+            result = op.process(sample)
+            self.assertEqual(result['text'], result['target'])
+
+    def test_text(self):
+
+        samples = [
+            {
+                'text': '今天天气真不错，阳光明媚，适合出去散步。小明说：“今天天气真不错，我们去海边吧。” 小红回答说：“好主意！” 但是，小李觉得：“今天天气真不错，我们去爬山吧。” 今天天气真不错，阳光明媚，适合出去散步。昨天下了一整天的雨，今天终于放晴了。昨天下了一整天的雨，今天终于放晴了。',
+                'target': '今天天气真不错，阳光明媚，适合出去散步。小明说：“今天天气真不错，我们去海边吧。” 小红回答说：“好主意！” 但是，小李觉得：“今天天气真不错，我们去爬山吧。”昨天下了一整天的雨，今天终于放晴了。',
+            }, {
+                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
+                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
+            }, {
+                'text': '''我很开心 。但是你不开心  。我很开心 。\n你好呀！我很开心 。我好的。你好呀！''',
+                'target': '''我很开心 。但是你不开心  。\n你好呀！我好的。'''
+            }, {
+                'text': '默认配置下，长度低于2的句子不会被去重。去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3',
+                'target': '默认配置下，长度低于2的句子不会被去重。去重？重。重...... 重! 1234？3215. 3. 3. 3'
+            }
+        ]
+
+        op = RemoveRepeatSentencesMapper()
+        self._run_helper(samples, op)
+
+    def test_text2(self):
+
+        samples = [
+            {
+                'text': 'Life is what happens when you\'re busy making other plans. John Lennon once said. Life is what happens when you\'re busy making other plans. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说，这句话引起了共鸣。',
+                'target': 'Life is what happens when you\'re busy making other plans. John Lennon once said. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说，这句话引起了共鸣。',
+            }, {
+                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
+                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
+            }, {
+                'text': '''我很开心 。但是你不开心  。我很开心 。\n你好呀！我很开心 。我好的。你好呀！''',
+                'target': '''我很开心 。但是你不开心  。\n你好呀！我好的。你好呀！'''
+            }, {
+                'text': '去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3',
+                'target': '去重？去重。去重！重。重...... 重! 1234？3215. 1234. 3. 3. 3'
+            }
+        ]
+
+        op = RemoveRepeatSentencesMapper(lowercase=True, ignore_special_character=False, min_repeat_sentence_length=5)
+        self._run_helper(samples, op)
+
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    unittest.main()