Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add remove_repeat_sentences_mapper #149

Merged
merged 21 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ process:
keep_alphabet: true # whether to keep alphabet
keep_number: true # whether to keep number
keep_punc: true # whether to keep punctuation
- remove_repeat_sentences_mapper: # remove repeated sentences in text samples.
lowercase: false # whether to convert sample text to lower case
ignore_special_character: true # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, letters and numbers
min_repeat_sentence_length: 2 # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
Expand Down
3 changes: 2 additions & 1 deletion data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
punctuation_normalization_mapper, remove_bibliography_mapper,
remove_comments_mapper, remove_header_mapper,
remove_long_words_mapper, remove_non_chinese_character_mapper,
remove_specific_chars_mapper, remove_table_text_mapper,
remove_repeat_sentences_mapper, remove_specific_chars_mapper,
remove_table_text_mapper,
remove_words_with_incorrect_substrings_mapper,
sentence_split_mapper, whitespace_normalization_mapper)
71 changes: 71 additions & 0 deletions data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import regex as re

from ..base_op import OPERATORS, Mapper


def split_sentence(text):
    """Split ``text`` into sentences at sentence-ending punctuation.

    A sentence ends after a maximal run of terminators -- ``.``, ``。``,
    ``!``, ``?``, full-width ``!``/``?`` (U+FF01/U+FF1F) or a doubled
    ellipsis ``……`` -- together with any closing curly quotes (``’`` or
    ``”``) that directly follow the run. A single ``…`` ends a sentence
    only when it is immediately followed by a closing quote.

    The split is lossless: a newline is inserted at each boundary and the
    text is then split on newlines, so for input without newlines
    ``''.join(result) == text``. Whitespace following a terminator stays at
    the start of the next sentence. Newlines already present in ``text``
    also act as boundaries.

    :param text: the text to split
    :return: list of sentence strings (``['']`` for empty input)
    """
    # Single-character terminators; the \uXXXX escapes are the full-width
    # exclamation and question marks used in Chinese text.
    terminator = r'[.。!?\uff01\uff1f]'
    # Group 1 is the sentence ending. The lookahead requires a following
    # character that is neither a closing quote nor another terminator, so
    # a run such as "......" or "?!" is never cut in the middle and a
    # closing quote always stays attached to the sentence it closes.
    boundary = (r'((?:' + terminator + r'|…{2})+[’”]*|…[’”]+)'
                r'(?=[^’”.。!?\uff01\uff1f])')
    return re.sub(boundary, r'\1\n', text).split('\n')


@OPERATORS.register_module('remove_repeat_sentences_mapper')
class RemoveRepeatSentencesMapper(Mapper):
    """Mapper that drops sentences already seen earlier in the same sample."""

    def __init__(self,
                 lowercase: bool = False,
                 ignore_special_character: bool = True,
                 min_repeat_sentence_length: int = 2,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param lowercase: Whether sentences are lower-cased before being
            compared with each other.
        :param ignore_special_character: Whether special characters are
            ignored when comparing sentences. Special characters are
            everything except ASCII letters, digits, Chinese characters
            (U+4E00-U+9FA5), newline, tab and space.
        :param min_repeat_sentence_length: Sentences whose comparison form
            is shorter than this length are never removed. When
            ignore_special_character is True, special characters do not
            count towards this length.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.lowercase = lowercase
        self.min_repeat_sentence_length = min_repeat_sentence_length
        # Pattern of characters stripped from the comparison form, or None
        # when sentences are compared with their punctuation intact.
        if ignore_special_character:
            self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]')
        else:
            self.remove_regex = None

    def process(self, sample):
        """Remove repeated sentences from ``sample[self.text_key]`` in place.

        The text is handled line by line; every line is split into
        sentences and a sentence is kept only the first time its
        comparison form occurs anywhere in the sample. Line breaks are
        preserved, and kept sentences are re-joined without a separator.

        :param sample: the sample to edit; its text field is overwritten
        :return: the same sample object
        """
        seen_keys = set()
        rebuilt_lines = []
        for line in sample[self.text_key].split('\n'):
            kept = []
            # An empty line is not split; it is emitted as an empty line.
            for sentence in (split_sentence(line) if line else ()):
                # Build the form used for comparison; the sentence itself
                # is emitted unmodified.
                key = sentence.strip()
                if self.lowercase:
                    key = key.lower()
                if self.remove_regex:
                    key = self.remove_regex.sub('', key)

                # Only sufficiently long sentences take part in
                # deduplication; shorter ones are always kept.
                if len(key) >= self.min_repeat_sentence_length:
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                kept.append(sentence)
            rebuilt_lines.append(''.join(kept))

        sample[self.text_key] = '\n'.join(rebuilt_lines)
        return sample
3 changes: 2 additions & 1 deletion docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ The operators in Data-Juicer are categorized into 5 types.
| Type | Number | Description |
|-----------------------------------|:------:|-------------------------------------------------|
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 21 | Edits and transforms samples |
| [ Mapper ]( #mapper ) | 22 | Edits and transforms samples |
| [ Filter ]( #filter ) | 24 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 4 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |
Expand Down Expand Up @@ -63,6 +63,7 @@ All the specific operators are listed below, each featured with several capabili
| remove_header_mapper | LaTeX | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names |
| remove_long_words_mapper | General | en, zh | Removes words with length outside the specified range |
| remove_non_chinese_character_mapper | General | en, zh | Remove non Chinese character in text samples. |
| remove_repeat_sentences_mapper | General | en, zh | Removes repeated sentences in text samples |
| remove_specific_chars_mapper | General | en, zh | Removes any user-specified characters or substrings |
| remove_table_text_mapper | General, Financial | en | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) |
| remove_words_with_incorrect_<br />substrings_mapper | General | en, zh | Removes words containing specified substrings |
Expand Down
5 changes: 3 additions & 2 deletions docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| 类型 | 数量 | 描述 |
|------------------------------------|:--:|---------------|
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 21 | 对数据样本进行编辑和转换 |
| [ Mapper ]( #mapper ) | 22 | 对数据样本进行编辑和转换 |
HYLcool marked this conversation as resolved.
Show resolved Hide resolved
| [ Filter ]( #filter ) | 24 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 4 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |
Expand Down Expand Up @@ -60,7 +60,8 @@ Data-Juicer 中的算子分为以下 5 种类型。
| remove_comments_mapper | LaTeX | en, zh | 删除 TeX 文档中的注释 |
| remove_header_mapper | LaTeX | en, zh | 删除 TeX 文档头,例如标题、章节数字/名称等 |
| remove_long_words_mapper | General | en, zh | 删除长度超出指定范围的单词 |
| remove_non_chinese_character_mapper | General | en, zh | 删除样本中的非中文字符
| remove_non_chinese_character_mapper | General | en, zh | 删除样本中的非中文字符 |
| remove_repeat_sentences_mapper | General | en, zh | 删除样本中的重复句子 |
| remove_specific_chars_mapper | General | en, zh | 删除任何用户指定的字符或子字符串 |
| remove_table_text_mapper | General, Financial | en | 检测并删除可能的表格内容(:warning: 依赖正则表达式匹配,因此很脆弱) |
| remove_words_with_incorrect_<br />substrings_mapper | General | en, zh | 删除包含指定子字符串的单词 |
Expand Down
57 changes: 57 additions & 0 deletions tests/ops/mapper/test_remove_repeat_sentences_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import unittest

from data_juicer.ops.mapper.remove_repeat_sentences_mapper import RemoveRepeatSentencesMapper


class RemoveRepeatSentencesMapperTest(unittest.TestCase):
    """Checks RemoveRepeatSentencesMapper output against expected texts."""

    def _run_helper(self, samples, op):
        # Each sample carries its input under 'text' and the expected
        # output under 'target'; the op rewrites 'text' in place.
        for case in samples:
            processed = op.process(case)
            self.assertEqual(processed['text'], processed['target'])

    def test_text(self):
        # Default configuration: special characters are ignored and
        # sentences shorter than 2 characters are never deduplicated.
        mapper = RemoveRepeatSentencesMapper()

        cases = [
            {
                'text': '今天天气真不错,阳光明媚,适合出去散步。小明说:“今天天气真不错,我们去海边吧。” 小红回答说:“好主意!” 但是,小李觉得:“今天天气真不错,我们去爬山吧。” 今天天气真不错,阳光明媚,适合出去散步。昨天下了一整天的雨,今天终于放晴了。昨天下了一整天的雨,今天终于放晴了。',
                'target': '今天天气真不错,阳光明媚,适合出去散步。小明说:“今天天气真不错,我们去海边吧。” 小红回答说:“好主意!” 但是,小李觉得:“今天天气真不错,我们去爬山吧。”昨天下了一整天的雨,今天终于放晴了。',
            }, {
                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
            }, {
                'text': '''我很开心 。但是你不开心 。我很开心 。\n你好呀!我很开心 。我好的。你好呀!''',
                'target': '''我很开心 。但是你不开心 。\n你好呀!我好的。'''
            }, {
                'text': '默认配置下,长度低于2的句子不会被去重。去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3',
                'target': '默认配置下,长度低于2的句子不会被去重。去重?重。重...... 重! 1234?3215. 3. 3. 3'
            }
        ]

        self._run_helper(cases, mapper)

    def test_text2(self):
        # Case-insensitive comparison that keeps punctuation significant
        # and leaves sentences shorter than 5 characters untouched.
        mapper = RemoveRepeatSentencesMapper(lowercase=True,
                                             ignore_special_character=False,
                                             min_repeat_sentence_length=5)

        cases = [
            {
                'text': 'Life is what happens when you\'re busy making other plans. John Lennon once said. Life is what happens when you\'re busy making other plans. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说,这句话引起了共鸣。',
                'target': 'Life is what happens when you\'re busy making other plans. John Lennon once said. This phrase has resonated with many people over the years. 人生就是当你忙于制定其他计划时发生的事情。对很多人来说,这句话引起了共鸣。',
            }, {
                'text': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? The quick brown fox jumps over the lazy dog. Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? "Let\'s seize the day," Tom exclaimed, full of enthusiasm. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.',
                'target': 'The quick brown fox jumps over the lazy dog. Isn\'t it amazing how a simple sentence can contain every letter of the alphabet? Speaking of weather, yesterday was quite dreary; however, today is absolutely delightful. "Let\'s seize the day," Tom exclaimed, full of enthusiasm.'
            }, {
                'text': '''我很开心 。但是你不开心 。我很开心 。\n你好呀!我很开心 。我好的。你好呀!''',
                'target': '''我很开心 。但是你不开心 。\n你好呀!我好的。你好呀!'''
            }, {
                'text': '去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3',
                'target': '去重?去重。去重!重。重...... 重! 1234?3215. 1234. 3. 3. 3'
            }
        ]

        self._run_helper(cases, mapper)


# Run the test cases when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()