20 allowing for lexical exceptions in text extraction function #29

Merged
23 changes: 15 additions & 8 deletions notebooks/reuters-preprocess.ipynb
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
    "id": "57b8140a-8534-478a-a665-53928a0f700b",
    "metadata": {},
    "outputs": [],
@@ -141,20 +141,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 6,
    "id": "2ebed073-9c18-451c-86a8-27f04f7ab55e",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|█████████████████████████████| 1936485/1936485 [00:12<00:00, 158872.53it/s]\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "134332053ede44929e081893f48aaab0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1936485 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=True)"
+    "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=\"chunking\")"
    ]
   },
   {
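For context, here is a minimal sketch of how the updated notebook call could be combined with the new exceptions_path argument introduced in this PR. It assumes ppcs is the sinr.text.preprocess module imported in the notebook; the file name exceptions.txt and its contents are purely illustrative, not part of the PR.

```python
from sinr.text import preprocess as ppcs

# Hypothetical exceptions file: one surface form per line. Tokens found in this
# list are appended to the output verbatim (lower-cased first when
# lower_words=True) instead of going through the lemmatization/filtering branch.
with open("exceptions.txt", "w", encoding="utf-8") as f:
    f.write("covid-19\nopec\nc++\n")

# Same call as in the notebook, plus the new argument. en="chunking" keeps the
# raw token for named entities (the default); "tagging" would replace it with
# its entity label, "deleting" would drop it.
sentences = ppcs.extract_text(
    "reuters.vrt",
    exceptions_path="exceptions.txt",
    lemmatize=True,
    min_freq=30,
    en="chunking",
)
```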
74 changes: 37 additions & 37 deletions sinr/text/preprocess.py
@@ -137,10 +137,7 @@ def do_txt_to_vrt(self):
         corpus_opened.close()
         logger.info(f"VRT-style file written in {self.corpus_output.absolute()}")
 
-
-def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False,
-                 exclude_pos=[],
-                 en=True, min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
+def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
     """Extracts the text from a VRT corpus file.
 
     :param corpus_path: str
@@ -150,7 +147,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
     :param number: bool (Default value = False)
     :param punct: bool (Default value = False)
     :param exclude_pos: list (Default value = [])
-    :param en: bool (Default value = True)
+    :param en: str ("chunking", "tagging", "deleting") (Default value = "chunking")
     :param min_freq: int (Default value = 50)
     :param alpha: bool (Default value = True)
     :param exclude_en: list (Default value = [])
@@ -165,7 +162,19 @@
     pattern = re.compile(r"<text[^<>]*\"\>{1}")
     stop_words, number, punct, alpha = str(stop_words), str(number), str(punct), str(alpha)
     sentence = []
-
+
+    if en != "chunking" and en != "tagging" and en != "deleting" :
+        logger.info(f"No correct option for en was provided: {en} is not valid. en option was thus set to chunking")
+        en = "chunking"
+
+    if exceptions_path != None :
+        exceptions_file = open_corpus(exceptions_path)
+        exceptions = exceptions_file.read().splitlines()
+        if lower_words:
+            exceptions = [w.lower() for w in exceptions]
+    else :
+        exceptions = []
+
     for line in tqdm(text, total=len(text)):
         if line.startswith("<s>"):
             sentence = []
@@ -181,40 +190,32 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
             if bool(re.match('^\t\t', str(i))):
                 continue
             token, lemma, pos, ent_iob, ent_type, is_punct, is_stop, is_alpha, is_digit, like_num = line.split("\t")
-            if lemmatize:
-                if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (
-                        alpha == is_alpha or ent_type != "None"):
+            if lower_words:
+                token_ = token.lower()
+                lemma_ = lemma.lower()
+            else:
+                token_ = token
+                lemma_ = lemma
+            if not lemmatize:
+                lemma_ = token_
+            if token_ in exceptions :
+                sentence.append(token_)
+            else :
+                if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (alpha == is_alpha or ent_type != "None"):
                     if exclude_en and ent_iob != "None":
                         pass
                     else:
-                        if lower_words:
-                            if ent_type != "None" and len(lemma) > 1:
-                                sentence.append(token) # sentence.append(lemma.lower())
-                                # print(lemma)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(lemma.lower())
-                        else:
-                            if ent_type != "None":
-                                sentence.append(token)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(lemma.lower())
+                        if ent_type != "None" and len(lemma_) > 1:
+                            if en == "chunking" :
+                                sentence.append(token_)
+                            elif en == "tagging" :
+                                sentence.append(ent_type)
+                            elif en == "deleting" :
+                                pass
+                        elif len(lemma) > min_length_word:
+                            sentence.append(lemma_)
                 else:
                     pass
-            else:
-                if stop_words == is_stop and is_punct == punct and is_digit == number and alpha == is_alpha and like_num == number and not pos in exclude_pos and not ent_type in exclude_en:
-                    if exclude_en and ent_iob != "None":
-                        pass
-                    else:
-                        if lower_words:
-                            if ent_type != "None" and len(token) > 1:
-                                sentence.append(token.lower()) # (token)
-                            elif len(token) > min_length_word:
-                                sentence.append(token.lower())
-                        else:
-                            if ent_type != "None":
-                                sentence.append(token) # (token)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(token)
         else:
             continue
     if min_freq > 1:
@@ -224,8 +225,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
         # line = corpus.readline().rstrip()
         # x+=1
     return out
-
-
+
 def open_corpus(corpus_path):
     """
 
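To make the behavioural change easier to see, here is a small self-contained sketch (not code from the PR) of the decision the rewritten loop now makes for a named-entity token, with the three en modes spelled out; the helper name handle_entity is hypothetical.

```python
def handle_entity(token_, ent_type, en="chunking"):
    """Sketch of what extract_text now appends for a named-entity token."""
    if en == "chunking":   # keep the surface form, e.g. "reuters"
        return token_
    if en == "tagging":    # replace it with its entity label, e.g. "ORG"
        return ent_type
    if en == "deleting":   # drop the token entirely
        return None
    # extract_text() logs a message and falls back to "chunking" for any other
    # value; the same fallback is mirrored here.
    return token_


for mode in ("chunking", "tagging", "deleting"):
    print(mode, "->", handle_entity("reuters", "ORG", en=mode))
# chunking -> reuters
# tagging -> ORG
# deleting -> None
```

Tokens listed in the exceptions file bypass this branch entirely and are kept as-is, which is the point of the new exceptions_path parameter.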