
Commit

Allowing for lexical exceptions in text extraction function (#29)
* Similarity evaluation from txt file and dataset

* Rename evaluation to evaluate

* Similarity with MEN and WS353 datasets

* Similarity unit tests

* Sparsify and binarize SINrVectors

* Notebook: similarity + sparsify + binarize

* Add new path to oanc SINrVectors

* Add oanc model

* Allow for exceptions in preprocess filtering + modify the named-entity options to choose between chunking, tagging and deleting (see the usage sketch below)

* Rename function to match

* Push the correctly refactored function to take the exception list and lowercasing into account

* Delete deprecated tests

* Delete oanc model

---------

Co-authored-by: Beranger Anna <aberanger@lst.clusterlst.univ-lemans.fr>
Co-authored-by: Anna Beranger <anbberanger@gmail.com>
Co-authored-by: simon.guillot@univ-lemans.fr <sguillot@lst.clusterlst.univ-lemans.fr>
4 people authored and thibaultprouteau committed Jul 24, 2023
1 parent 34e5e6d commit 00eafc4
Showing 2 changed files with 52 additions and 45 deletions.
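For readers of the diff below, here is a minimal usage sketch of the refactored function. It is inferred from the new extract_text signature in sinr/text/preprocess.py; the import alias, the corpus path and the exceptions file name are assumptions for illustration, not part of this commit.

    # Hedged sketch of the new API; file names and the import alias are hypothetical.
    from sinr.text import preprocess as ppcs

    # en selects how named entities are handled:
    #   "chunking" keeps the surface token, "tagging" substitutes the entity label,
    #   "deleting" drops the entity altogether.
    sentences = ppcs.extract_text(
        "reuters.vrt",                     # VRT corpus, e.g. produced by do_txt_to_vrt
        exceptions_path="exceptions.txt",  # one form per line; these tokens bypass the filters
        lemmatize=True,
        min_freq=30,
        en="chunking",
    )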
23 changes: 15 additions & 8 deletions notebooks/reuters-preprocess.ipynb
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 15,
"id": "57b8140a-8534-478a-a665-53928a0f700b",
"metadata": {},
"outputs": [],
@@ -141,20 +141,27 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 6,
"id": "2ebed073-9c18-451c-86a8-27f04f7ab55e",
"metadata": {},
"outputs": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|█████████████████████████████| 1936485/1936485 [00:12<00:00, 158872.53it/s]\n"
- ]
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "134332053ede44929e081893f48aaab0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%|          | 0/1936485 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=True)"
+ "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=\"chunking\")"
]
},
{
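For context, a hedged sketch of what the three en options do with a named-entity token, mirroring the branch added in preprocess.py below; the token and entity label are invented for illustration, and this toy function is not the library's code.

    # Toy reproduction of the entity-handling branch.
    def handle_entity(token_, ent_type, en="chunking"):
        if en == "chunking":    # keep the surface form of the entity
            return token_
        elif en == "tagging":   # replace the entity by its label (e.g. ORG)
            return ent_type
        elif en == "deleting":  # drop the entity from the sentence
            return None

    print(handle_entity("reuters", "ORG", en="chunking"))  # -> "reuters"
    print(handle_entity("reuters", "ORG", en="tagging"))   # -> "ORG"
    print(handle_entity("reuters", "ORG", en="deleting"))  # -> None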
74 changes: 37 additions & 37 deletions sinr/text/preprocess.py
@@ -137,10 +137,7 @@ def do_txt_to_vrt(self):
corpus_opened.close()
logger.info(f"VRT-style file written in {self.corpus_output.absolute()}")


- def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False,
- exclude_pos=[],
- en=True, min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
+ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
"""Extracts the text from a VRT corpus file.
:param corpus_path: str
@@ -150,7 +147,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
:param number: bool (Default value = False)
:param punct: bool (Default value = False)
:param exclude_pos: list (Default value = [])
- :param en: bool (Default value = True)
+ :param en: str ("chunking", "tagging", "deleting") (Default value = "chunking")
:param min_freq: int (Default value = 50)
:param alpha: bool (Default value = True)
:param exclude_en: list (Default value = [])
@@ -165,7 +162,19 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
pattern = re.compile(r"<text[^<>]*\"\>{1}")
stop_words, number, punct, alpha = str(stop_words), str(number), str(punct), str(alpha)
sentence = []


+ if en != "chunking" and en != "tagging" and en != "deleting" :
+ logger.info(f"No correct option for en was provided: {en} is not valid. en option was thus set to chunking")
+ en = "chunking"

+ if exceptions_path != None :
+ exceptions_file = open_corpus(exceptions_path)
+ exceptions = exceptions_file.read().splitlines()
+ if lower_words:
+ exceptions = [w.lower() for w in exceptions]
+ else :
+ exceptions = []

for line in tqdm(text, total=len(text)):
if line.startswith("<s>"):
sentence = []
@@ -181,40 +190,32 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
if bool(re.match('^\t\t', str(i))):
continue
token, lemma, pos, ent_iob, ent_type, is_punct, is_stop, is_alpha, is_digit, like_num = line.split("\t")
- if lemmatize:
- if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (
- alpha == is_alpha or ent_type != "None"):
+ if lower_words:
+ token_ = token.lower()
+ lemma_ = lemma.lower()
+ else:
+ token_ = token
+ lemma_ = lemma
+ if not lemmatize:
+ lemma_ = token_
+ if token_ in exceptions :
+ sentence.append(token_)
+ else :
+ if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (alpha == is_alpha or ent_type != "None"):
if exclude_en and ent_iob != "None":
pass
else:
- if lower_words:
- if ent_type != "None" and len(lemma) > 1:
- sentence.append(token) # sentence.append(lemma.lower())
- # print(lemma)
- elif len(lemma) > min_length_word:
- sentence.append(lemma.lower())
- else:
- if ent_type != "None":
- sentence.append(token)
- elif len(lemma) > min_length_word:
- sentence.append(lemma.lower())
+ if ent_type != "None" and len(lemma_) > 1:
+ if en == "chunking" :
+ sentence.append(token_)
+ elif en == "tagging" :
+ sentence.append(ent_type)
+ elif en == "deleting" :
+ pass
+ elif len(lemma) > min_length_word:
+ sentence.append(lemma_)
+ else:
+ pass
- else:
- if stop_words == is_stop and is_punct == punct and is_digit == number and alpha == is_alpha and like_num == number and not pos in exclude_pos and not ent_type in exclude_en:
- if exclude_en and ent_iob != "None":
- pass
- else:
- if lower_words:
- if ent_type != "None" and len(token) > 1:
- sentence.append(token.lower()) # (token)
- elif len(token) > min_length_word:
- sentence.append(token.lower())
- else:
- if ent_type != "None":
- sentence.append(token) # (token)
- elif len(lemma) > min_length_word:
- sentence.append(token)
else:
continue
if min_freq > 1:
@@ -224,8 +225,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
# line = corpus.readline().rstrip()
# x+=1
return out



def open_corpus(corpus_path):
"""
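Finally, a hedged sketch of how the exception list is expected to be prepared and loaded, mirroring the block added above. The file name and its contents are made up, and a plain open() stands in for the library's open_corpus() helper.

    # Illustration only: build a small exception file and load it the way extract_text does.
    exceptions_path = "exceptions.txt"          # hypothetical file, one form per line
    with open(exceptions_path, "w", encoding="utf-8") as f:
        f.write("IBM\nWall Street\n")

    lower_words = True
    with open(exceptions_path, "r", encoding="utf-8") as exceptions_file:
        exceptions = exceptions_file.read().splitlines()
    if lower_words:                             # exceptions are lowercased when tokens are
        exceptions = [w.lower() for w in exceptions]

    # During extraction, a token whose (lowercased) form appears in `exceptions`
    # is appended to the sentence directly, bypassing the other filters.
    print(exceptions)                           # -> ['ibm', 'wall street']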
