20 allowing for lexical exceptions in text extraction function #29

Merged
23 changes: 15 additions & 8 deletions notebooks/reuters-preprocess.ipynb
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
    "id": "57b8140a-8534-478a-a665-53928a0f700b",
    "metadata": {},
    "outputs": [],
@@ -141,20 +141,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 6,
    "id": "2ebed073-9c18-451c-86a8-27f04f7ab55e",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|█████████████████████████████| 1936485/1936485 [00:12<00:00, 158872.53it/s]\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "134332053ede44929e081893f48aaab0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1936485 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=True)"
+    "sentences = ppcs.extract_text(\"reuters.vrt\", lemmatize=True, min_freq=30, en=\"chunking\")"
    ]
   },
   {
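For context, here is a minimal sketch of how the updated notebook call could be combined with the new exceptions_path argument introduced in this PR. It assumes ppcs is the sinr.text.preprocess module imported in the notebook; the file name exceptions.txt and its contents are purely illustrative, not part of the PR.

```python
from sinr.text import preprocess as ppcs

# Hypothetical exceptions file: one surface form per line. Tokens found in this
# list are appended to the output verbatim (lower-cased first when
# lower_words=True) instead of going through the lemmatization/filtering branch.
with open("exceptions.txt", "w", encoding="utf-8") as f:
    f.write("covid-19\nopec\nc++\n")

# Same call as in the notebook, plus the new argument. en="chunking" keeps the
# raw token for named entities (the default); "tagging" would replace it with
# its entity label, "deleting" would drop it.
sentences = ppcs.extract_text(
    "reuters.vrt",
    exceptions_path="exceptions.txt",
    lemmatize=True,
    min_freq=30,
    en="chunking",
)
```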
74 changes: 37 additions & 37 deletions sinr/text/preprocess.py
@@ -137,10 +137,7 @@ def do_txt_to_vrt(self):
         corpus_opened.close()
         logger.info(f"VRT-style file written in {self.corpus_output.absolute()}")
 
-
-def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False,
-                 exclude_pos=[],
-                 en=True, min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
+def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3):
     """Extracts the text from a VRT corpus file.
 
     :param corpus_path: str
@@ -150,7 +147,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
     :param number: bool (Default value = False)
     :param punct: bool (Default value = False)
     :param exclude_pos: list (Default value = [])
-    :param en: bool (Default value = True)
+    :param en: str ("chunking", "tagging", "deleting") (Default value = "chunking")
     :param min_freq: int (Default value = 50)
     :param alpha: bool (Default value = True)
     :param exclude_en: list (Default value = [])
@@ -165,7 +162,19 @@
     pattern = re.compile(r"<text[^<>]*\"\>{1}")
     stop_words, number, punct, alpha = str(stop_words), str(number), str(punct), str(alpha)
     sentence = []
-
+
+    if en != "chunking" and en != "tagging" and en != "deleting" :
+        logger.info(f"No correct option for en was provided: {en} is not valid. en option was thus set to chunking")
+        en = "chunking"
+
+    if exceptions_path != None :
+        exceptions_file = open_corpus(exceptions_path)
+        exceptions = exceptions_file.read().splitlines()
+        if lower_words:
+            exceptions = [w.lower() for w in exceptions]
+    else :
+        exceptions = []
+
     for line in tqdm(text, total=len(text)):
         if line.startswith("<s>"):
             sentence = []
@@ -181,40 +190,32 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
             if bool(re.match('^\t\t', str(i))):
                 continue
             token, lemma, pos, ent_iob, ent_type, is_punct, is_stop, is_alpha, is_digit, like_num = line.split("\t")
-            if lemmatize:
-                if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (
-                        alpha == is_alpha or ent_type != "None"):
+            if lower_words:
+                token_ = token.lower()
+                lemma_ = lemma.lower()
+            else:
+                token_ = token
+                lemma_ = lemma
+            if not lemmatize:
+                lemma_ = token_
+            if token_ in exceptions :
+                sentence.append(token_)
+            else :
+                if stop_words == is_stop and is_punct == punct and is_digit == number and like_num == number and not pos in exclude_pos and not ent_type in exclude_en and (alpha == is_alpha or ent_type != "None"):
                     if exclude_en and ent_iob != "None":
                         pass
                     else:
-                        if lower_words:
-                            if ent_type != "None" and len(lemma) > 1:
-                                sentence.append(token) # sentence.append(lemma.lower())
-                                # print(lemma)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(lemma.lower())
-                        else:
-                            if ent_type != "None":
-                                sentence.append(token)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(lemma.lower())
+                        if ent_type != "None" and len(lemma_) > 1:
+                            if en == "chunking" :
+                                sentence.append(token_)
+                            elif en == "tagging" :
+                                sentence.append(ent_type)
+                            elif en == "deleting" :
+                                pass
+                        elif len(lemma) > min_length_word:
+                            sentence.append(lemma_)
                 else:
                     pass
-            else:
-                if stop_words == is_stop and is_punct == punct and is_digit == number and alpha == is_alpha and like_num == number and not pos in exclude_pos and not ent_type in exclude_en:
-                    if exclude_en and ent_iob != "None":
-                        pass
-                    else:
-                        if lower_words:
-                            if ent_type != "None" and len(token) > 1:
-                                sentence.append(token.lower()) # (token)
-                            elif len(token) > min_length_word:
-                                sentence.append(token.lower())
-                        else:
-                            if ent_type != "None":
-                                sentence.append(token) # (token)
-                            elif len(lemma) > min_length_word:
-                                sentence.append(token)
         else:
             continue
     if min_freq > 1:
@@ -224,8 +225,7 @@ def extract_text(corpus_path, lemmatize=True, stop_words=False, lower_words=True
         # line = corpus.readline().rstrip()
         # x+=1
     return out
-
-
+
 def open_corpus(corpus_path):
     """
 
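To make the behavioural change easier to see, here is a small self-contained sketch (not code from the PR) of the decision the rewritten loop now makes for a named-entity token, with the three en modes spelled out; the helper name handle_entity is hypothetical.

```python
def handle_entity(token_, ent_type, en="chunking"):
    """Sketch of what extract_text now appends for a named-entity token."""
    if en == "chunking":   # keep the surface form, e.g. "reuters"
        return token_
    if en == "tagging":    # replace it with its entity label, e.g. "ORG"
        return ent_type
    if en == "deleting":   # drop the token entirely
        return None
    # extract_text() logs a message and falls back to "chunking" for any other
    # value; the same fallback is mirrored here.
    return token_


for mode in ("chunking", "tagging", "deleting"):
    print(mode, "->", handle_entity("reuters", "ORG", en=mode))
# chunking -> reuters
# tagging -> ORG
# deleting -> None
```

Tokens listed in the exceptions file bypass this branch entirely and are kept as-is, which is the point of the new exceptions_path parameter.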