Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix export to local file #352

Merged
merged 6 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import tempfile
import urllib.request # Sending requests

from openfoodfacts_taxonomy_parser import normalizer # Normalizing tags
from openfoodfacts_taxonomy_parser import parser # Parser for taxonomies
from openfoodfacts_taxonomy_parser import unparser # Unparser for taxonomies
from openfoodfacts_taxonomy_parser import utils as parser_utils # Normalizing tags

from .exceptions import GithubBranchExistsError # Custom exceptions
from .exceptions import (
Expand Down Expand Up @@ -61,7 +61,7 @@ async def create_node(self, label, entry, main_language_code):
if label == "ENTRY":
# Normalizing new canonical tag
language_code, canonical_tag = entry.split(":", 1)
normalised_canonical_tag = normalizer.normalizing(canonical_tag, main_language_code)
normalised_canonical_tag = parser_utils.normalizing(canonical_tag, main_language_code)

# Reconstructing and updating the node ID
params["id"] = language_code + ":" + normalised_canonical_tag
Expand Down Expand Up @@ -224,7 +224,7 @@ def is_valid_branch_name(self):
"""
Helper function to check if a branch name is valid
"""
return normalizer.normalizing(self.branch_name, char="_") == self.branch_name
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

async def create_project(self, description):
"""
Expand Down Expand Up @@ -470,7 +470,9 @@ async def update_nodes(self, label, entry, new_node_keys):
keys_language_code = keys.split("_", 1)[1]
normalised_value = []
for values in new_node_keys[keys]:
normalised_value.append(normalizer.normalizing(values, keys_language_code))
normalised_value.append(
parser_utils.normalize_text(values, keys_language_code)
)
normalised_new_node_keys[keys] = normalised_value
normalised_new_node_keys["tags_ids_" + keys_language_code] = normalised_value
else:
Expand Down Expand Up @@ -556,7 +558,7 @@ async def full_text_search(self, text):
"""
# Escape special characters
normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text)
normalized_id_text = normalizer.normalizing(text)
normalized_id_text = parser_utils.normalize_text(text)

# If normalized text is empty, no searches are found
if normalized_text.strip() == "":
Expand Down
10 changes: 3 additions & 7 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from neo4j import GraphDatabase, Session, Transaction

from .logger import ParserConsoleLogger
from ..normalizer import normalizing
from ..utils import get_project_name, normalize_text
from .taxonomy_parser import (
NodeType,
PreviousLink,
Expand All @@ -26,10 +26,6 @@ def __init__(self, session: Session):
self.session = session
self.parser_logger = ParserConsoleLogger()

def _get_project_name(self, taxonomy_name: str, branch_name: str):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
if node_data.get_node_type() == NodeType.TEXT:
Expand Down Expand Up @@ -285,7 +281,7 @@ def _create_node_indexes(self, project_label: str):
self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
project_label = self._get_project_name(taxonomy_name, branch_name)
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes, then create node indexes to accelerate relationship creation, then create relationships
self._create_other_nodes(taxonomy.other_nodes, project_label)
self._create_entry_nodes(taxonomy.entry_nodes, project_label)
Expand All @@ -299,7 +295,7 @@ def __call__(self, filename: str, branch_name: str, taxonomy_name: str):
"""Process the file"""
start_time = timeit.default_timer()

branch_name = normalizing(branch_name, char="_")
branch_name = normalize_text(branch_name, char="_")
taxonomy_parser = TaxonomyParser()
taxonomy = taxonomy_parser.parse_file(filename, self.parser_logger)
self._write_to_database(taxonomy, taxonomy_name, branch_name)
Expand Down
14 changes: 5 additions & 9 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .logger import ParserConsoleLogger
from .exception import DuplicateIDError
from ..normalizer import normalizing
from ..utils import normalize_filename, normalize_text


class NodeType(str, Enum):
Expand Down Expand Up @@ -75,10 +75,6 @@ class TaxonomyParser:
def __init__(self):
self.parser_logger = ParserConsoleLogger()

def _normalized_filename(self, filename: str) -> str:
"""Add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
"""Generator to get the file line by line"""
with open(filename, "r", encoding="utf8") as file:
Expand Down Expand Up @@ -122,15 +118,15 @@ def _add_line(self, line: str) -> str:
"""
lc, line = line.split(":", 1)
new_line = lc + ":"
new_line += self._remove_stopwords(lc, normalizing(line, lc))
new_line += self._remove_stopwords(lc, normalize_text(line, lc))
return new_line

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
lc, line = line.split(":", 1)
new_line: list[str] = []
for word in line.split(","):
new_line.append(self._remove_stopwords(lc, normalizing(word, lc)))
new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
return lc, new_line

def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
Expand Down Expand Up @@ -290,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
tagsids_list = []
for word in line.split(","):
tags_list.append(word.strip())
word_normalized = self._remove_stopwords(lang, normalizing(word, lang))
word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
Expand Down Expand Up @@ -356,7 +352,7 @@ def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -
self.parser_logger = logger
"""Process the file into a Taxonomy object"""
start_time = timeit.default_timer()
filename = self._normalized_filename(filename)
filename = normalize_filename(filename)
taxonomy = self._create_taxonomy(filename)
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
self.parser_logger.info(
Expand Down
33 changes: 10 additions & 23 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from neo4j import GraphDatabase

from .normalizer import normalizing
from .utils import get_project_name, normalize_filename, normalize_text


class WriteTaxonomy:
Expand All @@ -12,20 +12,7 @@ class WriteTaxonomy:
def __init__(self, session):
self.session = session

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def get_project_name(self, taxonomy_name, branch_name):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def create_multi_label(self, taxonomy_name, branch_name):
"""Create a combined label with taxonomy name and branch name"""
project_name = self.get_project_name(taxonomy_name, branch_name)
return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name)

def get_all_nodes(self, multi_label):
def get_all_nodes(self, project_label):
"""query the database and yield each node with its parents,
this function use the relationships between nodes"""
# This query first lists all the nodes in the "is_before" order
Expand All @@ -34,7 +21,7 @@ def get_all_nodes(self, multi_label):
# Note: OPTIONAL MATCH is used to return nodes without parents
query = f"""
MATCH path = ShortestPath(
(h:{multi_label}:TEXT)-[:is_before*]->(f:{multi_label}:TEXT)
(h:{project_label}:TEXT)-[:is_before*]->(f:{project_label}:TEXT)
)
WHERE h.id="__header__" AND f.id="__footer__"
WITH nodes(path) AS nodes, range(0, size(nodes(path))-1) AS indexes
Expand Down Expand Up @@ -92,9 +79,9 @@ def get_parents_lines(self, parents):
parent_id = parent["tags_" + lc][0]
yield "<" + lc + ":" + parent_id

def iter_lines(self, multi_label):
def iter_lines(self, project_label):
previous_block_id = ""
for node, parents in self.get_all_nodes(multi_label):
for node, parents in self.get_all_nodes(project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
# eventually add a blank line but in specific case
Expand Down Expand Up @@ -134,16 +121,16 @@ def iter_lines(self, multi_label):

def rewrite_file(self, filename, lines):
"""Write a .txt file with the given name"""
filename = self.normalized_filename(filename)
filename = normalize_filename(filename)
with open(filename, "w", encoding="utf8") as file:
for line in lines:
file.write(line + "\n")

def __call__(self, filename, branch_name, taxonomy_name):
filename = self.normalized_filename(filename)
branch_name = normalizing(branch_name, char="_")
multi_label = self.create_multi_label(taxonomy_name, branch_name)
lines = self.iter_lines(multi_label)
filename = normalize_filename(filename)
branch_name = normalize_text(branch_name, char="_")
project_label = get_project_name(taxonomy_name, branch_name)
lines = self.iter_lines(project_label)
self.rewrite_file(filename, lines)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
String normalizer
"""
import re
import unicodedata

import unidecode


def normalizing(line: str, lang="default", char="-"):
def normalize_text(line: str, lang="default", char="-"):
"""Normalize a string depending on the language code"""
line = unicodedata.normalize("NFC", line)

Expand All @@ -33,3 +30,13 @@ def normalizing(line: str, lang="default", char="-"):
line = re.sub(r"-+", char, line)
line = line.strip(char)
return line


def normalize_filename(filename: str) -> str:
    """Add the .txt extension if it is missing in the filename.

    :param filename: a file name, with or without the ``.txt`` suffix
    :return: the file name, guaranteed to end in ``.txt``
    """
    # str.endswith handles names shorter than 4 characters correctly,
    # replacing the manual len/slice check with the idiomatic form.
    return filename if filename.endswith(".txt") else filename + ".txt"


def get_project_name(taxonomy_name: str, branch_name: str) -> str:
    """Create a project name for given branch and taxonomy.

    :param taxonomy_name: name of the taxonomy
    :param branch_name: name of the branch
    :return: the combined project label, prefixed with ``p_``
    """
    return f"p_{taxonomy_name}_{branch_name}"
7 changes: 3 additions & 4 deletions parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from openfoodfacts_taxonomy_parser import normalizer, parser
from openfoodfacts_taxonomy_parser import parser, utils

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")
Expand All @@ -17,8 +17,7 @@
],
)
def test_normalized_filename(filename: str, normalized_name: str):
taxonomy_parser = parser.TaxonomyParser()
assert taxonomy_parser._normalized_filename(filename) == normalized_name
assert utils.normalize_filename(filename) == normalized_name


def test_fileiter(neo4j):
Expand All @@ -40,4 +39,4 @@ def test_fileiter(neo4j):
],
)
def test_normalizing(text: str, normalized_text: str, lang: str):
assert normalizer.normalizing(text, lang) == normalized_text
assert utils.normalize_text(text, lang) == normalized_text