Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix export to local file #352

Merged
merged 6 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import tempfile
import urllib.request # Sending requests

from openfoodfacts_taxonomy_parser import normalizer # Normalizing tags
from openfoodfacts_taxonomy_parser import parser # Parser for taxonomies
from openfoodfacts_taxonomy_parser import unparser # Unparser for taxonomies
from openfoodfacts_taxonomy_parser import utils as parser_utils # Normalizing tags

from .exceptions import GithubBranchExistsError # Custom exceptions
from .exceptions import (
Expand Down Expand Up @@ -61,7 +61,7 @@ async def create_node(self, label, entry, main_language_code):
if label == "ENTRY":
# Normalizing new canonical tag
language_code, canonical_tag = entry.split(":", 1)
normalised_canonical_tag = normalizer.normalizing(canonical_tag, main_language_code)
normalised_canonical_tag = parser_utils.normalizing(canonical_tag, main_language_code)

# Reconstructing and updating the node ID
params["id"] = language_code + ":" + normalised_canonical_tag
Expand Down Expand Up @@ -224,7 +224,7 @@ def is_valid_branch_name(self):
"""
Helper function to check if a branch name is valid
"""
return normalizer.normalizing(self.branch_name, char="_") == self.branch_name
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

async def create_project(self, description):
"""
Expand Down Expand Up @@ -470,7 +470,9 @@ async def update_nodes(self, label, entry, new_node_keys):
keys_language_code = keys.split("_", 1)[1]
normalised_value = []
for values in new_node_keys[keys]:
normalised_value.append(normalizer.normalizing(values, keys_language_code))
normalised_value.append(
parser_utils.normalize_text(values, keys_language_code)
)
normalised_new_node_keys[keys] = normalised_value
normalised_new_node_keys["tags_ids_" + keys_language_code] = normalised_value
else:
Expand Down Expand Up @@ -556,7 +558,7 @@ async def full_text_search(self, text):
"""
# Escape special characters
normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text)
normalized_id_text = normalizer.normalizing(text)
normalized_id_text = parser_utils.normalize_text(text)

# If normalized text is empty, no searches are found
if normalized_text.strip() == "":
Expand Down
10 changes: 3 additions & 7 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from neo4j import GraphDatabase, Session, Transaction

from .logger import ParserConsoleLogger
from ..normalizer import normalizing
from ..utils import get_project_name, normalize_text
from .taxonomy_parser import (
NodeType,
PreviousLink,
Expand All @@ -26,10 +26,6 @@ def __init__(self, session: Session):
self.session = session
self.parser_logger = ParserConsoleLogger()

def _get_project_name(self, taxonomy_name: str, branch_name: str):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
if node_data.get_node_type() == NodeType.TEXT:
Expand Down Expand Up @@ -285,7 +281,7 @@ def _create_node_indexes(self, project_label: str):
self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
project_label = self._get_project_name(taxonomy_name, branch_name)
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes, then create node indexes to accelerate relationship creation, then create relationships
self._create_other_nodes(taxonomy.other_nodes, project_label)
self._create_entry_nodes(taxonomy.entry_nodes, project_label)
Expand All @@ -299,7 +295,7 @@ def __call__(self, filename: str, branch_name: str, taxonomy_name: str):
"""Process the file"""
start_time = timeit.default_timer()

branch_name = normalizing(branch_name, char="_")
branch_name = normalize_text(branch_name, char="_")
taxonomy_parser = TaxonomyParser()
taxonomy = taxonomy_parser.parse_file(filename, self.parser_logger)
self._write_to_database(taxonomy, taxonomy_name, branch_name)
Expand Down
14 changes: 5 additions & 9 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .logger import ParserConsoleLogger
from .exception import DuplicateIDError
from ..normalizer import normalizing
from ..utils import normalize_filename, normalize_text


class NodeType(str, Enum):
Expand Down Expand Up @@ -75,10 +75,6 @@ class TaxonomyParser:
def __init__(self):
self.parser_logger = ParserConsoleLogger()

def _normalized_filename(self, filename: str) -> str:
"""Add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
"""Generator to get the file line by line"""
with open(filename, "r", encoding="utf8") as file:
Expand Down Expand Up @@ -122,15 +118,15 @@ def _add_line(self, line: str) -> str:
"""
lc, line = line.split(":", 1)
new_line = lc + ":"
new_line += self._remove_stopwords(lc, normalizing(line, lc))
new_line += self._remove_stopwords(lc, normalize_text(line, lc))
return new_line

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
lc, line = line.split(":", 1)
new_line: list[str] = []
for word in line.split(","):
new_line.append(self._remove_stopwords(lc, normalizing(word, lc)))
new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
return lc, new_line

def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
Expand Down Expand Up @@ -290,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
tagsids_list = []
for word in line.split(","):
tags_list.append(word.strip())
word_normalized = self._remove_stopwords(lang, normalizing(word, lang))
word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
Expand Down Expand Up @@ -356,7 +352,7 @@ def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -
self.parser_logger = logger
"""Process the file into a Taxonomy object"""
start_time = timeit.default_timer()
filename = self._normalized_filename(filename)
filename = normalize_filename(filename)
taxonomy = self._create_taxonomy(filename)
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
self.parser_logger.info(
Expand Down
33 changes: 10 additions & 23 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from neo4j import GraphDatabase

from .normalizer import normalizing
from .utils import get_project_name, normalize_filename, normalize_text


class WriteTaxonomy:
Expand All @@ -12,20 +12,7 @@ class WriteTaxonomy:
def __init__(self, session):
self.session = session

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")

def get_project_name(self, taxonomy_name, branch_name):
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name

def create_multi_label(self, taxonomy_name, branch_name):
"""Create a combined label with taxonomy name and branch name"""
project_name = self.get_project_name(taxonomy_name, branch_name)
return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name)

def get_all_nodes(self, multi_label):
def get_all_nodes(self, project_label):
"""query the database and yield each node with its parents,
this function use the relationships between nodes"""
# This query first lists all the nodes in the "is_before" order
Expand All @@ -34,7 +21,7 @@ def get_all_nodes(self, multi_label):
# Note: OPTIONAL MATCH is used to return nodes without parents
query = f"""
MATCH path = ShortestPath(
(h:{multi_label}:TEXT)-[:is_before*]->(f:{multi_label}:TEXT)
(h:{project_label}:TEXT)-[:is_before*]->(f:{project_label}:TEXT)
)
WHERE h.id="__header__" AND f.id="__footer__"
WITH nodes(path) AS nodes, range(0, size(nodes(path))-1) AS indexes
Expand Down Expand Up @@ -92,9 +79,9 @@ def get_parents_lines(self, parents):
parent_id = parent["tags_" + lc][0]
yield "<" + lc + ":" + parent_id

def iter_lines(self, multi_label):
def iter_lines(self, project_label):
previous_block_id = ""
for node, parents in self.get_all_nodes(multi_label):
for node, parents in self.get_all_nodes(project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
# eventually add a blank line but in specific case
Expand Down Expand Up @@ -134,16 +121,16 @@ def iter_lines(self, multi_label):

def rewrite_file(self, filename, lines):
"""Write a .txt file with the given name"""
filename = self.normalized_filename(filename)
filename = normalize_filename(filename)
with open(filename, "w", encoding="utf8") as file:
for line in lines:
file.write(line + "\n")

def __call__(self, filename, branch_name, taxonomy_name):
filename = self.normalized_filename(filename)
branch_name = normalizing(branch_name, char="_")
multi_label = self.create_multi_label(taxonomy_name, branch_name)
lines = self.iter_lines(multi_label)
filename = normalize_filename(filename)
branch_name = normalize_text(branch_name, char="_")
project_label = get_project_name(taxonomy_name, branch_name)
lines = self.iter_lines(project_label)
self.rewrite_file(filename, lines)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""
String normalizer
"""
import re
import unicodedata

import unidecode


def normalizing(line: str, lang="default", char="-"):
def normalize_text(line: str, lang="default", char="-"):
"""Normalize a string depending on the language code"""
line = unicodedata.normalize("NFC", line)

Expand All @@ -33,3 +30,13 @@ def normalizing(line: str, lang="default", char="-"):
line = re.sub(r"-+", char, line)
line = line.strip(char)
return line


def normalize_filename(filename: str) -> str:
    """Add the .txt extension if it is missing in the filename.

    :param filename: a file name, with or without the ``.txt`` suffix
    :return: the file name, guaranteed to end in ``.txt``
    """
    # str.endswith handles names shorter than 4 characters correctly,
    # replacing the manual len/slice check with the idiomatic form.
    return filename if filename.endswith(".txt") else filename + ".txt"


def get_project_name(taxonomy_name: str, branch_name: str) -> str:
    """Create a project name for given branch and taxonomy.

    :param taxonomy_name: name of the taxonomy
    :param branch_name: name of the branch
    :return: the combined project label, prefixed with ``p_``
    """
    return f"p_{taxonomy_name}_{branch_name}"
7 changes: 3 additions & 4 deletions parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from openfoodfacts_taxonomy_parser import normalizer, parser
from openfoodfacts_taxonomy_parser import parser, utils

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")
Expand All @@ -17,8 +17,7 @@
],
)
def test_normalized_filename(filename: str, normalized_name: str):
taxonomy_parser = parser.TaxonomyParser()
assert taxonomy_parser._normalized_filename(filename) == normalized_name
assert utils.normalize_filename(filename) == normalized_name


def test_fileiter(neo4j):
Expand All @@ -40,4 +39,4 @@ def test_fileiter(neo4j):
],
)
def test_normalizing(text: str, normalized_text: str, lang: str):
assert normalizer.normalizing(text, lang) == normalized_text
assert utils.normalize_text(text, lang) == normalized_text