generated from OpenPecha/python-package-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
180 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
import re | ||
from typing import Dict, List, Union | ||
|
||
|
||
def generate_schema(
    en_book: List[Dict], bo_book: List[Dict], en_key: str = "", bo_key: str = ""
) -> List:
    """Build the index schema nodes for a parallel English/Tibetan text.

    The two inputs are walked in lockstep (``zip`` over their items), so they
    are expected to have the same structure and key order. Both "complex"
    texts (nested dicts of sections) and "simple" texts (a plain list under
    ``"content"``) are handled.

    Args:
        en_book: English book, or a subtree of it on recursive calls. If the
            Tibetan book has a ``"content"`` key, the text is read from
            ``"content"`` on both books.
        bo_book: Tibetan book or subtree, parallel to ``en_book``.
        en_key: Key of the English parent section. Kept for backward
            compatibility; it no longer influences the output.
        bo_key: Key of the Tibetan parent section. Kept for backward
            compatibility; it no longer influences the output.

    Returns:
        A list of schema node dicts. Sections that have sub-sections become
        nodes with a ``"nodes"`` list; sections holding only ``"data"``
        become the node returned by ``create_data_node``. The Tibetan title
        is emitted under the ``"he"`` language code.
    """
    # A whole book wraps its text in "content"; recursive calls pass the
    # bare subtree.
    if "content" in bo_book:
        botext = bo_book["content"]
        entext = en_book["content"]
    else:
        botext = bo_book
        entext = en_book

    nodes = []
    if isinstance(entext, dict):
        for (enkey, envalue), (bokey, bovalue) in zip(entext.items(), botext.items()):
            # Strip keys at every level. Previously top-level keys were left
            # unstripped while nested ones were stripped, so schema keys could
            # disagree with the stripped paths built by generate_chapters.
            en_full_key = enkey.strip()
            bo_full_key = bokey.strip()

            if isinstance(envalue, dict) and enkey != "data":
                has_children = any(sub_key != "data" for sub_key in envalue)
                if not has_children:
                    # Leaf section: only "data" is present, so no recursion
                    # is needed (its result used to be computed and dropped).
                    nodes.append(
                        create_data_node(
                            en_full_key, bo_full_key, envalue["data"], bovalue["data"]
                        )
                    )
                    continue

                nodes.append(
                    {
                        "nodes": generate_schema(
                            envalue, bovalue, en_full_key, bo_full_key
                        ),
                        "titles": [
                            {"lang": "he", "text": bo_full_key, "primary": True},
                            {"lang": "en", "text": en_full_key, "primary": True},
                        ],
                        "key": en_full_key,
                    }
                )
            elif enkey == "data":
                # "data" sitting next to sub-sections gets its own node with a
                # fixed Tibetan title.
                nodes.append(create_data_node(enkey, "གནས་བབས", envalue, bovalue))
    elif isinstance(entext, list):
        # Simple text: a single node titled after the book itself.
        nodes.append(
            create_data_node(en_book["title"], bo_book["title"], entext, botext)
        )
    return nodes
|
||
|
||
def create_data_node(
    en_key: str,
    bo_key: str,
    envalue: Union[List[str], List[List]],
    bovalue: Union[List[str], List[List]],
) -> Dict:
    """Build a ``JaggedArrayNode`` schema node for one section of text.

    Args:
        en_key: English title; also used as the node's ``"key"``.
        bo_key: Tibetan title (emitted under the ``"he"`` language code).
        envalue: English text as a possibly nested list.
        bovalue: Tibetan text as a possibly nested list.

    Returns:
        The node dict. ``"depth"`` is the nesting depth of the text, and
        ``"sections"`` / ``"addressTypes"`` hold one entry per level. Only
        three section names are defined, so those two lists are capped at
        three entries even if the depth is larger.
    """
    sections = ["Chapters", "Verses", "Paragraphs"]

    # The Tibetan text decides the depth. The English text is only consulted
    # when the Tibetan text is empty. Previously the English depth was
    # computed and then always overwritten, so an English-only section was
    # reported with depth 1 whatever its nesting.
    depth_source = bovalue if len(bovalue) > 0 else envalue
    text_depth = get_list_depth(depth_source) if len(depth_source) > 0 else 1

    section_names = sections[:text_depth]
    return {
        "nodeType": "JaggedArrayNode",
        "depth": text_depth,
        "addressTypes": ["Integer"] * len(section_names),
        "sections": section_names,
        "titles": [
            {"lang": "he", "text": bo_key, "primary": True},
            {"lang": "en", "text": en_key, "primary": True},
        ],
        "key": en_key,
    }
|
||
|
||
def parse_annotation(value: Union[List[str], List[List]]):
    """Convert annotation markup in a (possibly nested) text into HTML.

    Lists are processed element by element and keep their shape; anything
    that is neither a list nor a string is converted with ``str`` first.
    For each string, in order:

    1. newlines become ``<br>``;
    2. ``<sapche>``, ``{`` and ``(`` become opening ``<span>`` tags and the
       matching closers become ``</span>``; a closer is only rewritten
       when its opener occurs in the string;
    3. surrounding whitespace is stripped and numbered tags such as
       ``<12>`` are removed.
    """
    # (opening marker, replacement for it, closing marker)
    paired_markup = (
        ("<sapche>", '<span class="text-subche-style">', "</sapche>"),
        ("{", '<span class="text-citation-style">', "}"),
        ("(", '<span class="text-quotation-style">', ")"),
    )

    def _clean(node):
        if isinstance(node, list):
            return [_clean(child) for child in node]

        text = node if isinstance(node, str) else str(node)
        text = text.replace("\n", "<br>")

        for opener, span_open, closer in paired_markup:
            if opener not in text:
                continue
            text = text.replace(opener, span_open).replace(closer, "</span>")

        return re.sub(r"<\d+>", "", text.strip())

    return _clean(value)
|
||
|
||
def generate_chapters(
    book: List[Dict],
    language: str,
    current_key: str = "",
    parent_keys: List[str] = None,
) -> Dict:
    """Flatten a nested book into ``{section path: cleaned text}``.

    ``book`` is iterated with ``.items()``; each dict-valued entry is a
    section, and non-dict values are ignored. A section's path is the
    stripped keys of its ancestors and itself joined by ``", "``. Text
    found under a section's ``"data"`` key is cleaned with
    ``parse_annotation``. When the section also has other children, the
    path gets a trailing ``"data"`` component (``"གནས་བབས"`` when
    ``language`` is ``"bo"``).

    Args:
        book: Mapping of section key to subtree.
        language: ``"bo"`` selects the Tibetan data label; any other value
            selects ``"data"``.
        current_key: Key of the parent section; passed along on recursion
            but not used to build paths.
        parent_keys: Stripped keys of all ancestors; ``None`` means root.

    Returns:
        Dict mapping each section path to its cleaned text.
    """
    ancestors = [] if parent_keys is None else parent_keys
    data_suffix = ", གནས་བབས" if language == "bo" else ", data"
    flattened = {}

    for title, subtree in book.items():
        path = ancestors + [title.strip()]
        if not isinstance(subtree, dict):
            continue

        # Descend first; this section's own entry is written afterwards.
        flattened.update(generate_chapters(subtree, language, title, path))

        if "data" not in subtree:
            continue

        reference = ", ".join(path)
        # Only label the entry as "data" when it sits next to sub-sections.
        if any(name != "data" for name in subtree.keys()):
            reference += data_suffix
        flattened[reference] = parse_annotation(subtree["data"])

    return flattened
|
||
|
||
def get_list_depth(lst):
    """Return how many levels of ``list`` nesting ``lst`` contains.

    A non-list has depth 0, an empty or flat list has depth 1, and each
    further level of nested lists adds one (the deepest branch counts).
    """
    if isinstance(lst, list):
        # default=0 covers the empty list, which still counts as one level.
        return 1 + max(map(get_list_depth, lst), default=0)
    return 0