Skip to content

Commit

Permalink
added utils file
Browse files Browse the repository at this point in the history
  • Loading branch information
lobsam committed Dec 19, 2024
1 parent e2e1728 commit e6bf78c
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 0 deletions.
Empty file added src/pecha_uploader/pipeline.py
Empty file.
180 changes: 180 additions & 0 deletions src/pecha_uploader/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import re
from typing import Dict, List, Union


def generate_schema(
en_book: List[Dict], bo_book: List[Dict], en_key: str = "", bo_key: str = ""
) -> List:
"""This function generate index schema for both complex and simple text"""
nodes = []
# generate schema node for complex text
if "content" in bo_book:
botext = bo_book["content"]
entext = en_book["content"]
else:
botext = bo_book
entext = en_book

if isinstance(entext, dict):
for (enkey, envalue), (bokey, bovalue) in zip(entext.items(), botext.items()):
en_full_key = enkey.strip() if en_key else enkey
bo_full_key = bokey.strip() if bo_key else bokey
if isinstance(envalue, dict) and enkey != "data":
# Check if the dictionary has any children other than 'data'

has_children = any(sub_key != "data" for sub_key in envalue.keys())
child_nodes = generate_schema(
envalue, bovalue, en_full_key, bo_full_key
)
# if only data is present
if not has_children:
data_node = create_data_node(
en_full_key, bo_full_key, envalue["data"], bovalue["data"]
)
nodes.append(data_node)
else:
node = {
"nodes": child_nodes,
"titles": [
{"lang": "he", "text": bo_full_key, "primary": True},
{"lang": "en", "text": en_full_key, "primary": True},
],
"key": en_full_key,
}
nodes.append(node)

elif enkey == "data":
data_node = create_data_node(enkey, "གནས་བབས", envalue, bovalue)
nodes.append(data_node)
if isinstance(entext, list):
data_node = create_data_node(en_book["title"], bo_book["title"], entext, botext)
nodes.append(data_node)
return nodes


def create_data_node(
en_key: str,
bo_key: str,
envalue: Union[List[str], List[List]],
bovalue: Union[List[str], List[List]],
):
"""This function generate node for schema"""
text_depth = None
sections = ["Chapters", "Verses", "Paragraphs"]

if len(envalue) > 0:
text_depth = get_list_depth(envalue)
else:
text_depth = 1

if len(bovalue) > 0:
text_depth = get_list_depth(bovalue)
else:
text_depth = 1

return {
"nodeType": "JaggedArrayNode",
"depth": text_depth,
"addressTypes": list(map(lambda x: "Integer", sections[:text_depth])),
"sections": sections[:text_depth],
"titles": [
{"lang": "he", "text": bo_key, "primary": True},
{"lang": "en", "text": en_key, "primary": True},
],
"key": en_key,
}


def parse_annotation(value: Union[List[str], List[List]]):
"""clean and parse annotation"""

def process_item(item):
# If the item is a list, recursively process its contents
if isinstance(item, list):
return [process_item(sub_item) for sub_item in item]

# Convert item to string and apply transformations
if not isinstance(item, str):
item = str(item)

# Replace newlines
item = item.replace("\n", "<br>")

# Sapche transformation
if "<sapche>" in item:
item = item.replace("<sapche>", '<span class="text-subche-style">')
item = item.replace("</sapche>", "</span>")

# Citation transformation
if "{" in item:
item = item.replace("{", '<span class="text-citation-style">')
item = item.replace("}", "</span>")

# Quotation transformation
if "(" in item:
item = item.replace("(", '<span class="text-quotation-style">')
item = item.replace(")", "</span>")

# Remove numbered tags
item = re.sub(r"<\d+>", "", item.strip())

return item

# Process the entire input recursively
return process_item(value)


def generate_chapters(
book: List[Dict],
language: str,
current_key: str = "",
parent_keys: List[str] = None,
) -> Dict:
"""generate text content"""
result = {}
if parent_keys is None:
parent_keys = []

for key, value in book.items():
full_key = key if current_key else key
new_parent_keys = parent_keys + [key.strip()] # Update list of parent key
clean_value = []
if isinstance(value, dict):

# Check if the dictionary has any children other than 'data'
has_children = any(sub_key != "data" for sub_key in value.keys())
child_data = generate_chapters(value, language, full_key, new_parent_keys)
result.update(child_data) # Merge results from children

# Determine the key for 'data' depending on whether there are other children
if "data" in value:
clean_value = parse_annotation(value["data"])

# If there are other children, include 'data' in the key, else exclude it
if has_children:
if language == "bo":
data_key = ", ".join(new_parent_keys) + ", གནས་བབས"
else:
data_key = ", ".join(new_parent_keys) + ", data"
else:
data_key = ", ".join(
new_parent_keys
) # Exclude 'data' from the key if no other children
result[data_key] = clean_value

return result


def get_list_depth(lst):
"""
Function to calculate the depth of a nested list.
"""
if not isinstance(lst, list): # Base case: not a list, no depth
return 0
else:
max_depth = 0
for item in lst:
max_depth = max(
max_depth, get_list_depth(item)
) # Recurse and update max depth
return max_depth + 1 # Add one to include the current depth level

0 comments on commit e6bf78c

Please sign in to comment.