Skip to content

Commit

Permalink
all tests pass addresses #12
Browse files Browse the repository at this point in the history
  • Loading branch information
hrshdhgd committed Oct 22, 2021
1 parent 5ab18be commit b6fe4f1
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 10 deletions.
22 changes: 15 additions & 7 deletions ontorunner/post/add_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def sentencify(input_df, output_df, output_fn):
# and hence the biohub_converter codes this with a '_SYNONYM' tag.
# In order to counter this, we need to filter these extra rows out.
if not sub_df.empty and any(
sub_df["entity_id"].str.endswith("_SYNONYM")
sub_df["object_id"].str.endswith("_SYNONYM")
):
sub_df = filter_synonyms(sub_df)

Expand Down Expand Up @@ -213,14 +213,22 @@ def parse(input_directory, output_directory) -> None:
output_df.columns = output_df.columns.str.replace(" ", "_").str.lower()
# Consolidate rows where the entity is the same
# but was recognized from multiple origins
output_df = output_df.rename(
columns={"entity_id": "object_id", "type": "object_category"}
)

output_df = consolidate_rows(output_df)
output_df[["preferred_form", "match_field"]] = output_df[

output_df[["preferred_form", "object_label"]] = output_df[
"preferred_form"
].str.split("\\[SYNONYM_OF:", expand=True)

output_df["match_field"] = output_df["match_field"].str.replace(
output_df["object_label"] = output_df["object_label"].str.replace(
"]", "", regex=True
)
output_df["object_label"] = output_df["object_label"].fillna(
output_df["preferred_form"]
)

# Add a column that indicates how closely the recognized entity matches.
output_df.insert(
Expand Down Expand Up @@ -273,7 +281,7 @@ def parse(input_directory, output_directory) -> None:

output_df["sentence"] = ""

output_df["entity_sentence_%"] = ""
output_df["object_sentence_%"] = ""

output_df = output_df.reindex(
columns=[
Expand All @@ -283,17 +291,17 @@ def parse(input_directory, output_directory) -> None:
"end_position",
"matched_term",
"preferred_form",
"match_field",
"object_label",
"match_type",
"levenshtein_distance",
"jaccard_index",
"monge_elkan",
"entity_id",
"object_id",
"sentence_id",
"umls_cui",
"origin",
"sentence",
"entity_sentence_%",
"object_sentence_%",
]
)

Expand Down
6 changes: 3 additions & 3 deletions ontorunner/post/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def filter_synonyms(df: pd.DataFrame) -> pd.DataFrame:
condition_1 = (
df["matched_term"].str.lower() == df["preferred_form"].str.lower()
)
condition_2 = df["entity_id"].str.contains("_SYNONYM")
condition_2 = df["object_id"].str.contains("_SYNONYM")
same_yet_syn_condition = condition_1 & condition_2
new_df = df[~same_yet_syn_condition]
tmp_df = df[same_yet_syn_condition]
tmp_df["entity_id"] = (
tmp_df["entity_id"].str.strip("_SYNONYM").drop_duplicates()
tmp_df["object_id"] = (
tmp_df["object_id"].str.strip("_SYNONYM").drop_duplicates()
)
new_df = pd.concat([new_df, tmp_df])

Expand Down

0 comments on commit b6fe4f1

Please sign in to comment.