From a4d0314e1ddb12c200d54572fceb94d3d54aa4ba Mon Sep 17 00:00:00 2001
From: Harshad
Date: Fri, 13 Aug 2021 12:03:51 -0500
Subject: [PATCH] column names lowercased as per #12

---
 runner/post/add_sentence.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/runner/post/add_sentence.py b/runner/post/add_sentence.py
index e1b46642..34e4ca25 100644
--- a/runner/post/add_sentence.py
+++ b/runner/post/add_sentence.py
@@ -14,8 +14,8 @@ def find_extensions(dr, ext):
 
 
 def filterAlikeTermSynonyms(df):
-    condition_1 = df["MATCHED TERM"].str.lower() == df["PREFERRED FORM"].str.lower()
-    condition_2 = df["ENTITY ID"].str.contains("_SYNONYM")
+    condition_1 = df["matched_term"].str.lower() == df["preferred_form"].str.lower()
+    condition_2 = df["entity_id"].str.contains("_SYNONYM")
     fullConditionStatement = ~(condition_1 & condition_2)
     return df[fullConditionStatement]
 
@@ -39,30 +39,30 @@ def sentencify(input_df, output_df, output_fn):
             .replace("\r", "",)
         )
         text_tok = nltk.sent_tokenize(text)
-        sub_df = output_df[output_df["DOCUMENT ID"] == idx]
+        sub_df = output_df[output_df["document_id"] == idx]
         # In certain instances, in spite of the 'matched' and 'preferred'
         # terms being the same, the term is registered as a synonym by KGX and
         # hence the biohub_converter codes this with a '_SYNONYM' tag.
         # In order to counter this, we need to filter these extra rows out.
-        if not sub_df.empty and any(sub_df["ENTITY ID"].str.endswith("_SYNONYM")):
+        if not sub_df.empty and any(sub_df["entity_id"].str.endswith("_SYNONYM")):
             sub_df = filterAlikeTermSynonyms(sub_df)
 
         if len(text_tok) == 1:
-            sub_df["SENTENCE"] = text
+            sub_df["sentence"] = text
         else:
             relevant_tok = []
             start_reached = False
             end_reached = False
             for i, row2 in sub_df.iterrows():
-                term_of_interest = str(row2["MATCHED TERM"])
-                start_pos = int(row2["START POSITION"])
+                term_of_interest = str(row2["matched_term"])
+                start_pos = int(row2["start_position"])
                 if start_pos == 0:
                     start_reached = True
-                end_pos = int(row2["END POSITION"])
+                end_pos = int(row2["end_position"])
                 if end_pos == len(text):
                     end_reached = True
                 if term_of_interest == "nan":
-                    term_of_interest = str(row2["PREFERRED FORM"]).lower()
+                    term_of_interest = str(row2["preferred_form"]).lower()
 
                 relevant_tok = [x for x in text_tok if term_of_interest in x]
                 single_tok = relevant_tok
@@ -104,7 +104,7 @@ def sentencify(input_df, output_df, output_fn):
                 # the unique sentence forever.
                 # It's a hack but for now it'll do until severe consequences detected.
-                sub_df.loc[i, "SENTENCE"] = single_tok[0]
+                sub_df.loc[i, "sentence"] = single_tok[0]
 
         if not sub_df.empty:
             sub_df.to_csv(output_fn, mode="a", sep="\t", header=None, index=None)
@@ -129,7 +129,8 @@ def parse(input_directory, output_directory) -> None:
             if "runNER" not in x
         ][0]
         output_df = pd.read_csv(output_file, sep="\t", low_memory=False)
-        output_df["SENTENCE"] = ""
+        output_df["sentence"] = ""
+        output_df.columns = output_df.columns.str.replace(" ", "_").str.lower()
 
         final_output_file = os.path.join(output_directory, "runNER_Output.tsv")
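
Note (not part of the patch): a minimal sketch of what the normalization
line added in parse() does to the column headers. The sample headers below
are the ones this patch replaces; the snippet only demonstrates that
str.replace(" ", "_").str.lower() maps each old name to the new snake_case
form the rest of the patch expects.

    import pandas as pd

    # Headers as they appear in the runNER output before this patch.
    df = pd.DataFrame(
        columns=["DOCUMENT ID", "MATCHED TERM", "PREFERRED FORM",
                 "ENTITY ID", "START POSITION", "END POSITION", "SENTENCE"]
    )

    # Same normalization the patch adds after reading the output file.
    df.columns = df.columns.str.replace(" ", "_").str.lower()

    print(list(df.columns))
    # ['document_id', 'matched_term', 'preferred_form',
    #  'entity_id', 'start_position', 'end_position', 'sentence']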