Merge pull request #245 from clulab/kwalcock/sentiment

Add sentiment to Ghana dataset
clulab · Mar 9, 2024 · 0533a3d · 0533a3d
2 parents 353ab57 + 6bbf002
commit 0533a3d
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 0 deletions.
diff --git a/belief_pipeline/sentiment_input_stage.py b/belief_pipeline/sentiment_input_stage.py
@@ -0,0 +1,53 @@
+from pandas import DataFrame
+from pipeline import InputStage
+
+import pandas
+
+class SentimentInputStage(InputStage):
+    """Input stage that loads a tab-separated sentiment corpus with pandas."""
+
+    # Column dtypes for read_csv. "Int32" is pandas' nullable integer, so those
+    # columns may be empty. Except for belief, the columns after effectText are
+    # all str so that they are transferred unchanged to the output. "context"
+    # and "prevSentence" are not listed here, so pandas infers their dtype.
+    _DTYPES = {
+        "url": str,
+        "terms": str,
+        "date": str,
+        "sentenceIndex": int,
+        "sentence": str,
+        "causal": bool,
+        "causalIndex": "Int32",
+        "negationCount": "Int32",
+        "causeIncCount": "Int32",
+        "causeDecCount": "Int32",
+        "causePosCount": "Int32",
+        "causeNegCount": "Int32",
+        "effectIncCount": "Int32",
+        "effectDecCount": "Int32",
+        "effectPosCount": "Int32",
+        "effectNegCount": "Int32",
+        "causeText": str,
+        "effectText": str,
+        "belief": bool,
+        "sent_locs": str,
+        "context_locs": str,
+        "canonicalDate": str,
+        "prevLocation": str,
+        "prevDistance": str,
+        "nextLocation": str,
+        "nextDistance": str,
+    }
+
+    def __init__(self, file_name: str) -> None:
+        super().__init__(".")
+        self.file_name = file_name
+
+    def mk_data_frame(self, file_name: str) -> DataFrame:
+        """Read the TSV at file_name, honoring the argument rather than self.file_name."""
+        # Only the empty string counts as missing, so literal text such as "NA" is kept.
+        return pandas.read_csv(file_name, sep="\t", encoding="utf-8", na_values=[""], keep_default_na=False, dtype=self._DTYPES)
+
+    def run(self) -> DataFrame:
+        """Load the file named at construction time and return its DataFrame."""
+        return self.mk_data_frame(self.file_name)
diff --git a/belief_pipeline/sentiment_main.py b/belief_pipeline/sentiment_main.py
@@ -0,0 +1,28 @@
+from argparse import ArgumentParser
+from pandas_output_stage import PandasOutputStage
+from pipeline import Pipeline
+from sentiment_input_stage import SentimentInputStage
+from sentiment_sentiment_stage import SentimentSentimentStage
+from typing import Tuple
+
+
+def get_in_and_out() -> Tuple[str, str]:
+    """Return the (input, output) file names given by the required -i and -o options."""
+    parser = ArgumentParser()
+    parser.add_argument("-i", "--input", required=True, help="input file name")
+    parser.add_argument("-o", "--output", required=True, help="output file name")
+    return tuple(vars(parser.parse_args())[key] for key in ("input", "output"))
+
+if __name__ == "__main__":
+    # The model and corpus paths are fixed for the Ghana run. get_in_and_out()
+    # could supply the two file names from the command line instead.
+    model_name: str = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
+    source_path: str = "../corpora/ghana-sentiment-tsv/ghana-larger.tsv"
+    target_path: str = "../corpora/ghana-sentiment-tsv/ghana-larger-sentiment.tsv"
+
+    # Build the stages in the order the Pipeline constructor takes them: the
+    # input stage, the list of inner stages, and the output stage.
+    input_stage = SentimentInputStage(source_path)
+    inner_stages = [SentimentSentimentStage(model_name)]
+    output_stage = PandasOutputStage(target_path)
+    Pipeline(input_stage, inner_stages, output_stage).run()
diff --git a/belief_pipeline/sentiment_sentiment_stage.py b/belief_pipeline/sentiment_sentiment_stage.py
@@ -0,0 +1,32 @@
+from pandas import DataFrame
+from pipeline import InnerStage
+from transformers import pipeline
+from tqdm import tqdm
+
+import math
+
+class SentimentSentimentStage(InnerStage):
+    """Inner stage that adds a signed "sentiment_scores" column for belief rows."""
+
+    def __init__(self, model_name: str, belief_column_name: str = "belief", text_column_name: str = "sentence", device: str = "cpu") -> None:
+        """Load the sentiment-analysis model; device defaults to the former fixed "cpu"."""
+        super().__init__()
+        self.sentiment_analysis = pipeline("sentiment-analysis", model=model_name, device=device)
+        self.belief_column_name = belief_column_name
+        self.text_column_name = text_column_name
+
+    def _score(self, text: str) -> float:
+        """Return the top prediction's score: negated for NEGATIVE, 0 for UNDETERMINED."""
+        prediction = self.sentiment_analysis(text)[0]
+        label, score = prediction["label"], prediction["score"]
+        return -score if label == "NEGATIVE" else 0 if label == "UNDETERMINED" else score
+
+    def run(self, data_frame: DataFrame) -> DataFrame:
+        """Score each belief sentence (NaN for non-beliefs) and add the column in place."""
+        # Read just the two needed columns; iterrows() would build a Series per row.
+        beliefs = data_frame[self.belief_column_name]
+        texts = data_frame[self.text_column_name]
+        rows = tqdm(zip(beliefs, texts), total=len(data_frame), desc="Calculating sentiment")
+        data_frame["sentiment_scores"] = [self._score(text) if belief else math.nan for belief, text in rows]
+        return data_frame
+