Skip to content

Commit

Permalink
Merge pull request #245 from clulab/kwalcock/sentiment
Browse files Browse the repository at this point in the history
Add sentiment to Ghana dataset
  • Loading branch information
kwalcock authored Mar 9, 2024
2 parents 353ab57 + 6bbf002 commit 0533a3d
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 0 deletions.
53 changes: 53 additions & 0 deletions belief_pipeline/sentiment_input_stage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from pandas import DataFrame
from pipeline import InputStage

import pandas

class SentimentInputStage(InputStage):
    """Pipeline input stage that reads a tab-separated sentiment dataset.

    The stage remembers one file name at construction time and, when run,
    loads that file into a pandas DataFrame with explicit column types.
    """

    def __init__(self, file_name: str) -> None:
        """Remember which file to read.

        Args:
            file_name: Path of the TSV file that run() will load.
        """
        # NOTE(review): "." is what the base class is given here; presumably
        # it is a directory name -- confirm against InputStage.
        super().__init__(".")
        self.file_name = file_name

    def mk_data_frame(self, file_name: str) -> DataFrame:
        """Read a UTF-8, tab-separated file into a typed DataFrame.

        Only the empty string is treated as a missing value
        (na_values=[""] together with keep_default_na=False), so literal
        text such as "NA" survives as a string.

        Args:
            file_name: Path of the TSV file to read.  This argument is what
                gets read; previously it was ignored in favour of
                self.file_name.

        Returns:
            The DataFrame produced by pandas.read_csv.
        """
        column_types = {
            "url": str,
            "terms": str,
            "date": str,
            "sentenceIndex": int,
            "sentence": str,
            # "context": str,
            "causal": bool,
            "causalIndex": "Int32",  # this allows for None
            "negationCount": "Int32",

            "causeIncCount": "Int32",
            "causeDecCount": "Int32",
            "causePosCount": "Int32",
            "causeNegCount": "Int32",

            "effectIncCount": "Int32",
            "effectDecCount": "Int32",
            "effectPosCount": "Int32",
            "effectNegCount": "Int32",

            "causeText": str,
            "effectText": str,
            # "prevSentence": str
            # Except for belief, the new ones are all str so that they are
            # transferred unchanged to the output.
            "belief": bool,
            "sent_locs": str,
            "context_locs": str,
            "canonicalDate": str,
            "prevLocation": str,
            "prevDistance": str,
            "nextLocation": str,
            "nextDistance": str,
        }
        return pandas.read_csv(
            file_name,
            sep="\t",
            encoding="utf-8",
            na_values=[""],
            keep_default_na=False,
            dtype=column_types,
        )

    def run(self) -> DataFrame:
        """Load the file named at construction time and return its DataFrame."""
        return self.mk_data_frame(self.file_name)
28 changes: 28 additions & 0 deletions belief_pipeline/sentiment_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from argparse import ArgumentParser
from pandas_output_stage import PandasOutputStage
from pipeline import Pipeline
from sentiment_input_stage import SentimentInputStage
from sentiment_sentiment_stage import SentimentSentimentStage
from typing import Tuple


def get_in_and_out() -> Tuple[str, str]:
    """Parse the command line and return the (input, output) file names.

    Both -i/--input and -o/--output are required; argparse exits the
    program with a usage message when either one is missing.
    """
    parser = ArgumentParser()
    required_options = (
        ("-i", "--input", "input file name"),
        ("-o", "--output", "output file name"),
    )
    for short_flag, long_flag, description in required_options:
        parser.add_argument(short_flag, long_flag, required=True, help=description)
    parsed = parser.parse_args()
    return parsed.input, parsed.output

if __name__ == "__main__":
    # Script entry point: read the Ghana TSV, score sentiment, write the result.
    # The names are fixed below; get_in_and_out() is the command-line
    # alternative that is currently not used.
    model_name: str = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
    source_path: str = "../corpora/ghana-sentiment-tsv/ghana-larger.tsv"
    target_path: str = "../corpora/ghana-sentiment-tsv/ghana-larger-sentiment.tsv"

    # Stages are created in the order input, inner, output.
    input_stage = SentimentInputStage(source_path)
    inner_stages = [SentimentSentimentStage(model_name)]
    output_stage = PandasOutputStage(target_path)

    Pipeline(input_stage, inner_stages, output_stage).run()
32 changes: 32 additions & 0 deletions belief_pipeline/sentiment_sentiment_stage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from pandas import DataFrame
from pipeline import InnerStage
from transformers import pipeline
from tqdm import tqdm

import math

class SentimentSentimentStage(InnerStage):
    """Inner pipeline stage that adds a signed sentiment score per row.

    Rows whose belief column is truthy have their text column scored by a
    transformers "sentiment-analysis" pipeline; every other row gets NaN.
    """

    def __init__(self, model_name: str, belief_column_name: str = "belief", text_column_name: str = "sentence") -> None:
        """Create the classifier and remember which columns to use.

        Args:
            model_name: Model identifier handed to the transformers pipeline.
            belief_column_name: Column that decides whether a row is scored.
            text_column_name: Column holding the text to classify.
        """
        super().__init__()
        self.belief_column_name = belief_column_name
        self.text_column_name = text_column_name
        self.sentiment_analysis = pipeline("sentiment-analysis", model=model_name, device="cpu")

    def _signed_score(self, text: str) -> float:
        """Classify text and turn the (label, score) result into one number.

        "NEGATIVE" negates the score, "UNDETERMINED" yields 0, and any other
        label returns the score as reported by the classifier.
        """
        top_result = self.sentiment_analysis(text)[0]
        label = top_result["label"]
        score = top_result["score"]
        if label == "NEGATIVE":
            return -1 * score
        if label == "UNDETERMINED":
            return 0
        return score

    def run(self, data_frame: DataFrame) -> DataFrame:
        """Add a "sentiment_scores" column to data_frame and return it.

        The frame passed in is modified in place; the same object is returned.
        """
        progress = tqdm(data_frame.iterrows(), total=data_frame.shape[0], desc="Calculating sentiment")
        # The text cell is only read for rows that are beliefs.
        data_frame["sentiment_scores"] = [
            self._signed_score(row[self.text_column_name]) if row[self.belief_column_name] else math.nan
            for _, row in progress
        ]
        return data_frame

0 comments on commit 0533a3d

Please sign in to comment.