-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #245 from clulab/kwalcock/sentiment
Add sentiment to Ghana dataset
- Loading branch information
Showing
3 changed files
with
113 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from pandas import DataFrame | ||
from pipeline import InputStage | ||
|
||
import pandas | ||
|
||
class SentimentInputStage(InputStage):
    """Input stage that loads the tab-separated sentiment dataset with pandas."""

    def __init__(self, file_name: str) -> None:
        """Store the name of the file that run() will read.

        Args:
            file_name: Path of the tab-separated input file.
        """
        # NOTE(review): "." is forwarded to InputStage unchanged; presumably a
        # base directory -- confirm against InputStage.
        super().__init__(".")
        self.file_name = file_name

    def mk_data_frame(self, file_name: str) -> DataFrame:
        """Read a tab-separated file into a DataFrame with explicit column types.

        Args:
            file_name: Path of the file to read.  This argument is the file
                that is actually read; run() passes self.file_name.

        Returns:
            The DataFrame produced by pandas.read_csv.
        """
        # Only the empty string counts as a missing value; pandas' default NA
        # markers are disabled so that other text is kept as written.
        data_frame = pandas.read_csv(
            file_name,
            sep="\t",
            encoding="utf-8",
            na_values=[""],
            keep_default_na=False,
            dtype={
                "url": str,
                "terms": str,
                "date": str,
                "sentenceIndex": int,
                "sentence": str,
                # "context": str,
                "causal": bool,
                "causalIndex": "Int32",  # this allows for None
                "negationCount": "Int32",

                "causeIncCount": "Int32",
                "causeDecCount": "Int32",
                "causePosCount": "Int32",
                "causeNegCount": "Int32",

                "effectIncCount": "Int32",
                "effectDecCount": "Int32",
                "effectPosCount": "Int32",
                "effectNegCount": "Int32",

                "causeText": str,
                "effectText": str,
                # "prevSentence": str
                # Except for belief, the new ones are all str so that they are
                # transferred unchanged to the output.
                "belief": bool,
                "sent_locs": str,
                "context_locs": str,
                "canonicalDate": str,
                "prevLocation": str,
                "prevDistance": str,
                "nextLocation": str,
                "nextDistance": str,
            },
        )
        return data_frame

    def run(self) -> DataFrame:
        """Load the configured input file and return it as a DataFrame."""
        return self.mk_data_frame(self.file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from argparse import ArgumentParser | ||
from pandas_output_stage import PandasOutputStage | ||
from pipeline import Pipeline | ||
from sentiment_input_stage import SentimentInputStage | ||
from sentiment_sentiment_stage import SentimentSentimentStage | ||
from typing import Tuple | ||
|
||
|
||
def get_in_and_out() -> Tuple[str, str]:
    """Parse the command line and return the (input, output) file names.

    Both -i/--input and -o/--output are required options.
    """
    parser = ArgumentParser()
    # Register the two mandatory options from one table.
    for short_flag, long_flag, description in (
        ("-i", "--input", "input file name"),
        ("-o", "--output", "output file name"),
    ):
        parser.add_argument(short_flag, long_flag, required=True, help=description)
    parsed = parser.parse_args()
    return parsed.input, parsed.output
|
||
if __name__ == "__main__":
    # Hard-coded model and corpus paths.  Command-line parsing is available via
    # get_in_and_out(), but it is currently not used for this run.
    sentiment_model_name = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
    input_file_name = "../corpora/ghana-sentiment-tsv/ghana-larger.tsv"
    output_file_name = "../corpora/ghana-sentiment-tsv/ghana-larger-sentiment.tsv"

    # Build the stages in pipeline order: input, inner stages, output.
    input_stage = SentimentInputStage(input_file_name)
    inner_stages = [SentimentSentimentStage(sentiment_model_name)]
    output_stage = PandasOutputStage(output_file_name)

    pipeline = Pipeline(input_stage, inner_stages, output_stage)
    pipeline.run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from pandas import DataFrame | ||
from pipeline import InnerStage | ||
from transformers import pipeline | ||
from tqdm import tqdm | ||
|
||
import math | ||
|
||
class SentimentSentimentStage(InnerStage):
    """Inner stage that adds a signed sentiment score for each belief row."""

    def __init__(self, model_name: str, belief_column_name: str = "belief", text_column_name: str = "sentence", device: str = "cpu") -> None:
        """Create the sentiment-analysis pipeline and remember the column names.

        Args:
            model_name: Model identifier handed to the transformers pipeline.
            belief_column_name: Column whose truthiness selects rows to score.
            text_column_name: Column holding the text to classify.
            device: Device handed to the transformers pipeline; defaults to
                "cpu", which was the previously hard-coded value.
        """
        super().__init__()
        self.sentiment_analysis = pipeline("sentiment-analysis", model=model_name, device=device)
        self.belief_column_name = belief_column_name
        self.text_column_name = text_column_name

    @staticmethod
    def _signed_score(label: str, score: float) -> float:
        """Map a classifier label and score to a signed sentiment value.

        "NEGATIVE" negates the score, "UNDETERMINED" yields 0, and every other
        label keeps the score as it is.
        """
        if label == "NEGATIVE":
            return score * -1
        if label == "UNDETERMINED":
            return 0
        return score

    def run(self, data_frame: DataFrame) -> DataFrame:
        """Append a "sentiment_scores" column to data_frame and return it.

        Rows whose belief column is truthy are classified and receive a signed
        score; all other rows receive NaN.  The frame is modified in place and
        the same object is returned.

        Args:
            data_frame: Frame containing the belief and text columns.

        Returns:
            The same DataFrame with the "sentiment_scores" column set.
        """
        # Walk only the two needed columns instead of iterrows(), which builds
        # a Series for every row.
        beliefs = data_frame[self.belief_column_name]
        texts = data_frame[self.text_column_name]
        sentiment_scores = []
        for belief, text in tqdm(zip(beliefs, texts), total=len(data_frame), desc="Calculating sentiment"):
            if belief:
                # The pipeline returns a list of result dicts; use the first.
                prediction = self.sentiment_analysis(text)[0]
                score = self._signed_score(prediction["label"], prediction["score"])
            else:
                score = math.nan
            sentiment_scores.append(score)
        data_frame["sentiment_scores"] = sentiment_scores
        return data_frame
|