-
Notifications
You must be signed in to change notification settings - Fork 0
/
vaderArticleSent.py
139 lines (114 loc) · 3.67 KB
/
vaderArticleSent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pymongo import MongoClient
# from wordcloud import WordCloud, STOPWORDS
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# import re
# import pandas
#
# import seaborn as sns
client = MongoClient("mongodb+srv://")
db = client.USArticles
num_of_articles = 0
total_articles = []
keywords_list = []
keywords_dict = {}
all_text = ""
news_outlet = "Time"
with client:
#db = client.Categories
articles = db.articles
for x in articles.find({"news_outlet": news_outlet}, {'text': 1,'keywords':1}):
num_of_articles +=1
#print(x)
total_articles.append(x['text'])
keywords = x['keywords']
for word in keywords:
all_text += word + " "
keywords_list.append(word)
if word in keywords_dict:
keywords_dict[word] += 1
else:
keywords_dict[word] = 1
file_name = news_outlet+"KeyWords.csv"
with open(file_name, 'w') as f:
for key in keywords_dict.keys():
f.write("%s,%s\n"%(key,keywords_dict[key]))
#
# corpus = keywords_list
# #Most frequently occuring words
# def get_top_n_words(corpus, n=None):
# vec = CountVectorizer().fit(corpus)
# bag_of_words = vec.transform(corpus)
# sum_words = bag_of_words.sum(axis=0)
# words_freq = [(word, sum_words[0, idx]) for word, idx in
# vec.vocabulary_.items()]
# words_freq =sorted(words_freq, key = lambda x: x[1],
# reverse=True)
# return words_freq[:n]
#
#
# #Convert most freq words to dataframe for plotting bar plot
# top_words = get_top_n_words(corpus, n=20)
# top_df = pandas.DataFrame(top_words)
# top_df.columns=["Word", "Freq"]
#
# #Barplot of most freq words
# sns.set(rc={'figure.figsize':(13,8)})
# g = sns.barplot(x="Word", y="Freq", data=top_df)
# g.set_xticklabels(g.get_xticklabels(), rotation=30)
#text = keywords_list[1]
scores = {"Positive": 0 , "Negative": 0 , "Neutral": 0}
scores_list = []
analyzer = SentimentIntensityAnalyzer()
pos=neg=neu=compound=compound2=0
#sentences = nltk.sent_tokenize(text.lower())
for article in total_articles:
idx = 1
pos = neg = neu = compound = compound2 = 0
peregraphs = article.split("Advertisement")
for per in peregraphs:
sentences = per.split(".")
for sentence in sentences:
#print("Sent:", sentence)
if(len(sentence) < 3):
continue
print("Sent:", sentence)
sent_len = len(sentence.split(" "))
if sent_len < 1:
sent_len = 50
vs = analyzer.polarity_scores(sentence)
pos += vs["pos"]
neg += vs["neg"]
neu += vs["neu"]
#compound += vs["compound"] / sent_len
compound2 += vs["compound"]
idx += 1
print("Final result for the article:")
print("idx:",idx)
print("pos:", pos/idx)
print("neg:", neg/idx)
print("neu:", neu/idx)
#print("compound1:", compound)
print("compound2:", compound2)
print("Score:",compound2/idx)
#print("keywords:",keywords_list)
print("num_of_articles",num_of_articles)
print("--------------")
score = compound2/idx
if score >= 0.05:
scores["Positive"] += 1
scores_list.append("Positive")
print("Positive")
elif score <= - 0.05:
scores["Negative"] += 1
scores_list.append("Negative")
print("Negative")
else:
scores["Neutral"] += 1
scores_list.append("Neutral")
print("Neutral")
print(scores)
print(scores_list)