-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordcloud.py
130 lines (67 loc) · 2.48 KB
/
wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# coding: utf-8
# In[15]:
# zipfile can read .zip archives directly (nothing in the code visible here uses it)
import zipfile
import pandas as pd
import os
# The review comments are long texts, so lift pandas' column-width limit
# (None = no truncation) to display each comment in full.
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
import mpld3
# In[2]:
import nltk
# Download the VADER lexicon resource; this must run before the analyzer
# imported below is instantiated.
nltk.download('vader_lexicon')
# In[3]:
# Import the VADER sentiment analyzer from the NLTK library
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer, created on first use. Building an analyzer is the
# expensive part, so it is done once rather than once per classified text.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def classify_compound(text, threshold=0.33):
    """Label a text 'Negative', 'Neutral' or 'Positive' from its VADER compound score.

    Args:
        text: The text to score. Missing values (None, or the float NaN that
            pandas uses for an empty cell) and blank strings are scored as
            0.0 without calling VADER; any other non-string value is
            converted with str() before scoring.
        threshold: Cut-off applied symmetrically to the compound score.

    Returns:
        'Negative' when the compound score is <= -threshold, 'Positive' when
        it is >= threshold, and 'Neutral' otherwise.
    """
    # NaN is the only float that is not equal to itself.
    is_missing = text is None or (isinstance(text, float) and text != text)
    text = "" if is_missing else str(text)

    if text.strip():
        # polarity_scores returns negative, neutral, positive and compound
        # scores for the text; only the compound score is used here.
        score = _get_vader_analyzer().polarity_scores(text)['compound']
    else:
        # Nothing to analyse: use the neutral compound score of 0.0.
        score = 0.0

    # Translate the score into a label according to the threshold.
    if score <= -threshold:
        return 'Negative'
    if score >= threshold:
        return 'Positive'
    return 'Neutral'
# In[4]:
# Load the CSAT review data; the free-text comments are in the CSAT_COMMENT column.
# No de-duplication or row filtering is applied to text_data here.
CSV_PATH = r"C:\Users\EliGrinfeld\Box\Eli Grinfeld\CSAT_Clean.csv"
text_data = pd.read_csv(CSV_PATH)
# Normalise the comments once for both uses below: empty cells are read as
# float NaN, which " ".join() rejects, so they are replaced by empty strings.
comments = text_data.CSAT_COMMENT.fillna("").astype(str)
# In[5]:
# Create a new feature holding the sentiment label that "classify_compound"
# derives from VADER's compound score.
text_data['compound_sentiment'] = comments.apply(classify_compound)
df = text_data
# In[6]:
# Import the word-cloud library.
# NOTE(review): if this script is itself saved as wordcloud.py, Python resolves
# this import to the script instead of the installed library and the import
# fails - confirm the file name and rename the script if so.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# In[7]:
# Start from the stop words shipped with the wordcloud library
stopwords = set(STOPWORDS)
# In[12]:
# Add some extra ad hoc stop words that should not appear in the cloud
xtra_words = ['ARBONNE', 'PRODUCTS']
stopwords.update(xtra_words)
# In[13]:
# Join all reviews into a single text
text = " ".join(comments)
# In[14]:
# Generate the image: at most 100 words, each at least 5 characters long
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    max_words=100,
    min_word_length=5,
).generate(text)
# Visualize the image
fig = plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Total Reviews Word Cloud')
plt.show()
# In[16]:
# Export the figure as HTML. The context manager closes the file even if the
# write fails, and the explicit encoding avoids depending on the platform default.
html_str = mpld3.fig_to_html(fig)
with open("index.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_str)