-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtexteda.py
113 lines (93 loc) · 3.8 KB
/
texteda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Package that contains a collection of functions that can be used for generic
exploratory data analysis on text.
"""
import matplotlib.pyplot as plt
from pandas import DataFrame
from seaborn import barplot
from wordcloud import WordCloud
from collections import Counter
from nltk.collocations import BigramCollocationFinder
from nltk import word_tokenize
def most_common_words(input_df: DataFrame,
                      text_col: str,
                      n_most_common=20):
    """
    Find the words that appear in the largest number of documents.

    Every entry of ``input_df[text_col]`` is treated as one document and is
    split on whitespace. A word is counted at most once per document, so the
    reported count is the number of documents that contain the word rather
    than the total number of occurrences across all documents.

    Entries that are not strings (for example ``None`` or NaN, which is how
    pandas usually represents missing text) are skipped.

    Args:
    ------
    input_df: Dataframe that contains the relevant text column
    text_col: Name of the column
    n_most_common: Number of most common words to return

    Returns:
    --------
    Pandas dataframe with two columns, ``words`` and ``count``, ordered from
    the most common word to the least common one.
    """
    # NaN is a float, so an ``is not None`` check would let it through and
    # ``.split()`` would fail; keep only real strings.
    documents = (doc.split() for doc in input_df[text_col]
                 if isinstance(doc, str))
    # ``set(words)`` collapses repeats inside a single document.
    word_counter = Counter(word for words in documents for word in set(words))
    return DataFrame(word_counter.most_common(n_most_common),
                     columns=['words', 'count'])
def plot_most_common_words(input_df: DataFrame,
                           text_col: str,
                           n_most_common=20,
                           figsize=(10, 10)) -> None:
    """
    Draw a bar chart of the most common words found in a text column.

    The words come from ``most_common_words``; counts are placed on the
    x-axis and words on the y-axis.

    Args:
    ------
    input_df: Dataframe that contains the relevant text column
    text_col: Name of the column
    n_most_common: Number of words to include in the chart
    figsize: Figure size handed to ``plt.subplots``

    Returns:
    --------
    None. The chart is displayed with ``plt.show()``.
    """
    # Create the figure first so seaborn draws onto it as the current axes.
    plt.subplots(figsize=figsize)
    word_counts = most_common_words(input_df=input_df,
                                    text_col=text_col,
                                    n_most_common=n_most_common)
    axes = barplot(x='count', y='words', data=word_counts)
    axes.set_title('Common Words Found - Overall', fontweight='bold')
    plt.grid(True, alpha=0.3, linestyle='-', color='black')
    plt.show()
def plot_wordcloud(input_df: DataFrame,
                   text_col: str,
                   figsize=(10, 10)) -> None:
    """
    Generate a WordCloud plot based on the number of occurrences of words
    in a set of documents.

    All string entries of ``input_df[text_col]`` are joined into a single
    text, separated by spaces, and handed to ``WordCloud().generate``.
    Entries that are not strings (for example ``None`` or NaN) are skipped.

    Args:
    ------
    input_df: Dataframe that contains the relevant text column
    text_col: Name of the column
    figsize: Figure size handed to ``plt.figure``

    Returns:
    --------
    None. The word cloud is displayed with ``plt.show()``.
    """
    # NaN is a float and would make ``str.join`` raise a TypeError, so an
    # ``is not None`` check is not enough to drop missing values.
    combined_text = ' '.join(doc for doc in input_df[text_col]
                             if isinstance(doc, str))
    # Build the cloud before opening a figure so that a failure here does
    # not leave an empty figure behind.
    wordcloud = WordCloud().generate(combined_text)
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def compute_bigrams(input_df: DataFrame,
                    text_col: str) -> dict:
    """
    Calculate the number of occurrences that a pair of words appear next to
    each other, and return a dictionary of pair of words - count.

    All string entries of ``input_df[text_col]`` are joined into a single
    text, separated by spaces, before tokenizing. Because of that, the last
    token of one document and the first token of the next one are also
    counted as a pair. Entries that are not strings (for example ``None`` or
    NaN) are skipped.

    Pairs are dropped when either token is a single character, or when
    either token is exactly ``'s``.

    Args:
    ------
    input_df: Dataframe that contains the relevant text column
    text_col: Name of the column

    Returns:
    --------
    Dictionary that maps each kept pair of tokens to its count.
    """
    # Missing values would make ``str.join`` raise a TypeError.
    combined_text = ' '.join(doc for doc in input_df[text_col]
                             if isinstance(doc, str))
    finder = BigramCollocationFinder.from_words(word_tokenize(combined_text))
    return {
        pair: count
        for pair, count in finder.ngram_fd.items()
        # Condition to avoid characters like '@' and '#', plus the
        # possessive token "'s".
        if len(pair[0]) > 1 and len(pair[1]) > 1 and "'s" not in pair
    }
def plot_bigrams(input_df: DataFrame,
                 text_col: str,
                 top_n: int,
                 figsize=(10, 8)) -> None:
    """
    Draw a bar chart of the most frequent pairs of adjacent words.

    The pairs and their counts come from ``compute_bigrams``. They are
    sorted by descending count and the first ``top_n`` are plotted, with
    each pair labelled as ``first-second``.

    Args:
    ------
    input_df: Dataframe that contains the relevant text column
    text_col: Name of the column
    top_n: Number of pairs to include in the chart
    figsize: Figure size handed to ``plt.figure``

    Returns:
    --------
    None. The chart is displayed with ``plt.show()``.

    Raises:
    -------
    ValueError: If there are no pairs to plot.
    """
    bigrams_dict = compute_bigrams(input_df=input_df, text_col=text_col)
    top_bigrams = sorted(bigrams_dict.items(),
                         key=lambda item: item[1],
                         reverse=True)[0:top_n]
    if not top_bigrams:
        # Fail with a clear message instead of an opaque unpacking error.
        raise ValueError(
            f'No pairs of words to plot for column {text_col!r} '
            f'with top_n={top_n}')
    labels = ['-'.join(pair) for pair, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]
    plt.figure(figsize=figsize)
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally.
    g = barplot(x=labels, y=counts, palette='muted')
    g.set_xticklabels(g.get_xticklabels(), rotation=80)
    plt.title(f'Top-{top_n} pairs of words that appear next to each other',
              fontweight='bold')
    plt.ylabel('Count')
    plt.grid(True, alpha=0.1, color='black')
    plt.show()