-
Notifications
You must be signed in to change notification settings - Fork 1
/
create-word-frequency-report-from-txt.py
65 lines (59 loc) · 2.18 KB
/
create-word-frequency-report-from-txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from collections import Counter
import numpy as np
import re
import chardet
import matplotlib as mpl
import sys
# Matplotlib configuration, applied before matplotlib.pyplot is imported below.
# The native 'MacOSX' backend only exists on macOS, so it is selected there
# and every other platform keeps Matplotlib's default backend.
if sys.platform == 'darwin':
    mpl.use('MacOSX')
mpl.rcParams['text.usetex'] = True
# text.latex.preamble must be a single string; current Matplotlib releases
# reject the list form that very old versions accepted.
mpl.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
import matplotlib.pyplot as plt
import argparse
def openfile(Filename):
    """Read a text file of unknown encoding and return its contents as text.

    The file is read once in binary mode, chardet guesses the encoding from
    those bytes, and the same bytes are then decoded with that encoding.

    Args:
        Filename: Path of the file to read.

    Returns:
        The decoded contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(Filename, 'rb') as f:
        raw = f.read()
    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty file); fall back to UTF-8 rather than failing in
    # decode(None) with a TypeError.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding)
def removegarbage(str):
    """Normalise raw text so it can be split into words.

    Each run of non-word characters (anything the regex class ``\\W``
    matches, i.e. not a letter, digit or underscore) is collapsed into a
    single space, and the result is lower-cased.

    Args:
        str: The text to clean. The name shadows the builtin inside this
            function only; it is kept because it is part of the signature.

    Returns:
        The cleaned, lower-cased text.
    """
    collapsed = re.sub(r'\W+', ' ', str)
    return collapsed.lower()
def getwordbins(words):
    """Count how many times each word occurs.

    Args:
        words: An iterable of hashable tokens (typically a list of strings).

    Returns:
        A ``collections.Counter`` mapping each distinct token to its number
        of occurrences.
    """
    # Counter tallies an iterable directly; no manual loop is needed.
    return Counter(words)
# Characters that have a special meaning in LaTeX text mode, mapped to an
# escaped form that renders the literal character.
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _escape_latex(text):
    """Return *text* with LaTeX specials escaped when usetex rendering is on."""
    if not mpl.rcParams['text.usetex']:
        return text
    # translate() works in a single pass, so the braces introduced by one
    # replacement are never escaped a second time.
    return str(text).translate(_LATEX_ESCAPES)


def plot_histogram(nums, labels, Filename=None):
    """Show a bar chart of word counts.

    Args:
        nums: Bar heights (word counts), in the same order as ``labels``.
        labels: The words, used as x-axis tick labels.
        Filename: Optional name of the analysed file; when given it is
            included in the plot title.

    The call blocks until the plot window is closed.
    """
    plt.figure(1, figsize=(16, 8))
    bins = np.arange(len(labels))
    # Bars sit on consecutive integers, so their spacing is always 1. Using
    # the constant avoids the IndexError that ``bins[1] - bins[0]`` raised
    # when there were fewer than two labels.
    bar_width = 0.8
    plt.bar(bins, nums, bar_width)
    # With text.usetex enabled, an unescaped '_' (allowed in tokens and
    # common in file names) makes LaTeX abort, so escape all displayed text.
    tick_labels = [_escape_latex(label) for label in labels]
    plt.xticks(bins, tick_labels, rotation=90, ha='center', fontsize=9)
    if Filename is not None:
        plt.title(f'Word Frequency Report for {_escape_latex(Filename)}')
    else:
        plt.title('Word Frequency Report')
    plt.tight_layout()
    plt.show(block=True)
# Words left out of the report. A frozenset gives constant-time membership
# tests and is built once instead of on every loop iteration.
_STOPWORDS = frozenset([
    'we', 'our', 'they', 'i', 'mr', 'ms', 'was', 'mrs', 'one', 'two', 'said',
    'also', 'have', 'from', 'the', 'of', 'are', 'and', 'et', 'al', 'were',
    'in', 'to', 'for', 'a', 'will', 'be', 'on', 'with', 'is',
    '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'by', 'this', 'at', 'as', 'can', 'has', 'that', 'an', 'which', 'or',
])


def main():
    """Parse the command line and plot the most common words of a text file.

    Command-line arguments:
        Filename: Path to the text file.
        topwords: The number of most common words to plot.

    Stop words and single-character tokens are excluded from the report.
    """
    parser = argparse.ArgumentParser(description='Plot a histogram of the top N most common words in a text file.')
    parser.add_argument('Filename', type=str, help='Path to the text file')
    parser.add_argument('topwords', type=int, help='The number of most common words to be plotted')
    args = parser.parse_args()

    txt = removegarbage(openfile(args.Filename))
    # split() without an argument discards the empty tokens that split(' ')
    # produced for leading and trailing separators.
    # Filtering happens BEFORE ranking: filtering the top N afterwards (as
    # the code used to) dropped stop words from the result and plotted far
    # fewer than the requested number of words.
    words = [
        word for word in txt.split()
        if len(word) > 1 and word not in _STOPWORDS
    ]
    ranked = getwordbins(words).most_common(args.topwords)
    labels = [word for word, _ in ranked]
    nums = [count for _, count in ranked]
    plot_histogram(nums, labels, args.Filename)
# Run the command-line entry point only when this file is executed as a
# script; main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())