-
Notifications
You must be signed in to change notification settings - Fork 1
/
create-word-frequency-report-from-pdf.py
79 lines (71 loc) · 2.72 KB
/
create-word-frequency-report-from-pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from collections import Counter
import numpy as np
import re
import chardet
import matplotlib as mpl
import fitz
import sys
import plotly.graph_objects as go
# Backend selection must happen before matplotlib.pyplot is imported below.
mpl.use('MacOSX') # optional (depends on user's operating system)
# Route all figure text through LaTeX.
mpl.rcParams['text.usetex'] = True
# The preamble is given as one string rather than a list: recent Matplotlib
# releases no longer accept a list for this setting, while a plain string is
# accepted by older releases as well.
mpl.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
import matplotlib.pyplot as plt
import argparse
def openfile(Filename):
    """Return the text of every page of a document as a single string.

    Args:
        Filename: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text extracted from each page, concatenated in page order.
    """
    # The context manager closes the document even if extraction fails,
    # so the underlying file handle is not leaked.
    with fitz.open(Filename) as pdf:
        # join() avoids re-copying the accumulated text for every page.
        return ''.join(page.get_text() for page in pdf)
def removegarbage(str):
    """Normalise raw text so it can be split into words.

    Every run of characters matched by the regex class ``\\W`` (anything
    that is not a letter, digit or underscore) is replaced by one space,
    and the result is lower-cased.

    Note: the parameter name shadows the ``str`` builtin inside this
    function; it is kept because it is part of the existing signature.
    """
    spaced = re.sub(r'\W+', ' ', str)
    return spaced.lower()
def getwordbins(words):
    """Count how many times each item occurs in *words*.

    Args:
        words: Iterable of hashable items (typically word strings).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its count,
        with keys in first-seen order.
    """
    # iter() makes the Counter constructor count elements one by one even
    # when a mapping is passed, matching a plain "for word in words" loop.
    return Counter(iter(words))
def plot_histogram(nums,labels,Filename=None):
    """Show a Matplotlib bar chart of word counts and block until closed.

    Args:
        nums: Bar heights (word counts), one per label.
        labels: Tick labels (the words), in the same order as *nums*.
        Filename: Optional name included in the chart title.
    """
    plt.figure(1, figsize=(16, 8))
    bins = np.arange(len(labels))
    # Bars sit at consecutive integers, so their spacing is always 1.
    # Using the constant instead of bins[1] - bins[0] avoids an IndexError
    # when there are fewer than two labels.
    bar_width = 0.8
    plt.bar(bins, nums, bar_width)
    plt.xticks(bins, labels, rotation=90, ha='center', fontsize=9)
    if Filename is not None:
        plt.title(f'Word Frequency Report for {Filename}')
    else:
        plt.title('Word Frequency Report')
    plt.tight_layout()
    plt.show(block=True)
def plotly_histogram(nums, labels, Filename=None):
    """Display a Plotly bar chart of word counts.

    Args:
        nums: Bar heights (word counts), one per label.
        labels: Category names (the words) for the x axis.
        Filename: Optional name appended to the chart title.
    """
    if Filename is None:
        title = 'Word Frequency Report'
    else:
        title = f'Word Frequency Report for {Filename}'

    bars = go.Bar(x=labels, y=nums)
    fig = go.Figure(data=[bars])
    # Tick labels are tilted by -45 degrees.
    fig.update_layout(xaxis_tickangle=-45, title=title)
    fig.show()
# Tokens excluded from the report. Built once as a frozenset so the
# membership test inside main() is O(1) instead of scanning a list that is
# rebuilt for every candidate word.
_STOP_WORDS = frozenset([
    'were', 'was', 'these', 'into', 'no', 'would', 'how', 'but', 'there',
    'over', 'only', 'than', 'some', 'each', 'when', 'may', 'its', 'any',
    'does', 'their', 'it', 'not', 'his', 'yet', 'what', 'all', 'we', 'our',
    'they', 'i', 'mr', 'ms', 'mrs', 'one', 'two', 'said', 'also', 'have',
    'from', 'the', 'of', 'are', 'and', 'in', 'to', 'for', 'a', 'will', 'be',
    'on', 'with', 'is', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'by',
    'this', 'at', 'as', 'can', 'has', 'that', 'an', 'which', 'or',
])


def main():
    """Parse the command line and plot the most frequent words of a PDF.

    Command-line arguments:
        Filename: Path of the PDF file to analyse.
        topwords: Number of most common words to plot.

    Stop words and one-character tokens are discarded before the top
    ``topwords`` entries are selected, so the chart shows up to that many
    bars (fewer only if the document has fewer eligible words).
    """
    parser = argparse.ArgumentParser(description='Plot a histogram of the top N most common words in a PDF file.')
    parser.add_argument('Filename', type=str, help='Path to the PDF file')
    parser.add_argument('topwords', type=int, help='The number of most common words to be plotted')
    args = parser.parse_args()
    Filename = args.Filename
    topwords = args.topwords

    txt = removegarbage(openfile(Filename))
    # split() without an argument drops the empty strings that a leading or
    # trailing space would otherwise produce.
    bins = getwordbins(txt.split())

    # Filter first, then truncate: taking the top N before filtering lets
    # stop words use up most of the N slots.
    ranked = [
        (word, count)
        for word, count in bins.most_common()
        if len(word) > 1 and word not in _STOP_WORDS
    ]
    # max() keeps a negative count from turning into a "drop from the end"
    # slice; it yields an empty chart, as most_common() does for n <= 0.
    top = ranked[:max(topwords, 0)]

    labels = [word for word, _ in top]
    nums = [count for _, count in top]
    plotly_histogram(nums, labels, Filename)
# Run the command-line entry point only when executed as a script; the
# return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())