-
Notifications
You must be signed in to change notification settings - Fork 0
/
meaningly.py
97 lines (81 loc) · 3.17 KB
/
meaningly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from absl import logging
import tensorflow as tfy
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import docx
import nltk.data
def plot_similarity(textlabels, textfeatures, sentencelabels, sentencefeatures, threshold):
    """Draw a heatmap of the similarity between text passages and sentences.

    The similarity matrix is the inner product of ``textfeatures`` (one row
    per text label) with ``sentencefeatures`` (one column per sentence
    label). Only rows that contain at least one value whose magnitude reaches
    ``abs(threshold)`` are plotted.

    Args:
        textlabels: Row labels, indexable by position; one per row of
            ``textfeatures``. The sequence is only read, never modified.
        textfeatures: Feature vectors for the text passages.
        sentencelabels: Column labels, one per row of ``sentencefeatures``.
        sentencefeatures: Feature vectors for the sentences compared against.
        threshold: A row is kept when any of its similarity values has an
            absolute value of at least ``abs(threshold)``.
    """
    plt.figure()
    corr = np.inner(textfeatures, sentencefeatures)

    # Boolean mask of the rows that reach the threshold somewhere; computed
    # once and reused for both the labels and the data so they stay aligned.
    keep_rows = np.any(abs(corr) >= abs(threshold), axis=1)

    # Look the labels up by index instead of popping them: popping would
    # remove the plotted entries from the caller's list as a side effect.
    y_labels = [textlabels[row] for row in np.where(keep_rows)[0]]
    arr = corr[keep_rows]

    sns.set(font_scale=0.8)
    g = sns.heatmap(
        arr,
        xticklabels=sentencelabels,
        yticklabels=y_labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
        square=True)
    g.set_xticklabels(sentencelabels, rotation=45, ha='right')
    g.set_title("Semantic Textual Similarity")
    plt.tight_layout()
    # Leave room on the left and bottom for the tick labels.
    plt.subplots_adjust(left=0.5, bottom=0.3)
    plt.show()
# TODO: add an input parameter split_sen. It should be a boolean that determines whether or not paragraphs are split
# into sentences.
def process_file(file, start_symbol):
    """Read a Word document and return its text split into sentences.

    Args:
        file: Path or file-like object handed to ``docx.Document``.
        start_symbol: Marker that ends the prefix to discard in each
            paragraph. When it is ``None`` every paragraph is tokenized in
            full. Otherwise only the text after the first occurrence of the
            marker is kept, and paragraphs that do not contain the marker
            are skipped.

    Returns:
        A flat list of sentences, in document order.

    Raises:
        FileError: If ``docx`` reports that the package cannot be found.
    """
    try:
        doc = docx.Document(file)
    except docx.opc.exceptions.PackageNotFoundError as err:
        # Chain the original error so the underlying cause stays visible.
        raise FileError("Could not open Word document: %r" % (file,)) from err

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    for para in doc.paragraphs:
        # Read the paragraph text once instead of on every character test.
        text = para.text
        if start_symbol is None:
            sentences.extend(tokenizer.tokenize(text))
            continue

        # An empty symbol never matches, so such paragraphs are skipped.
        marker = text.find(start_symbol) if start_symbol else -1
        if marker == -1:
            continue

        # Drop everything up to and including the symbol, plus the single
        # character that follows it (presumably a space after e.g. "Name:").
        remainder = text[marker + len(start_symbol) + 1:]
        sentences.extend(tokenizer.tokenize(remainder))
    return sentences
class FileError(Exception):
    """Raised when an attempt to open a Word file fails."""
class Meaningly:
    """Loads a sentence-encoder module and plots text/sentence similarity."""

    def __init__(self,
                 module_url="https://tfhub.dev/google/universal-sentence-encoder/4",
                 cache_dir='/TensorFlowCache'):
        """Load the encoder module.

        Args:
            module_url: Handle passed to ``hub.load``.
            cache_dir: Value written to the ``TFHUB_CACHE_DIR`` environment
                variable before the module is loaded. Pass ``None`` to leave
                the environment as it is.
        """
        if cache_dir is not None:
            os.environ['TFHUB_CACHE_DIR'] = cache_dir
        self.model = hub.load(module_url)
        print("module %s loaded" % module_url)

    def embed(self, input_):
        """Return the loaded model's output for ``input_``."""
        return self.model(input_)

    # TODO: Join corresponding labels and features together in a tuple so that features can be sorted by correlation
    # and the labels will be sorted with them.
    def run_and_plot(self, text_, sentences_, threshold_):
        """Embed both inputs and plot their similarity.

        Args:
            text_: Text passages; used both as model input and as row labels.
            sentences_: Sentences to compare against; used both as model
                input and as column labels.
            threshold_: Threshold forwarded to ``plot_similarity``.
        """
        text_embeddings_ = self.embed(text_)
        sentence_embeddings_ = self.embed(sentences_)
        plot_similarity(text_, text_embeddings_, sentences_, sentence_embeddings_, threshold_)

    def process_run_plot(self, file, sentences_to_compare, user_threshold, start_symbol=':'):
        """Extract sentences from a Word file, then embed and plot them.

        Args:
            file: Document handed to ``process_file``.
            sentences_to_compare: Sentences the document text is compared with.
            user_threshold: Threshold forwarded to ``run_and_plot``.
            start_symbol: Marker forwarded to ``process_file``.
        """
        text = process_file(file, start_symbol)
        self.run_and_plot(text, sentences_to_compare, user_threshold)
# For quick testing without the GUI. The snippet below is wrapped in a bare string literal, so it is not executed;
# remove the surrounding triple quotes to run it.
"""meaningly = Meaningly()
meaningly.process_run_plot(r'C:/Users/spenc/Documents/Transcript.docx', ["No, you didn't", "I don't know"], 0)"""