text_summarizer_train.py
import numpy as np
import pandas as pd
import pickle
from statistics import mode
import nltk
from nltk import word_tokenize
from nltk.stem import LancasterStemmer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from tensorflow.keras.models import Model
from tensorflow.keras import models
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,Attention
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from utils import clean
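#clean() comes from this project's utils module; it is assumed to lowercase
#the text, strip HTML with BeautifulSoup, expand the contractions loaded
#below, remove stop words and stem, and return a list of tokens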
#read the dataset file
df=pd.read_csv("Reviews.csv",nrows=100000)
#drop the duplicate and na values from the records
df.drop_duplicates(subset=['Text'],inplace=True)
df.dropna(axis=0,inplace=True)
#treat empty summaries as missing values and drop them too,
#so the input and target columns stay aligned
df['Summary'] = df['Summary'].replace('', np.nan)
df.dropna(axis=0,inplace=True)
input_data = df.loc[:,'Text']
target_data = df.loc[:,'Summary']
input_texts=[]
target_texts=[]
input_words=[]
target_words=[]
contractions= pickle.load(open("contractions.pkl","rb"))['contractions']
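#contractions.pkl is assumed to store a dict under the key 'contractions'
#that maps contracted forms to expansions, e.g. "can't" -> "cannot"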
#initialize stop words and LancasterStemmer
stop_words=set(stopwords.words('english'))
stemm=LancasterStemmer()
#iterate over the paired input and target records
for in_txt,tr_txt in zip(input_data,target_data):
    in_words= clean(in_txt,"inputs")
    input_texts+= [' '.join(in_words)]
    input_words+= in_words
    #add 'sos' at start and 'eos' at end of text
    tr_words= clean("sos "+tr_txt+" eos","target")
    target_texts+= [' '.join(tr_words)]
    target_words+= tr_words
#store only unique words from input and target list of words
input_words = sorted(list(set(input_words)))
target_words = sorted(list(set(target_words)))
num_in_words = len(input_words) #total number of input words
num_tr_words = len(target_words) #total number of target words
#use the most frequent (modal) text length, counted in words, as the
#fixed sequence length for padding
max_in_len = mode([len(i.split()) for i in input_texts])
max_tr_len = mode([len(i.split()) for i in target_texts])
print("number of input words : ",num_in_words)
print("number of target words : ",num_tr_words)
print("input sequence length : ",max_in_len)
print("target sequence length : ",max_tr_len)
#split the data into training and testing sets with an 80:20 ratio
x_train,x_test,y_train,y_test=train_test_split(input_texts,target_texts,test_size=0.2,random_state=0)
#train the tokenizer with all the words
in_tokenizer = Tokenizer()
in_tokenizer.fit_on_texts(x_train)
tr_tokenizer = Tokenizer()
tr_tokenizer.fit_on_texts(y_train)
#convert text into sequence of integers
#where the integer will be the index of that word
x_train= in_tokenizer.texts_to_sequences(x_train)
y_train= tr_tokenizer.texts_to_sequences(y_train)
#pad array of 0's if the length is less than the maximum length
en_in_data= pad_sequences(x_train, maxlen=max_in_len, padding='post')
dec_data= pad_sequences(y_train, maxlen=max_tr_len, padding='post')
#decoder input data will not include the last word
#i.e. 'eos' in decoder input data
dec_in_data = dec_data[:,:-1]
#decoder target data is shifted one time step ahead:
#it drops the first word, i.e. 'sos'
dec_tr_data = dec_data.reshape(len(dec_data),max_tr_len,1)[:,1:]
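#example shapes, assuming a hypothetical max_tr_len of 10 and N samples:
#  dec_data     (N, 10)    full padded target sequences
#  dec_in_data  (N, 9)     teacher-forcing inputs, last position dropped
#  dec_tr_data  (N, 9, 1)  targets shifted left by one, 'sos' dropped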
K.clear_session()
latent_dim = 500
#encoder input: padded integer sequences of length max_in_len
en_inputs = Input(shape=(max_in_len,))
en_embedding = Embedding(num_in_words+1, latent_dim)(en_inputs)
#create 3 stacked LSTM layers, each with latent_dim hidden units
#LSTM 1
en_lstm1= LSTM(latent_dim, return_state=True, return_sequences=True)
en_outputs1, state_h1, state_c1= en_lstm1(en_embedding)
#LSTM2
en_lstm2= LSTM(latent_dim, return_state=True, return_sequences=True)
en_outputs2, state_h2, state_c2= en_lstm2(en_outputs1)
#LSTM3
en_lstm3= LSTM(latent_dim,return_sequences=True,return_state=True)
en_outputs3 , state_h3 , state_c3= en_lstm3(en_outputs2)
#encoder states
en_states= [state_h3, state_c3]
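#only the final hidden and cell states of the third LSTM initialize the
#decoder; en_outputs3 is kept separately to feed the attention layer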
# Decoder.
dec_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_tr_words+1, latent_dim)
dec_embedding = dec_emb_layer(dec_inputs)
#initialize decoder's LSTM layer with the output states of encoder
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_outputs, *_ = dec_lstm(dec_embedding,initial_state=en_states)
#Attention layer
attention =Attention()
attn_out = attention([dec_outputs,en_outputs3])
#concatenate the attention output with the decoder outputs
merge=Concatenate(axis=-1, name='concat_layer1')([dec_outputs,attn_out])
#Dense layer (output layer)
dec_dense = Dense(num_tr_words+1, activation='softmax')
dec_outputs = dec_dense(merge)
#build the Model and print its summary
model = Model([en_inputs, dec_inputs], dec_outputs)
model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(
    optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(
    [en_in_data, dec_in_data],
    dec_tr_data,
    batch_size=512,
    epochs=10,
    validation_split=0.1,
)
#Save model
model.save("s2s")
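#with TF2's Keras, a path without a file extension is written in the
#SavedModel directory format, which models.load_model restores below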
# encoder inference
latent_dim=500
#load the model
model = models.load_model("s2s")
#build the encoder inference model from the output of layer 6,
#i.e. the last encoder LSTM layer
en_outputs,state_h_enc,state_c_enc = model.layers[6].output
en_states=[state_h_enc,state_c_enc]
#the encoder model maps the input sequence to its outputs and final states
en_model = Model(model.input[0],[en_outputs]+en_states)
# decoder inference
#inference-time Input objects for the decoder's previous hidden and cell
#states and for the encoder outputs the attention layer attends over
dec_state_input_h = Input(shape=(latent_dim,))
dec_state_input_c = Input(shape=(latent_dim,))
dec_hidden_state_input = Input(shape=(max_in_len,latent_dim))
# Get the embeddings and input layer from the model
dec_inputs = model.input[1]
dec_emb_layer = model.layers[5]
dec_lstm = model.layers[7]
dec_embedding= dec_emb_layer(dec_inputs)
#run the decoder LSTM, initialized with the incoming state inputs
dec_outputs2, state_h2, state_c2 = dec_lstm(dec_embedding, initial_state=[dec_state_input_h,dec_state_input_c])
#Attention layer
attention = model.layers[8]
attn_out2 = attention([dec_outputs2,dec_hidden_state_input])
merge2 = Concatenate(axis=-1)([dec_outputs2, attn_out2])
#Dense layer
dec_dense = model.layers[10]
dec_outputs2 = dec_dense(merge2)
#finally define the inference decoder model
dec_model = Model(
    [dec_inputs] + [dec_hidden_state_input,dec_state_input_h,dec_state_input_c],
    [dec_outputs2] + [state_h2, state_c2])
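#at inference the decoder runs one word at a time: it takes the previous
#word and the previous states, and returns new states that are fed back
#in on the next call (see decode_sequence below)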
#dictionaries mapping word indices to words and vice versa
reverse_target_word_index = tr_tokenizer.index_word
reverse_source_word_index = in_tokenizer.index_word
target_word_index = tr_tokenizer.word_index
reverse_target_word_index[0]=' '
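#the Tokenizer reserves index 0 for padding and assigns it no word, so
#map it to a blank to avoid a KeyError while decoding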
def decode_sequence(input_seq):
    #get the encoder outputs and states for the input sequence
    en_out, en_h, en_c= en_model.predict(input_seq)
    #start the target sequence with the initial word 'sos'
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sos']
    #stop the loop once the end of the text is reached
    stop_condition = False
    #append every predicted word to the decoded sentence
    decoded_sentence = ""
    while not stop_condition:
        #get the predicted output, hidden state and cell state
        output_words, dec_h, dec_c= dec_model.predict([target_seq] + [en_out,en_h, en_c])
        #take the most likely index and look up its word in the dictionary
        word_index = np.argmax(output_words[0, -1, :])
        text_word = reverse_target_word_index[word_index]
        decoded_sentence += text_word +" "
        #exit condition: hit the maximum length or predict the stop word
        if text_word == "eos" or len(decoded_sentence.split()) > max_tr_len:
            stop_condition = True
        #feed the predicted word index and states back in for the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_index
        en_h, en_c = dec_h, dec_c
    #return the decoded sentence
    return decoded_sentence
inp_review = input("Enter : ")
print("Review :",inp_review)
inp_review = clean(inp_review,"inputs")
inp_review = ' '.join(inp_review)
inp_x= in_tokenizer.texts_to_sequences([inp_review])
inp_x= pad_sequences(inp_x, maxlen=max_in_len, padding='post')
summary=decode_sequence(inp_x.reshape(1,max_in_len))
if 'eos' in summary:
    summary=summary.replace('eos','')
print("\nPredicted summary:",summary,"\n")