# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.3.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---
# # Name : Samujjwaal Dey
#
# ## CS 582 Information Retrieval : Homework 2
# ## Imports and Initializations
# Importing dependency libraries
import os
import pandas as pd
import numpy as np
import re
import math as m
from collections import Counter
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_list = stopwords.words('english')
# sorted(stop_list)
# +
# Declaring variables for file path
in_path = 'cranfieldDocs'
out_path = 'preprocessed_cranfieldDocs'
# Declaring variables for query files
query = 'queries.txt'
preproc_query = 'preprocessed_queries.txt'
# Declaring variable for file with query relevance values
relevance = 'relevance.txt'
# Checking if the preprocessed docs folder exists already
if not os.path.isdir(out_path):
    os.mkdir(out_path)
# Getting all filenames from the docs folder
filenames = os.listdir(in_path)
# print(filenames)
# Initializing Porter Stemmer object
st = PorterStemmer()
# Regex to remove words that are one or two characters long
shortword = re.compile(r'\W*\b\w{1,2}\b')
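# Quick check of the shortword regex (illustrative): one- and two-letter words
# are stripped along with the whitespace that precedes them
shortword.sub('', 'flow of air at high speed')   # -> 'flow air high speed'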
# -
# ## Preprocessing the documents and queries file
def tokenize(data):
    """Preprocess the input string: convert to lower case, remove
    punctuation and numbers, split on whitespace, remove stopwords,
    apply stemming, and drop words that are one or two characters long.

    Arguments:
        data {string} -- string to be tokenized

    Returns:
        string -- string of tokens generated
    """
    # converting to lower case
    lines = data.lower()
    # removing punctuation and digits using a regular expression
    lines = re.sub('[^A-Za-z]+', ' ', lines)
    # splitting on whitespace to generate tokens
    tokens = lines.split()
    # removing stop words from the tokens
    clean_tokens = [word for word in tokens if word not in stop_list]
    # stemming the tokens
    stem_tokens = [st.stem(word) for word in clean_tokens]
    # removing stopwords again, since stemming can produce new stopword matches
    clean_stem_tokens = [word for word in stem_tokens if word not in stop_list]
    # converting the list of tokens back to a string
    clean_stem_tokens = ' '.join(map(str, clean_stem_tokens))
    # removing tokens that are one or two characters long
    clean_stem_tokens = shortword.sub('', clean_stem_tokens)
    return clean_stem_tokens
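# Quick sanity check of tokenize() on a made-up sentence (illustrative; the
# exact output depends on the NLTK stopword list and stemmer version):
tokenize('The experimental investigation of boundary layers!')
# expected: something like 'experiment investig boundari layer'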
def extractTokens(beautSoup, tag):
    """Extract tokens from the text between a specific SGML <tag>. The function
    calls tokenize() to generate tokens from the extracted text.

    Arguments:
        beautSoup {bs4.BeautifulSoup} -- BeautifulSoup object built from the text of a file
        tag {string} -- target SGML <tag>

    Returns:
        string -- string of tokens extracted from the text between the target SGML <tag>
    """
    # extract the text of a particular SGML <tag>
    textData = beautSoup.findAll(tag)
    # converting to string
    textData = ''.join(map(str, textData))
    # remove the SGML <tag> name from the text (the leftover angle brackets
    # are stripped later by tokenize())
    textData = textData.replace(tag, '')
    # calling tokenize() to generate tokens from the text
    textData = tokenize(textData)
    return textData
# +
# Preprocessing all the documents in the cranfieldDocs directory
"""This cell might take about 20 seconds to run."""
for fname in filenames:
    # generate input and output file paths
    infilepath = in_path + '/' + fname
    outfilepath = out_path + '/' + fname
    with open(infilepath) as infile:
        with open(outfilepath, 'w') as outfile:
            # read all text in the file
            fileData = infile.read()
            # creating a BeautifulSoup object to extract text between SGML tags
            soup = BeautifulSoup(fileData, 'html.parser')
            # extract tokens for <title>
            title = extractTokens(soup, 'title')
            # extract tokens for <text>
            text = extractTokens(soup, 'text')
            # write tokens for <title> into the new file
            outfile.write(title)
            outfile.write(" ")
            # write tokens for <text> into the new file
            outfile.write(text)
# +
# Preprocessing the queries.txt file
# opening the queries file and a new file to write the preprocessed tokens into
with open(query) as q, open(preproc_query, 'w') as new_q:
    # read each line of the file separately
    text = q.readlines()
    for line in text:
        query_tokens = tokenize(line)
        # avoid writing a trailing newline after the last query
        if line != text[-1]:
            new_q.write(query_tokens + '\n')
        else:
            new_q.write(query_tokens)
# +
# Generate a single list of all preprocessed docs
all_docs = []
for fname in filenames:
    outfilepath = out_path + '/' + fname
    with open(outfilepath) as file:
        fileData = file.read()
        # append file data to the list
        all_docs.append(fileData)
# print(all_docs)
# -
# total number of documents is 1400
no_of_docs = len(all_docs)
no_of_docs
# # Task 1
# ## Calculating df values for each term in the vocabulary
# +
# Build a dictionary where each token is a key and the value is the set of
# documents the token occurs in (later collapsed to a count, i.e. the df)
DF = {}
for i in range(no_of_docs):
    tokens = all_docs[i].split()
    for w in tokens:
        try:
            # add the current doc number to the token's set of documents
            DF[w].add(i)
        except KeyError:
            # handle the first occurrence of a new token
            DF[w] = {i}
for i in DF:
    # replace each set of documents with its size, i.e. the document frequency
    DF[i] = len(DF[i])
# -
print(DF)
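# Illustrative lookup (the term is hypothetical; the value depends on the corpus):
# number of documents containing a given stemmed term
DF.get(st.stem('pressure'), 0)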
# count number of unique words in the corpus
vocab_size = len(DF)
print(vocab_size)
# create vocabulary list of all unique words
vocab = [term for term in DF]
print(vocab)
# ## Calculating tf-idf values for each term in the vocabulary
# +
doc = 0
# dictionary to store the tf-idf value of each (document, term) pair
tf_idf = {}
for i in range(no_of_docs):
    tokens = all_docs[i].split()
    # Counter object to efficiently count term occurrences in the document
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        # term frequency: occurrences of the term normalized by document length
        tf = counter[token]/words_count
        # retrieving the df value from the DF dictionary
        df = DF[token] if token in DF else 0
        # adding 1 to numerator & denominator to avoid division by zero
        idf = np.log((no_of_docs+1)/(df+1))
        tf_idf[doc, token] = tf*idf
    doc += 1
# -
# print(tf_idf)
tf_idf
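# Worked example of the weighting above (numbers are hypothetical): a term that
# occurs 3 times in a 100-token document and appears in 140 of the 1400 documents
example_tf = 3 / 100                           # 0.03
example_idf = np.log((1400 + 1) / (140 + 1))   # ~2.30
example_tf * example_idf                       # ~0.069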
# ## Forming document vectors using the tf-idf values
# +
# initializing a zero matrix with one row per document and one column per vocabulary term
D = np.zeros((no_of_docs, vocab_size))
# filling the matrix with the tf-idf values
for i in tf_idf:
    ind = vocab.index(i[1])
    D[i[0]][ind] = tf_idf[i]
# -
# len(D)
print(D)
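# Sanity check: each row of D is one document vector, so the shape
# should be (no_of_docs, vocab_size)
D.shape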
def gen_vector(tokens):
    """Create a vector (with respect to the vocabulary) of the tokens passed as input

    Arguments:
        tokens {list} -- list of tokens to be converted

    Returns:
        numpy.ndarray -- vector of tokens
    """
    Q = np.zeros((len(vocab)))
    counter = Counter(tokens)
    words_count = len(tokens)
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = DF[token] if token in DF else 0
        idf = m.log((no_of_docs+1)/(df+1))
        try:
            ind = vocab.index(token)
            Q[ind] = tf*idf
        except ValueError:
            # token not in the vocabulary, so it has no component in the vector
            pass
    return Q
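# Quick check (illustrative; the tokens are hypothetical stems): out-of-vocabulary
# tokens simply contribute nothing to the resulting vector
gen_vector(['boundari', 'layer', 'notaword'])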
# # Task 2
# ## Calculating cosine similarity between each query vectors and document vectors
def cosine_sim(x, y):
    """Calculate the cosine similarity between 2 vectors.

    Arguments:
        x {numpy.ndarray} -- vector 1
        y {numpy.ndarray} -- vector 2

    Returns:
        numpy.float64 -- cosine similarity between vector 1 & vector 2
    """
    cos_sim = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
    return cos_sim
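# Small numeric check: parallel vectors give 1.0, orthogonal vectors give 0.0
cosine_sim(np.array([1.0, 2.0]), np.array([2.0, 4.0]))   # -> 1.0
cosine_sim(np.array([1.0, 0.0]), np.array([0.0, 3.0]))   # -> 0.0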
def cosine_similarity(k, query):
    """Determine a ranked list of the top k documents in descending order of their
    cosine similarity with the query.

    Arguments:
        k {integer} -- number of top documents to retrieve
        query {string} -- query whose cosine similarity is to be computed against the corpus

    Returns:
        numpy.ndarray -- ids of the top k documents, in descending order of their
        cosine similarity with the query
    """
    tokens = query.split()
    d_cosines = []
    # vectorize the input query tokens
    query_vector = gen_vector(tokens)
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
    if k == 0:
        # k=0 retrieves all documents in descending order
        out = np.array(d_cosines).argsort()[::-1]
    else:
        # retrieve the top k documents in descending order
        out = np.array(d_cosines).argsort()[-k:][::-1]
    return out
# list of all queries from the preprocessed queries file
query_file = open(preproc_query, 'r')
queries = query_file.readlines()
query_file.close()
queries
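# Illustrative usage: ids of the 5 documents most similar to the first query
cosine_similarity(5, queries[0])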
def list_of_docs(k):
    """Generate a ranked list of the top k documents in descending order of their
    cosine similarity with each query. The output is a list of
    (query id, ranked document id list) pairs.
    If k=0 is given as input, a ranking of all documents is returned for each query.

    Arguments:
        k {integer} -- number of top documents to be retrieved

    Returns:
        list -- list of documents in descending order of their cosine similarity
    """
    cos_sims = []
    for i in range(len(queries)):
        cs = [i, cosine_similarity(k, queries[i])]
        cos_sims.append(cs)
    return cos_sims
# to get the ranked list over all documents
no_of_top = 0
list_of_docs(no_of_top)
# ## Calculating precision and recall values for each query
# retrieving relevance judgements from relevance.txt; each line holds a
# whitespace-separated (query id, relevant document id) pair
colnames = ['query', 'relevance']
rel = pd.read_csv(relevance, delim_whitespace=True, names=colnames, header=None)
rel.head(10)
# +
# list of relevant document numbers for a single query
rel_list = []
# list of lists of relevant document numbers, one list per query
query_rel = []
for i in range(1, 11):
    rel_list = rel[rel['query'] == i]['relevance'].to_list()
    # append rel_list to the list of lists query_rel
    query_rel.append(rel_list)
print(query_rel)
# -
def intersection(lst1, lst2):
    """Count the number of common items between 2 lists.

    Arguments:
        lst1 {list} -- list 1
        lst2 {list} -- list 2

    Returns:
        integer -- number of common items between list 1 & list 2
    """
    lst3 = [value for value in lst1 if value in lst2]
    return len(lst3)
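# Tiny check: 2 and 3 are common to both lists
intersection([1, 2, 3], [2, 3, 4])   # -> 2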
# +
top = [10, 50, 100, 500]
# for top 100 docs
no_of_top = top[2]
no_of_top
# -
def calculate_recall(k):
    """Generate the list of recall values for each query for a given value of k.

    Arguments:
        k {integer} -- number of top documents to be retrieved

    Returns:
        list -- list of recall values for each query
    """
    recall = []
    # compute the ranked lists once, rather than once per query
    docs = list_of_docs(k)
    for i in range(len(queries)):
        # number of relevant documents retrieved
        a = intersection(docs[i][1].tolist(), query_rel[i])
        # total number of relevant documents
        b = len(query_rel[i])
        r = a / b
        recall.append(r)
    return recall
# for top 100 docs
calculate_recall(no_of_top)
np.mean(calculate_recall(no_of_top))
# +
def calculate_precision(k):
    """Generate the list of precision values for each query for a given value of k.

    Arguments:
        k {integer} -- number of top documents to be retrieved

    Returns:
        list -- list of precision values for each query
    """
    precision = []
    # compute the ranked lists once, rather than once per query
    docs = list_of_docs(k)
    for i in range(len(queries)):
        # number of relevant documents retrieved
        a = intersection(docs[i][1].tolist(), query_rel[i])
        # total number of documents retrieved
        b = k
        p = a / b
        precision.append(p)
    return precision
# for top 100 docs
calculate_precision(no_of_top)
# -
np.mean(calculate_precision(no_of_top))
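# Worked example (numbers are hypothetical): if 20 of the top 100 retrieved
# documents are relevant and the query has 40 relevant documents in total, then
example_precision = 20 / 100   # precision = 0.2
example_recall = 20 / 40       # recall = 0.5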
# +
# Printing the answers to the questions in Task 2
# Output saved to output.txt file
"""This cell might take about 40 seconds to run."""
for t in top:
    print("Top {0} documents in the rank list".format(t))
    p = calculate_precision(t)
    r = calculate_recall(t)
    for i in range(len(queries)):
        print("Query: {0} \t Pr: {1} \t Re: {2}\n".format(i+1, p[i], r[i]))
    print("Avg Precision: {0}\n".format(np.mean(p)))
    print("Avg Recall: {0}\n\n".format(np.mean(r)))
# -