searchEngine.py
import nltk
import heapq
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


def conjunction_search(courses_df, vocabulary, inverted_index, query):
    # Tokenize and preprocess the query with the same steps used for the documents
    query = query.lower().split()
    stemmer = PorterStemmer()
    query = [stemmer.stem(word) for word in query]
    # Start from the document indices posted under the first query term
    matching_indices = set()
    first_term_id = vocabulary.get(query[0])
    if first_term_id is not None:
        matching_indices = set(inverted_index.get(str(first_term_id), []))
    # For each subsequent term, intersect its postings with the current matches;
    # terms missing from the vocabulary are skipped
    for term in query[1:]:
        term_id = vocabulary.get(term)
        if term_id is not None:
            matching_indices.intersection_update(inverted_index.get(str(term_id), []))
    # Build a new DataFrame with the matching rows
    matching_df = courses_df.iloc[list(matching_indices)]
    return matching_df
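

# A minimal, hypothetical usage sketch (not part of the original module): it assumes
# `vocabulary` maps each stemmed term to an integer term id and `inverted_index`
# maps str(term_id) to the row positions in courses_df that contain the term.
# The toy courses below are illustrative only.
def _demo_conjunction_search():
    stemmer = PorterStemmer()
    courses_df = pd.DataFrame({
        'courseName': ['Data Mining', 'Data Engineering', 'Art History'],
        'universityName': ['Uni A', 'Uni B', 'Uni C'],
        'description': ['mining large data sets', 'building data pipelines', 'renaissance painting'],
        'url': ['https://example.org/a', 'https://example.org/b', 'https://example.org/c'],
    })
    vocabulary, inverted_index = {}, {}
    for row, text in enumerate(courses_df['description']):
        for term in {stemmer.stem(w) for w in text.lower().split()}:
            term_id = vocabulary.setdefault(term, len(vocabulary))
            inverted_index.setdefault(str(term_id), []).append(row)
    # 'data' occurs in the first two descriptions, so both rows are returned
    return conjunction_search(courses_df, vocabulary, inverted_index, 'data')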


def tfidf_conjunction_search_topk(courses_df, vocabulary, inverted_index, query, vectorizer, X, k):
    # Tokenize and preprocess the query with the same steps used for the documents
    query = query.lower().split()
    stemmer = PorterStemmer()
    query = [stemmer.stem(word) for word in query]
    # Start from the document indices posted under the first query term
    # (each posting here is an (index, tf-idf weight) pair)
    matching_indices = []
    first_term_id = vocabulary.get(query[0])
    if first_term_id is not None:
        matching_indices = [idx for idx, _ in inverted_index.get(str(first_term_id), [])]
    # For each subsequent term, intersect its postings with the current matches
    for term in query[1:]:
        term_id = vocabulary.get(term)
        if term_id is not None:
            term_indices = [idx for idx, _ in inverted_index.get(str(term_id), [])]
            matching_indices = list(set(matching_indices) & set(term_indices))
    # Transform the query once, then keep the top-k documents in a min-heap keyed by similarity
    query_vector = vectorizer.transform([' '.join(query)])
    heap = []
    for idx in matching_indices:
        # Skip documents whose 'description' is NaN
        description = courses_df['description'].iloc[idx]
        if pd.notna(description):
            similarity_score = cosine_similarity(query_vector, X[idx])[0][0]
            # Push the document with its similarity score onto the heap
            heapq.heappush(heap, (similarity_score, idx))
            # If the heap size exceeds k, drop the current minimum
            if len(heap) > k:
                heapq.heappop(heap)
    # Sort the surviving documents by similarity, highest first
    topk = sorted(heap, key=lambda x: x[0], reverse=True)
    topk_indices = [idx for _, idx in topk]
    # Build a new DataFrame with the top-k matching rows and their similarity scores
    topk_matching_df = courses_df.iloc[topk_indices].copy()
    topk_matching_df['SimilarityScore'] = [score for score, _ in topk]
    # Return only the columns of interest
    return topk_matching_df[['courseName', 'universityName', 'description', 'url', 'SimilarityScore']]
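

# A hedged end-to-end sketch with the same hypothetical toy data: the tf-idf variant
# additionally assumes a TfidfVectorizer fitted on the preprocessed descriptions, its
# document-term matrix X (one row per course), and an inverted index whose postings
# are (row position, tf-idf weight) pairs.
def _demo_tfidf_search(k=2):
    stemmer = PorterStemmer()
    courses_df = pd.DataFrame({
        'courseName': ['Data Mining', 'Data Engineering', 'Art History'],
        'universityName': ['Uni A', 'Uni B', 'Uni C'],
        'description': ['mining large data sets', 'building data pipelines', 'renaissance painting'],
        'url': ['https://example.org/a', 'https://example.org/b', 'https://example.org/c'],
    })
    processed = [' '.join(stemmer.stem(w) for w in d.lower().split()) for d in courses_df['description']]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(processed)
    vocabulary, inverted_index = {}, {}
    for row, text in enumerate(processed):
        for term in set(text.split()):
            if term not in vectorizer.vocabulary_:
                continue
            term_id = vocabulary.setdefault(term, len(vocabulary))
            weight = X[row, vectorizer.vocabulary_[term]]
            inverted_index.setdefault(str(term_id), []).append((row, weight))
    return tfidf_conjunction_search_topk(courses_df, vocabulary, inverted_index, 'data', vectorizer, X, k)


if __name__ == "__main__":
    print(_demo_conjunction_search())
    print(_demo_tfidf_search())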