-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_title_paragraph.py
40 lines (35 loc) · 1.43 KB
/
search_title_paragraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import linear_kernel
import sys
import search_title
# Path prefix of the parsed papers. paragraph_rank concatenates it directly
# with "test.json", so a non-empty value must end with a path separator.
path_parsed = "./test/"
# path_parsed = ""  # alternative: open "test.json" relative to the working directory
def most_related_paragraph(title, paragraph_list):
    """Return the paragraph that is most similar to the title.

    The title and all paragraphs are vectorized together with a default
    ``TfidfVectorizer``; every paragraph is then scored against the title
    with ``linear_kernel`` and the highest-scoring paragraph is returned.

    Args:
        title: Text the paragraphs are compared against.
        paragraph_list: List of candidate paragraph strings.

    Returns:
        The paragraph with the highest similarity score; on ties the
        earliest paragraph wins. An empty string is returned when
        ``paragraph_list`` is empty.
    """
    if not paragraph_list:
        # Nothing to rank, so skip vectorizing and indexing entirely.
        return ""
    tfidf = TfidfVectorizer().fit_transform([title] + paragraph_list)
    # Row 0 is the title; rows 1.. line up with paragraph_list.
    cosine_similarities = linear_kernel(tfidf[0], tfidf[1:]).flatten()
    # argmax yields the first maximum, same as a strict ">" scan from index 0.
    return paragraph_list[int(cosine_similarities.argmax())]
def paragraph_rank(string, top_num):
    """Return the most related paragraph for each title matching the query.

    Feature 2: the parsed-paper JSON is loaded, candidate titles are obtained
    from ``search_title.query`` and, for every title present in the JSON
    data, the paragraph most related to that title is picked.

    Args:
        string: Query text forwarded to ``search_title.query``.
        top_num: Second argument forwarded to ``search_title.query``
            (presumably the number of titles to return; confirm against
            that module).

    Returns:
        A list of dicts with the keys ``"title"``, ``"paragraph"``,
        ``"author"`` and ``"link"``, in the order the titles were returned.
        Titles not found in the JSON data are skipped.
    """
    # The context manager closes the file even when JSON parsing fails.
    with open(path_parsed + "test.json", 'r') as f:
        data = json.load(f)

    ret_val = []  # list of dictionary of title and info
    for title in search_title.query(string, top_num):
        if title not in data:
            # Retry with surrounding whitespace removed before giving up.
            title = title.strip()
            if title not in data:
                continue
        entry = data[title]
        paragraph = most_related_paragraph(title, entry["paragraph"])
        ret_val.append({
            "title": title,
            "paragraph": paragraph,
            "author": entry["author"],
            "link": entry["link"],
        })
    return ret_val
if __name__ == '__main__':
    # Command line: argv[1] is the query string, argv[2] the integer top_num.
    # The ranked results are printed to stdout as a JSON document.
    query_text = sys.argv[1]
    result_limit = int(sys.argv[2])
    print(json.dumps(paragraph_rank(query_text, result_limit)))