functions.py
import pickle
import os
import requests
import numpy as np
import pandas as pd
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist


def download_and_open(url, mode="rb", folder="data"):
    """Download the file at `url` into `folder` if it isn't already there, then open it."""
    filename = os.path.basename(url)
    pathname = os.path.join(folder, filename)
    if not os.path.exists(pathname):
        os.makedirs(folder, exist_ok=True)
        response = requests.get(url)
        response.raise_for_status()
        with open(pathname, "wb") as f:
            f.write(response.content)
    return open(pathname, mode)


# Load the preprocessed article DataFrame and attach a Wikipedia URL for each title.
with download_and_open(
    "https://text-ascent.s3-us-west-2.amazonaws.com/clean_df.pkl", "rb"
) as input_file:
    clean_df = pickle.load(input_file)

clean_df["url"] = clean_df["title"].apply(
    lambda title: f'https://en.wikipedia.org/wiki/{title.replace(" ", "_")}'
)


def load_vectorizer(
    pickle_file="https://text-ascent.s3-us-west-2.amazonaws.com/vectorizer.pkl",
):
    """Load the trained TF-IDF vectorizer."""
    with download_and_open(pickle_file, "rb") as f:
        return pickle.load(f)


def load_corpus_vectors(
    pickle_file="https://text-ascent.s3-us-west-2.amazonaws.com/corpus_vectors.pkl",
):
    """Load the precomputed TF-IDF vectors for the article corpus."""
    with download_and_open(pickle_file, "rb") as f:
        return pickle.load(f)


def get_vocab_arr(vec):
    """Return an array mapping each feature index to its vocabulary word."""
    n_features = len(vec.vocabulary_)
    vocab_arr = np.empty(n_features, dtype=object)
    for word, idx in vec.vocabulary_.items():
        vocab_arr[idx] = word
    return vocab_arr


def get_top_k_vector(vector, feature_ranking, k=20):
    """Return only the columns of `vector` for the k highest-ranked features."""
    return vector[:, feature_ranking[:k]]


def top_k_text(text, k=50):
    """Return the k articles most similar to the user's text, sorted by reading score.

    Only five of the returned rows are initially visible (style == ""); the rest
    are hidden so the front end can page through them five at a time.
    """
    vec = load_vectorizer()
    corpus_vectors = load_corpus_vectors()

    # Vectorize the input and rank features by their TF-IDF weight in the sample.
    sample_vector = vec.transform([text]).toarray()
    feature_ranking = np.argsort(sample_vector[0])[::-1]
    vocab_arr = get_vocab_arr(vec)

    # Compare the sample and the corpus on the sample's top-ranked features only.
    distances = cdist(
        get_top_k_vector(sample_vector, feature_ranking),
        get_top_k_vector(corpus_vectors, feature_ranking),
    )
    nearest_article_idxs = np.argsort(distances)
    nearest_articles = clean_df.loc[nearest_article_idxs[0], :]

    # Keep the k nearest articles, ordered from highest to lowest reading score.
    top_k = nearest_articles[:k].copy()
    top_k = top_k.sort_values(["score"], ascending=False)
    top_k["i"] = list(range(k))[::-1]  # fixed rank, independent of later reordering
    top_k["style"] = "display: none"
    top_k.loc[(top_k["i"] >= 22) & (top_k["i"] < 27), "style"] = ""  # reveal rows with i in 22..26
    return top_k
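

# Example usage: a minimal sketch, assuming the S3 pickles above are reachable and
# that clean_df has the "title", "score", and "url" columns used by top_k_text.
# The sample_text string is only an illustrative placeholder.
if __name__ == "__main__":
    sample_text = "The mitochondrion is the powerhouse of the cell."
    results = top_k_text(sample_text, k=50)
    # Show only the five rows that top_k_text marks as visible.
    visible = results[results["style"] == ""]
    print(visible[["title", "score", "url"]])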