-
Notifications
You must be signed in to change notification settings - Fork 26
/
recommend_util.py
105 lines (76 loc) · 4.26 KB
/
recommend_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Decided functionality: a function that takes a course-id (string) as input
# and returns a list of recommended course-ids.
# Procedure for handling inputs:
# 1. Load the previously trained clustering model. The model is already trained; don't train it again.
# 2. Give cluster labels/categories to all 8 clusters formed. This is a mapping-like procedure, done manually here.
# Note:
# Limitation: Categories with fewer courses got diluted into the clusters that are currently formed.
# Mitigation: Use another algorithm such as SVD, or add more clusters so that those categories can be detected.
# 3. Assign labels to all live courses that are not retired & store them in a data-frame.
# Advantage: These labels are dynamic & dependent on model. Hence, with better models better labels can get assigned.
# 4. Receive input from the user as a string and predict its cluster-category.
# 5. Recommend 10 [or 'n'] random courses from the same category as the one predicted for the user's input.
# import statements
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
# 1. Load the model and rebuild the preprocessing that goes with it.
# The model is loaded only once, at import time.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# 'finalized_model.sav' must come from a trusted source.
with open('finalized_model.sav', 'rb') as fid:
    model = pickle.load(fid)
# X = vectorizer.fit_transform(course_df['InputString'])
# The line above would give an "incorrect number of features" error, i.e. when
# features built from a different data-frame are used with the trained model.
# Separate code snippet for building the vocabulary for the trained model.
# NOTE(review): presumably these steps mirror the preprocessing used at
# training time -- confirm against the training script before changing them.
courses_df = pd.read_csv("data/courses.csv")
# Keep only rows that have no missing value in any column.
courses_df = courses_df.dropna(how='any')
# Replace every "'ll" in the descriptions with a single space.
courses_df['Description'] = courses_df['Description'].replace({"'ll": " "}, regex=True)
# Turn the hyphens of the course id into spaces.
courses_df['CourseId'] = courses_df['CourseId'].replace({"-": " "}, regex=True)
# One text document per course: "<CourseId> <CourseTitle> <Description>".
comb_frame = courses_df.CourseId.str.cat(" "+courses_df.CourseTitle.str.cat(" "+courses_df.Description))
# Remove every run of characters that is not an ASCII letter, digit or space.
comb_frame = comb_frame.replace({"[^A-Za-z0-9 ]+": ""}, regex=True)
# Fit the TF-IDF vocabulary on the combined text; the fitted `vectorizer` is
# what cluster_predict() uses to transform its input.
vectorizer = TfidfVectorizer(stop_words='english')
# X is not referenced again in the visible code; the call is what fits `vectorizer`.
X = vectorizer.fit_transform(comb_frame)
# 2. Current utility variable and data frame preprocessing
# Verbose only, not used in the code: human-readable labels for the clusters,
# assigned manually.
label_dict = {
    0: "Gaming Professionals",
    1: "Manufacturing & Design",
    2: "Software Development",
    3: "Data Professionals",
    4: "Information & Cyber Security",
    5: "Movie Making & Animation",
    6: "IT Ops",
    7: "Graphic Design",
}

# Load the complete data in a dataframe.
course_df = pd.read_csv("data/courses.csv")
# Drop retired courses from the analysis; courses with no description are kept.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
course_df = course_df[course_df.IsCourseRetired == 'no'].copy()
# Create a new column combining (CourseId, CourseTitle, Description).
# Missing titles/descriptions are replaced by "" first: str.cat propagates NaN,
# and a NaN document makes the TF-IDF vectorizer raise when it is transformed.
course_df['InputString'] = course_df.CourseId.str.cat(
    " " + course_df.CourseTitle.fillna("").str.cat(
        " " + course_df.Description.fillna("")
    )
)
# Placeholder column; filled with the predicted cluster of each course later.
course_df['ClusterPrediction'] = ""
def cluster_predict(str_input):
    """Predict the cluster of one or more text documents.

    Args:
        str_input: Either a single string (treated as one document) or an
            iterable of strings such as a list or a pandas Series.

    Returns:
        The result of ``model.predict`` on the TF-IDF transformed documents,
        i.e. one prediction per document.
    """
    # A bare string must be wrapped, not passed to list(): list("abc") yields
    # ['a', 'b', 'c'], so every character would be vectorized as its own
    # document.
    if isinstance(str_input, str):
        documents = [str_input]
    else:
        documents = list(str_input)
    Y = vectorizer.transform(documents)
    prediction = model.predict(Y)
    return prediction
# Cluster category for each live course.
# Predict once for the whole InputString column and store the result directly:
# cluster_predict returns one prediction per row, in row order. The previous
# DataFrame.apply(..., axis=0) form re-ran this same whole-column prediction
# once per column and produced a multi-column frame instead of a single column.
course_df['ClusterPrediction'] = cluster_predict(course_df['InputString'])
def recommend_util(str_input, n=10):
    """Recommend random live courses from the same cluster as a given course.

    Args:
        str_input: CourseId of the course to base the recommendations on.
        n: Maximum number of course ids to return. Defaults to 10.

    Returns:
        A list of at most ``n`` CourseId values drawn at random from the rows
        of ``course_df`` whose ClusterPrediction equals the cluster predicted
        for the given course, excluding the given course itself. The list is
        empty when ``str_input`` is not present in ``course_df`` (for example
        a retired or unknown course) or when no other course shares its
        cluster.
    """
    # Reuse the InputString column already stored in course_df instead of
    # rebuilding it on a slice of the frame.
    matches = course_df.loc[course_df['CourseId'] == str_input, 'InputString']
    if matches.empty:
        # Nothing to predict from: unknown (or filtered-out) course id.
        return []

    # Predict from the first matching row only and index the result, rather
    # than calling int() on the whole prediction array.
    predicted_cluster = int(cluster_predict([matches.iloc[0]])[0])

    same_cluster = course_df.loc[
        (course_df['ClusterPrediction'] == predicted_cluster)
        & (course_df['CourseId'] != str_input)
    ]
    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # sample size at the number of available candidates.
    sample_size = min(max(n, 0), len(same_cluster))
    return list(same_cluster.sample(sample_size)['CourseId'])
if __name__ == '__main__':
    # Manual smoke run: print the recommendations for a few sample course ids.
    sample_course_ids = (
        'wp7-core',
        'ef41-data-access',
        'nosql-big-pic',
        'procedural-ice-modeling-softimage-153',
        'beginners-guide-shading-networks-softimage-510',
        'centralized-logging-elastic-stack',
        'apache-pig-data-transformations',
    )
    for course_id in sample_course_ids:
        print(recommend_util(course_id))