-
Notifications
You must be signed in to change notification settings - Fork 2
/
recommend.py
137 lines (102 loc) · 3.7 KB
/
recommend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import json
from scipy import spatial
import nltk
from nltk.corpus import stopwords
import numpy as np
# Load the per-course records (keyed by course code) and the mapping that
# `courses_id_to_name` holds. Context managers make sure both file handles are
# closed as soon as parsing finishes instead of being left to the garbage
# collector, and the explicit encoding keeps non-ASCII text (the cleaning step
# below handles a bullet character) from depending on the platform default.
with open('courses.json', 'r', encoding='utf-8') as courses_file:
    data = json.load(courses_file)
courses = list(data.keys())
with open('subjects.json', 'r', encoding='utf-8') as subjects_file:
    courses_id_to_name = json.load(subjects_file)
import string

# Build the stop-word set once: calling stopwords.words('english') for every
# token re-creates the whole list each time and then scans it linearly.
english_stop_words = set(stopwords.words('english'))

# One translation table that turns every punctuation character (plus the
# bullet) into a space, applied in a single pass per token.
punct_to_space = str.maketrans({ch: " " for ch in string.punctuation + '•'})

# Replace each course record by the cleaned list of words from its syllabus.
for course in courses:
    tokens = [
        w.lower()
        for w in nltk.word_tokenize(data[course]['syllabus'])
        # Compare the lower-cased token: the stop-word list is lower-case, so
        # testing the raw token let capitalised stop words through.
        if len(w) != 1 and w.lower() not in english_stop_words
    ]
    # Tokens that contained punctuation now contain spaces; join and re-split
    # so every fragment becomes its own word. split() without an argument
    # drops the empty strings that split(" ") used to leave in the word list.
    data[course] = " ".join(
        token.translate(punct_to_space) for token in tokens
    ).split()
# Remove every course whose cleaned word list has fewer than five entries,
# printing the code of each course that gets dropped.
to_del = [pos for pos, code in enumerate(courses) if len(data[code]) < 5]
for pos in to_del:
    print (courses[pos])
    del data[courses[pos]]

# Delete from the highest position downwards so that the remaining positions
# recorded in to_del still point at the right list entries.
to_del.reverse()
for pos in to_del:
    del courses[pos]
# Vocabulary: every distinct word that appears in any remaining course.
# Sorting makes the column order reproducible between runs instead of
# depending on set iteration order.
corpora = sorted({word for words in data.values() for word in words})

# Word -> column lookup; replaces a linear corpora.index() scan per token.
word_to_column = {word: column for column, word in enumerate(corpora)}

# Binary course-by-word matrix: entry (i, j) is 1 when course i uses word j.
matrix = np.zeros((len(data), len(corpora)))
for course_ind, course_code in enumerate(courses):
    for word in data[course_code]:
        matrix[course_ind, word_to_column[word]] = 1
# Centre each row of the course-by-word matrix on its own mean, then scale the
# transpose by sqrt(number_of_rows - 1) before decomposing it.
normalised_mat = matrix - matrix.mean(axis=1, keepdims=True)
A = normalised_mat.T / np.sqrt(matrix.shape[0] - 1)

# Only the right singular vectors are used below. The reduced decomposition
# avoids allocating the square (words x words) left factor that the full SVD
# would build and that was thrown away immediately.
U, S, V = np.linalg.svd(A, full_matrices=False)

# Release the large intermediates; nothing after this point reads them.
del U, S, A, normalised_mat

# One row per course, keeping at most the first 100 components.
# NOTE: with the reduced SVD there are min(words, courses) components, so the
# slice yields fewer than 100 columns when either count is below 100.
sliced = V.T[:, :100]
def find_top_k(to_find_sim, sliced, k, different_dep = False):
    """Return the k courses most similar to the course at index to_find_sim.

    Similarity is ``1 - cosine distance`` between rows of ``sliced``.

    Args:
        to_find_sim: Index into the module-level ``courses`` list (and the
            matching row of ``sliced``) of the reference course.
        sliced: Indexable collection of course vectors, one per entry of
            ``courses``.
        k: Maximum number of results to return.
        different_dep: When true, skip every course whose code starts with
            the same two characters as the reference course (going by the
            parameter name, those two characters identify the department).

    Returns:
        A list of at most k ``[similarity, course_code]`` lists, ordered from
        most to least similar. The reference course itself is never included.
        The list is empty when k is 0.
    """
    import heapq  # local import keeps this function self-contained

    reference_vec = sliced[to_find_sim]
    reference_prefix = courses[to_find_sim][0:2]

    scored = []
    for course_ind, course_code in enumerate(courses):
        if course_ind == to_find_sim:
            continue
        if different_dep and course_code[0:2] == reference_prefix:
            continue
        similarity = 1 - spatial.distance.cosine(sliced[course_ind], reference_vec)
        scored.append([similarity, course_code])

    # nlargest returns the results already sorted best-first; the previous
    # hand-rolled selection re-sorted on every step yet could still return
    # its last slot out of order.
    return heapq.nlargest(k, scored, key=lambda pair: pair[0])
def find_top_k_vec(vec, sliced, k):
    """Return the k courses whose vectors are most similar to ``vec``.

    Similarity is ``1 - cosine distance`` between ``vec`` and each row of
    ``sliced``.

    Args:
        vec: Query vector, comparable with the rows of ``sliced``.
        sliced: Indexable collection of course vectors, one per entry of the
            module-level ``courses`` list.
        k: Maximum number of results to return.

    Returns:
        A list of at most k ``[similarity, course_code]`` lists, ordered from
        most to least similar. The list is empty when k is 0.
    """
    import heapq  # local import keeps this function self-contained

    scored = [
        [1 - spatial.distance.cosine(sliced[course_ind], vec), course_code]
        for course_ind, course_code in enumerate(courses)
    ]

    # nlargest returns the results already sorted best-first; the previous
    # hand-rolled selection re-sorted on every step yet could still return
    # its last slot out of order.
    return heapq.nlargest(k, scored, key=lambda pair: pair[0])
def print_top_k(course_name, sliced, k, print_=True, different_dep = False):
    """Find, label and optionally print the courses most similar to one course.

    Args:
        course_name: Course code; must be present in the module-level
            ``courses`` list.
        sliced: Course vectors, passed through to ``find_top_k``.
        k: Number of similar courses to request.
        print_: When true, print a heading followed by the result list.
        different_dep: Passed through to ``find_top_k``.

    Returns:
        The list produced by ``find_top_k``, where each course code that has
        an entry in ``courses_id_to_name`` is rewritten as ``"code - name"``.

    Raises:
        ValueError: If ``course_name`` is not in ``courses``.
    """
    # pprint is not imported at file level, so bring it into scope here;
    # without this the printing branch fails with NameError.
    from pprint import pprint

    # Honour the caller's k rather than a fixed count of 5.
    tmp = find_top_k(courses.index(course_name), sliced, k, different_dep)
    for entry in tmp:
        course_code = entry[1]
        if course_code in courses_id_to_name:
            entry[1] = course_code + " - " + courses_id_to_name[course_code]
    if print_:
        this_name = courses_id_to_name.get(course_name, "")
        print ("Most similar courses to ", course_name, " ", this_name)
        pprint (tmp)
    return tmp
def print_top_k_vec(vec, sliced, k, print_=True):
    """Find, label and optionally print the courses most similar to a vector.

    Args:
        vec: Query vector, passed through to ``find_top_k_vec``.
        sliced: Course vectors, passed through to ``find_top_k_vec``.
        k: Number of similar courses to request.
        print_: When true, print a heading followed by the result list.

    Returns:
        The list produced by ``find_top_k_vec``, where each course code that
        has an entry in ``courses_id_to_name`` is rewritten as
        ``"code - name"``.
    """
    # pprint is not imported at file level, so bring it into scope here;
    # without this the printing branch fails with NameError.
    from pprint import pprint

    # Honour the caller's k rather than a fixed count of 5.
    tmp = find_top_k_vec(vec, sliced, k)
    for entry in tmp:
        course_code = entry[1]
        if course_code in courses_id_to_name:
            entry[1] = course_code + " - " + courses_id_to_name[course_code]
    if print_:
        print ("Most similar courses are ")
        pprint (tmp)
    return tmp
# Print the similar-course lists for two sample course codes.
for sample_code in ('MA21007', 'CS11001'):
    print_top_k(sample_code, sliced, 5)
def vec(course_name, sliced):
    """Return the row of ``sliced`` belonging to the course ``course_name``.

    The row is located through the position of ``course_name`` in the
    module-level ``courses`` list; ``list.index`` raises ``ValueError`` when
    the code is not present.
    """
    row = courses.index(course_name)
    return sliced[row]
# Fetch the vectors of two courses and take half of their difference.
# NOTE(review): `v` is not read by any later statement in this file; it looks
# like it was meant to be passed to print_top_k_vec -- confirm the intent.
v_m, v_c = (vec(code, sliced) for code in ('HS20001', 'EP60008'))
v = (v_c - v_m) / 2

# Similar courses for one more sample code, skipping codes that share its
# two-character prefix.
print_top_k('IM21003', sliced, 5, different_dep=True)