diff --git a/caliGraph.py b/caliGraph.py index 6fd2715..9737daa 100755 --- a/caliGraph.py +++ b/caliGraph.py @@ -817,9 +817,9 @@ def genScores(G, books, calcPagerank=True): globMu, globStd = calcRecDist(G, books) if calcPagerank: runPagerank(G) - genGprScores(G, globMu, globStd) - scoreOpinions(G, globMu, globStd) - scoreUnread(G, globMu, globStd) + genGprScores(G, globMu, globStd, 'score', 'std') + #scoreOpinions(G, globMu, globStd) + #scoreUnread(G, globMu, globStd) return globMu, globStd def addImageToNode(node, cache, shape='circularImage'): diff --git a/py/gp.py b/py/gp.py index 4dea0e7..d559881 100644 --- a/py/gp.py +++ b/py/gp.py @@ -8,17 +8,17 @@ from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.base import clone class BookKernel(GenericKernelMixin, Kernel): - def __init__(self, G): - self.baseline_similarity = 0.5 - self.baseline_similarity_bounds = (1e-5, 1) - - self.G = G - self.node2vec = Node2Vec(self.G, dimensions=32, walk_length=16, num_walks=256, workers=8) - self.model = self.node2vec.fit(window=10, min_count=1, batch_words=4) - self.wv = self.model.wv + def __init__(self, wv): + self.wv = wv def _f(self, s1, s2): - return self.wv.similarity(s1, s2) + """ + kernel value between a pair of sequences + """ + s = self.wv.similarity(s1, s2)**2*0.99 + 0.01 + if s <= 0: + print('bad!') + return s def __call__(self, X, Y=None, eval_gradient=False): if Y is None: @@ -26,10 +26,11 @@ class BookKernel(GenericKernelMixin, Kernel): if eval_gradient: return ( - np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[self._f(x, y) for y in Y] for x in X]) ) else: return np.array([[self._f(x, y) for y in Y] for x in X]) + #return np.array(self.wv.n_similarity(X, Y)) def diag(self, X): return self(X) @@ -43,22 +44,32 @@ class BookKernel(GenericKernelMixin, Kernel): return cloned def genGprScores(G, globMu, globStd, scoreName='gpr_score', stdName='gpr_std'): - gpr = GaussianProcessRegressor(kernel=BookKernel(G), random_state=3141) + print('[\] Constructing Vectorizer') + node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8) + print('[\] Fitting Embeddings for Kernel') + model = node2vec.fit(window=8, min_count=1, batch_words=4) + wv = model.wv + print('[\] Constructing Kernel') + kernel = BookKernel(wv) X, y = [], [] for n in G.nodes: node = G.nodes[n] - if node['rating']!=None: + if 'rating' in node and node['rating']!=None: X.append(n) y.append(node['rating']) - gpr.fit(X, y) + print('[\] Fitting GP') + gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y) X = [] for n in G.nodes: node = G.nodes[n] - if node['rating']==None: + if 'rating' in node and node['rating']==None: X.append(n) - y,stds = gpr.predict(X, return_std=True) + print('[\] Inferencing GP') + y, stds = gpr.predict(X, return_std=True) + i=0 for n in G.nodes: node = G.nodes[n] - if node['rating']==None: - y, std = y.pop(0), stds.pop(0) - node[scoreName], node[stdName] = y, std + if 'rating' in node and node['rating']==None: + s, std = y[i], stds[i] + i+=1 + node[scoreName], node[stdName] = s, std