From b10fcac016955d5d67bbc76d142ad6d7ca855403 Mon Sep 17 00:00:00 2001 From: Dominik Roth Date: Thu, 24 Feb 2022 20:12:08 +0100 Subject: [PATCH] WIP --- caliGraph.py | 9 ++++++-- py/gp.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 py/gp.py diff --git a/caliGraph.py b/caliGraph.py index b585cce..957fee7 100755 --- a/caliGraph.py +++ b/caliGraph.py @@ -20,6 +20,8 @@ import plotly.graph_objects as go import wikipedia +from py import * + def getAllAuthors(books): authors = set() for book in books: @@ -390,7 +392,7 @@ def scoreOpinions(G, globMu, globStd): for n in list(G.nodes): node = G.nodes[n] feedbacks = [] - if node['t'] not in ['book']: + if node['t'] not in ['book', 'newBooks']: adjacens = list(G.adj[n].keys()) for adj in adjacens: adjNode = G.nodes[adj] @@ -403,7 +405,8 @@ def scoreOpinions(G, globMu, globStd): node['score'] = node['mean'] node['feedbacks'] = feedbacks else: - node['score'] = None + node['score'] = globMu - globStd + node['score'] = node['score'] / 2 + node['gpr_score'] def scoreUnread(G, globMu, globStd): neuralBins = defaultdict(list) @@ -428,6 +431,7 @@ def scoreUnread(G, globMu, globStd): neuralBins['median'] = [node['median']] neuralBins['se'] = [node['se']] neuralBins['pagerank'] = [node['pagerank_score']] + neuralBins['gpr_score'] = [node['gpr_score']] if 'tgb_rank' in node: neuralBins['tgbrank'] = [10/math.ln10(10+node['tgb_rank'])] neuralBins['bias'] = [globMu] @@ -815,6 +819,7 @@ def genScores(G, books, calcPagerank=True): globMu, globStd = calcRecDist(G, books) if calcPagerank: runPagerank(G) + genGprScores(G, globMu, globStd) scoreOpinions(G, globMu, globStd) scoreUnread(G, globMu, globStd) return globMu, globStd diff --git a/py/gp.py b/py/gp.py new file mode 100644 index 0000000..a98c6b4 --- /dev/null +++ b/py/gp.py @@ -0,0 +1,62 @@ +import numpy as np +from sklearn.gaussian_process.kernels import Kernel, Hyperparameter +from 
sklearn.gaussian_process.kernels import GenericKernelMixin
+from sklearn.gaussian_process import GaussianProcessRegressor
+#from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.base import clone
+from node2vec import Node2Vec
+
+
+class BookKernel(GenericKernelMixin, Kernel):
+    """Gaussian-process kernel over graph nodes based on node2vec similarity.
+
+    The kernel value for two node identifiers is whatever
+    ``self.wv.similarity`` reports for their node2vec embeddings.  Inputs are
+    node identifiers rather than numeric feature vectors (hence
+    ``GenericKernelMixin``), and the kernel exposes no tunable
+    hyperparameters.
+    """
+
+    def __init__(self, G):
+        """Train a node2vec embedding on graph ``G`` and keep its vectors.
+
+        NOTE(review): ``sklearn.base.clone`` builds the copy through this
+        constructor, so every clone (including the one made inside
+        ``GaussianProcessRegressor.fit``) retrains the embedding.
+        """
+        # Not used by the kernel computation below; kept so existing
+        # attribute access keeps working.
+        self.baseline_similarity = 0.5
+        self.baseline_similarity_bounds = (1e-5, 1)
+
+        self.G = G
+        self.node2vec = Node2Vec(self.G, dimensions=32, walk_length=16, num_walks=256, workers=8)
+        # fit() has to be called on the Node2Vec instance created above.
+        self.model = self.node2vec.fit(window=10, min_count=1, batch_words=4)
+        self.wv = self.model.wv
+
+    def _f(self, s1, s2):
+        """Return the embedding similarity of nodes ``s1`` and ``s2``."""
+        # node2vec stores node names as strings, so normalise the keys.
+        return self.wv.similarity(str(s1), str(s2))
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Return the kernel matrix ``k(X, Y)``.
+
+        ``Y`` defaults to ``X``.  With ``eval_gradient=True`` the result is
+        the pair ``(K, K_gradient)``; because there are no hyperparameters
+        the gradient has a trailing dimension of size zero.
+        """
+        if Y is None:
+            Y = X
+
+        K = np.array([[self._f(x, y) for y in Y] for x in X])
+        if eval_gradient:
+            return K, np.empty((len(X), len(Y), 0))
+        return K
+
+    def diag(self, X):
+        """Return the 1-D diagonal of ``k(X, X)``."""
+        # Computing only the diagonal avoids building the full N x N matrix
+        # and gives predict(return_std=True) the 1-D array it needs.
+        return np.array([self._f(x, x) for x in X])
+
+    def is_stationary(self):
+        """Report the kernel as non-stationary."""
+        return False
+
+    def clone_with_theta(self, theta):
+        """Return a clone of this kernel with hyperparameters ``theta``."""
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
+
+def genGprScores(G, globMu, globStd, scoreName='gpr_score', stdName='gpr_std'):
+    """Predict a score for every unrated node with Gaussian-process regression.
+
+    Nodes whose ``'rating'`` attribute is set form the training data.  Every
+    node whose rating is ``None`` (or absent) receives the predicted mean in
+    ``node[scoreName]`` and the predictive standard deviation in
+    ``node[stdName]``.  Rated nodes are left untouched.
+
+    ``globMu`` and ``globStd`` are only used as a fallback: when no node is
+    rated there is nothing to fit, so they are written to the unrated nodes
+    instead.
+    """
+    rated, ratings, unrated = [], [], []
+    for n in G.nodes:
+        rating = G.nodes[n].get('rating')
+        if rating is None:
+            unrated.append(n)
+        else:
+            rated.append(n)
+            ratings.append(rating)
+
+    if not unrated:
+        # Nothing to predict; skip the expensive embedding training.
+        return
+    if not rated:
+        for n in unrated:
+            node = G.nodes[n]
+            node[scoreName], node[stdName] = globMu, globStd
+        return
+
+    gpr = GaussianProcessRegressor(kernel=BookKernel(G), random_state=3141)
+    gpr.fit(rated, ratings)
+    means, stds = gpr.predict(unrated, return_std=True)
+    # predict() returns numpy arrays aligned with ``unrated``; pair them up
+    # rather than popping from the arrays.
+    for n, mean, std in zip(unrated, means, stds):
+        node = G.nodes[n]
+        node[scoreName], node[stdName] = float(mean), float(std)