From 06712ee027b8162aa485cbb19c4b612db24ae9b4 Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Fri, 25 Feb 2022 00:44:58 +0100
Subject: [PATCH] GPR using Node2Vec embedding cosine similarity kernels done

---
 caliGraph.py | 44 +++++++++++++++++++-------------------------
 py/gp.py     | 10 +++++-----
 2 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/caliGraph.py b/caliGraph.py
index caa31b5..cabcf7e 100755
--- a/caliGraph.py
+++ b/caliGraph.py
@@ -389,8 +389,11 @@ def removeUselessSeries(G, minSco=0):
 def scoreOpinions(G, globMu, globStd):
     for n in list(G.nodes):
         node = G.nodes[n]
-        feedbacks = []
         if node['t'] not in ['book', 'newBooks']:
+            if 'gpr_score' in node:
+                feedbacks = [node['gpr_score']]
+            else:
+                feedbacks = []
             adjacens = list(G.adj[n].keys())
             for adj in adjacens:
                 adjNode = G.nodes[adj]
@@ -399,7 +402,6 @@ def scoreOpinions(G, globMu, globStd):
             if len(feedbacks):
                 node['mean'], node['std'] = norm.fit(feedbacks)
                 node['se'] = globStd / math.sqrt(len(feedbacks))
-                ratio = len(feedbacks) / len(adjacens)
                 node['score'] = node['mean']
                 node['feedbacks'] = feedbacks
             else:
@@ -436,6 +438,11 @@ def scoreUnread(G, globMu, globStd):
             #ws.append(['se'])
             feedbacks.append(globMu)
             ws.append(['bias'])
+            if 'gpr_score' in node:
+                feedbacks.append(node['gpr_score'])
+                ws.append(['gpr_score'])
+                feedbacks.append(node['gpr_se'])
+                ws.append(['gpr_se'])
             node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
             node['_act'] = feedbacks
             node['_wgh'] = ws
@@ -809,6 +816,9 @@ def buildFullGraph(darkMode=False):
     graphAddTopLists(G, books, darkMode=darkMode)
     graphAddSeries(G, books, darkMode=darkMode)
     graphAddTags(G, books, darkMode=darkMode)
+
+    genGprScores(G, 'gpr_score', 'gpr_se')
+
     return G, books
 
 
@@ -816,9 +826,8 @@ def genScores(G, books, calcPagerank=True):
     globMu, globStd = calcRecDist(G, books)
     if calcPagerank:
         runPagerank(G)
-    genGprScores(G, globMu, globStd, 'score', 'se')
-    #scoreOpinions(G, globMu, globStd)
-    #scoreUnread(G, globMu, globStd)
+    scoreOpinions(G, globMu, globStd)
+    scoreUnread(G, globMu, globStd)
     return globMu, globStd
 
 def addImageToNode(node, cache, shape='circularImage'):
@@ -1111,23 +1120,6 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
                 if node in bestlist or node in keeplist:
                     waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
 
-def gensimTokensForLines(lines):
-    for i, line in enumerate(lines):
-        tokens = gensim.utils.simple_preprocess(line)
-        if tokens_only:
-            yield tokens
-        else:
-            # For training data, add tags
-            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
-
-def buildDoc2Vec(books):
-    import gensim
-    for n in list(G.nodes):
-        node = G.nodes[n]
-        if node['t'] == 'book':
-            pass
-    gensimTokensForLines(lines)
-
 def shell(G, books, mu, std):
     from ptpython.repl import embed
     embed(globals(), locals())
@@ -1201,7 +1193,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
 # while batchSize is implemented, we only get good convergence when we disable it (batchSize=-1)
 # but it might be necessary to enable later for a larger library for better training performance...
 # maybe try again for 128 books?
-def evaluateFitness(books, batchSize=-1, debugPrint=False):
+def evaluateFitness(books, batchSize=-1, debugPrint=False, doGPR=True):
     global weights
     G = buildBookGraph(books)
     graphAddAuthors(G, books)
@@ -1210,6 +1202,8 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
     graphAddSeries(G, books)
     graphAddTags(G, books)
     runPagerank(G)
+    if doGPR:
+        genGprScores(G, 'gpr_score', 'gpr_se')
 
     ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
     boundsLoss = 0
@@ -1270,7 +1264,7 @@ def train(initGamma, full=True):
             delta = sum(gradient[g]**2 for g in gradient)
             for wt in weights:
                 weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
-            mse, gradient = evaluateFitness(books)
+            mse, gradient = evaluateFitness(books, doGPR=False)
             if mse < last_mse:
                 gamma = gamma*1.25
             else:
@@ -1307,7 +1301,7 @@ def loadWeights():
         with open('neuralWeights.json', 'r') as f:
             weights = json.loads(f.read())
     except IOError:
-        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10}
+        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "gpr_score": 1.00, "gpr_se": -0.50} #, "tgb_rank": 0.10}
     return weights
 
 def cliInterface():
diff --git a/py/gp.py b/py/gp.py
index ca05c6e..fe91309 100644
--- a/py/gp.py
+++ b/py/gp.py
@@ -43,33 +43,33 @@ class BookKernel(GenericKernelMixin, Kernel):
         cloned.theta = theta
         return cloned
 
-def genGprScores(G, globMu, globStd, scoreName='gpr_score', stdName='gpr_std'):
-    print('[\] Constructing Vectorizer')
+def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
+    print('[\] Constructing Feature-Space-Projector')
     node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
     print('[\] Fitting Embeddings for Kernel')
     model = node2vec.fit(window=8, min_count=1, batch_words=4)
     wv = model.wv
     print('[\] Constructing Kernel')
     kernel = BookKernel(wv)
+    print('[\] Fitting GP')
     X, y = [], []
     for n in G.nodes:
         node = G.nodes[n]
         if 'rating' in node and node['rating']!=None:
             X.append(n)
             y.append(node['rating'])
-    print('[\] Fitting GP')
     gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y)
+    print('[\] Inferencing GP')
     X = []
     for n in G.nodes:
         node = G.nodes[n]
         if not 'rating' in node or node['rating']==None:
             X.append(n)
-    print('[\] Inferencing GP')
     y, stds = gpr.predict(X, return_std=True)
     i=0
     for n in G.nodes:
         node = G.nodes[n]
         if not 'rating' in node or node['rating']==None:
-            s, std = y[i], sum([val[0] for val in stds[i]])
+            s, std = y[i], stds[i][i][0]
            i+=1
            node[scoreName], node[stdName] = float(s), float(std)
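
Two notes on the changes above. First, in scoreUnread the GP posterior now enters the same weighted blend as the other feedback channels: gpr_score is mixed in with weight 1.00 and gpr_se with a negative weight of -0.50 (per the new loadWeights defaults), so books the GP is uncertain about get pulled down. A toy illustration of that blend, with made-up numbers (the real code looks weights up via getWeightForType):

# Toy illustration of node['score'] in scoreUnread; all numbers are assumed.
feedbacks = [7.0, 6.0, 1.2]      # e.g. global mu, gpr_score, gpr_se for one book
ws        = [0.50, 1.00, -0.50]  # weights["mu"], weights["gpr_score"], weights["gpr_se"]
score = sum(fb * w for fb, w in zip(feedbacks, ws)) / sum(ws)
print(score)  # (3.5 + 6.0 - 0.6) / 1.0 = 8.9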
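
Second, the patch only shows the tail of BookKernel (the clone handling); the kernel body that gives the commit its title sits above the hunk's range. A minimal sketch of what a GenericKernelMixin-compatible cosine-similarity kernel over the Node2Vec embeddings could look like follows; everything in it is assumed except the wv constructor argument, which matches the BookKernel(wv) call in genGprScores:

# Sketch only: a plausible shape for BookKernel, NOT the repo's actual implementation.
import numpy as np
from sklearn.gaussian_process.kernels import GenericKernelMixin, Kernel

class BookKernelSketch(GenericKernelMixin, Kernel):
    """Cosine similarity between Node2Vec embeddings of node IDs."""

    def __init__(self, wv):
        self.wv = wv  # gensim KeyedVectors, i.e. node2vec.fit(...).wv

    def _embed(self, X):
        # Look up each node ID's embedding and normalize to unit length.
        E = np.array([self.wv[x] for x in X])
        return E / np.linalg.norm(E, axis=1, keepdims=True)

    def __call__(self, X, Y=None, eval_gradient=False):
        A = self._embed(X)
        B = A if Y is None else self._embed(Y)
        K = A @ B.T  # dot products of unit vectors == cosine similarity
        if eval_gradient:
            # No tunable hyperparameters, so the gradient is empty.
            return K, np.empty((K.shape[0], K.shape[1], 0))
        return K

    def diag(self, X):
        return np.ones(len(X))  # cos(x, x) == 1 for unit vectors

    def is_stationary(self):
        return False

A Gram matrix of unit-normalized vectors is positive semi-definite, so the small alpha=1e-8 jitter passed to GaussianProcessRegressor should be enough to keep its Cholesky factorization stable.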