GPR using Node2Vec embedding cosine similarity kernels done

More GNTM
GNTM gucken
2022-02-25 00:44:58 +01:00 · 2022-02-24 22:29:52 +01:00 · 2022-02-24 21:53:54 +01:00 · 2022-02-24 20:18:31 +01:00 · 2022-02-24 20:15:13 +01:00 · 2022-02-24 20:14:13 +01:00
2 changed files with 139 additions and 75 deletions
--- a/caliGraph.py
+++ b/caliGraph.py
@ -7,8 +7,6 @@ import copy
 import random
 import requests

-from collections import defaultdict
-
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
@ -20,6 +18,8 @@ import plotly.graph_objects as go

 import wikipedia

+from py.gp import *
+
 def getAllAuthors(books):
    authors = set()
    for book in books:
@ -389,8 +389,11 @@ def removeUselessSeries(G, minSco=0):
 def scoreOpinions(G, globMu, globStd):
    for n in list(G.nodes):
        node = G.nodes[n]
-        feedbacks = []
-        if node['t'] not in ['book']:
+        if node['t'] not in ['book', 'newBooks']:
+            if 'gpr_score' in node:
+                feedbacks = [node['gpr_score']]
+            else:
+                feedbacks = []
            adjacens = list(G.adj[n].keys())
            for adj in adjacens:
                adjNode = G.nodes[adj]
@ -399,16 +402,15 @@ def scoreOpinions(G, globMu, globStd):
            if len(feedbacks):
                node['mean'], node['std'] = norm.fit(feedbacks)
                node['se'] = globStd / math.sqrt(len(feedbacks))
-                ratio = len(feedbacks) / len(adjacens)
                node['score'] = node['mean']
                node['feedbacks'] = feedbacks
            else:
                node['score'] = None

 def scoreUnread(G, globMu, globStd):
-    neuralBins = defaultdict(list)
-    feedbacks = [globMu-globStd, globMu+globStd]
    for n in list(G.nodes):
+        feedbacks = [globMu]
+        ws = [['mu']]
        node = G.nodes[n]
        if node['t'] == 'book':
            if node['rating'] == None:
@ -416,41 +418,47 @@ def scoreUnread(G, globMu, globStd):
                for adj in adjacens:
                    adjNode = G.nodes[adj]
                    if 'score' in adjNode and adjNode['score'] != None:
-                        w = adjNode['t']
+                        w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1]
                        for fb in adjNode['feedbacks']:
-                            neuralBins[w].append(fb)
                            feedbacks.append(fb)
-                node['mean'], node['std'] = norm.fit(feedbacks)
-                node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
-                node['se'] = globStd / math.sqrt(len(feedbacks))
-                neuralBins['mean'] = [node['mean']]
-                neuralBins['sigma'] = [node['std']]
-                neuralBins['median'] = [node['median']]
-                neuralBins['se'] = [node['se']]
-                neuralBins['pagerank'] = [node['pagerank_score']]
-                if 'tgb_rank' in node:
-                    neuralBins['tgbrank'] = [10/math.ln10(10+node['tgb_rank'])]
-                neuralBins['bias'] = [globMu]
-                score = 0
-                nb = dict(neuralBins)
-                act = {}
-                for b in nb:
-                    act[b] = sum(nb[b])/len(nb[b])
-                    score += act[b] * getWeightForType(b)
-                score /= sum([abs(getWeightForType(b)) for b in nb])
-                node['score'] = math.tanh(score/10)*10
-                node['_act'] = act
+                            ws.append(w)
+                if len(feedbacks):
+                    node['mean'], node['std'] = norm.fit(feedbacks)
+                    node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
+                    node['se'] = globStd / math.sqrt(len(feedbacks))
+                    feedbacks.append(node['pagerank_score'])
+                    ws.append(['pagerank'])
+                    #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0)
+                    #ws.append(['tgb_rank'])
+                    feedbacks.append(node['std'])
+                    ws.append(['sigma'])
+                    #feedbacks.append(node['median'])
+                    #ws.append(['median'])
+                    #feedbacks.append(node['se'])
+                    #ws.append(['se'])
+                    feedbacks.append(globMu)
+                    ws.append(['bias'])
+                    if 'gpr_score' in node:
+                        feedbacks.append(node['gpr_score'])
+                        ws.append(['gpr_score'])
+                        feedbacks.append(node['gpr_se'])
+                        ws.append(['gpr_se'])
+                    node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
+                    node['_act'] = feedbacks
+                    node['_wgh'] = ws
+                else:
+                    node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001
                if 'series' in node:
                    if node['series_index'] == 1.0:
                        node['score'] += 0.000000001

-def getWeightForType(nodeType):
+def getWeightForType(nodeType, edgeWeight=1):
+    """Return the weight stored for nodeType in the global weights dict.
+
+    edgeWeight scales the result only when nodeType is 'topList'; for every
+    other type it is ignored. Unknown types are no longer added to the dict
+    on the fly (as the removed lines did), so the lookup raises KeyError.
+    """
    global weights
-    if nodeType not in weights:
-        weights[nodeType] = 0.1
-        saveWeights(weights)
-        print('[i] neuralWeights-Vector extended with >'+nodeType+'<')
-    return weights[nodeType]
+    w = weights[nodeType]
+    if nodeType == 'topList':
+        return edgeWeight*w
+    else:
+        return w

 def printBestList(G, t='book', num=-1):
    bestlist = []
@ -808,6 +816,9 @@ def buildFullGraph(darkMode=False):
    graphAddTopLists(G, books, darkMode=darkMode)
    graphAddSeries(G, books, darkMode=darkMode)
    graphAddTags(G, books, darkMode=darkMode)
+
+    genGprScores(G, 'gpr_score', 'gpr_se')
+
    return G, books


@ -1109,23 +1120,6 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
        if node in bestlist or node in keeplist:
            waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)

-def gensimTokensForLines(lines):
-    for i, line in enumerate(lines):
-        tokens = gensim.utils.simple_preprocess(line)
-        if tokens_only:
-            yield tokens
-        else:
-            # For training data, add tags
-            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
-
-def buildDoc2Vec(books):
-    import gensim
-    for n in list(G.nodes):
-        node = G.nodes[n]
-        if node['t'] == 'book':
-            pass
-    gensimTokensForLines(lines)
-
 def shell(G, books, mu, std):
    from ptpython.repl import embed
    embed(globals(), locals())
@ -1199,7 +1193,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
 # While batchSize is implemented, we only get good convergence when we disable it (batchSize=-1),
 # but it might be necessary to enable it later for a larger library to get better training performance...
 # Maybe try again for 128 books?
-def evaluateFitness(books, batchSize=16, debugPrint=False):
+def evaluateFitness(books, batchSize=-1, debugPrint=False, doGPR=True):
    global weights
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
@ -1208,19 +1202,20 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)
+    if doGPR:
+        genGprScores(G, 'gpr_score', 'gpr_se')

    ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
    boundsLoss = 0
    linSepLoss = []
    errSq = []
    gradient = {}
-    for w in weights:
-        gradient[w] = 0
+    for wt in weights:
+        gradient[wt] = 0
    mu, sigma = genScores(G, books)
-    batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
    for b in G.nodes:
-        if b in ratedBooks:
-            node = G.nodes[b]
+        batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
+        if b in batch:
            rating = G.nodes[b]['rating']
            G.nodes[b]['rating'] = None
            _, _ = genScores(G, books, calcPagerank=False)
@ -1229,20 +1224,17 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
            else:
                errSq.append((rating - G.nodes[b]['score'])**2)
            G.nodes[b]['rating'] = rating
-            if b in batch:
-                for wt in weights:
-                    scoreB = 0
-                    for w in node['_act']:
-                        scoreB += node['_act'][w] * (getWeightForType(w) + (0.001 if wt==w else 0))
-                    scoreB /= sum([abs(getWeightForType(w)) for w in node['_act']])
-                    scoreB = math.tanh(scoreB/10)*10
-                    gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
+            for wt in weights:
+                scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']])
+                gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
    regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
    for wt in weights:
        if abs(weights[wt]) > 1.0:
-            gradient[wt] -= weights[wt]*3
+            gradient[wt] -= weights[wt]*10
+        else:
+            gradient[wt] -= weights[wt]*1
    for g in gradient:
-        gradient[g] /= len(batch)
+        gradient[g] /= len(errSq)
    if debugPrint:
        print(sum(errSq)/len(errSq), 0.001*regressionLoss)
    fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
@ -1258,7 +1250,7 @@ def train(initGamma, full=True):
    books = loadBooksFromDB()
    bestWeights = copy.copy(weights)
    mse, gradient = evaluateFitness(books)
-    delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
+    delta = sum(gradient[g]**2 for g in gradient)
    best_mse = mse
    stagLen = 0
    goal = 1.0e-4
@ -1271,11 +1263,8 @@ def train(initGamma, full=True):
        print({'mse': mse, 'gamma': gamma, 'delta': delta})
        delta = sum(gradient[g]**2 for g in gradient)
        for wt in weights:
-            if wt in gradient:
-                weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
-            #else:
-            #    del weights[wt]
-        mse, gradient = evaluateFitness(books)
+            weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
+        mse, gradient = evaluateFitness(books, doGPR=False)
        if mse < last_mse:
            gamma = gamma*1.25
        else:
@ -1312,7 +1301,7 @@ def loadWeights():
        with open('neuralWeights.json', 'r') as f:
            weights = json.loads(f.read())
    except IOError:
-        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10}
+        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "gpr_score": 1.00, "gpr_se": -0.50} #, "tgb_rank": 0.10}
    return weights

 def cliInterface():
--- a/py/gp.py
+++ b/py/gp.py
@ -0,0 +1,75 @@
+import numpy as np
+
+from node2vec import Node2Vec
+from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
+from sklearn.gaussian_process.kernels import GenericKernelMixin
+from sklearn.gaussian_process import GaussianProcessRegressor
+#from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.base import clone
+
+class BookKernel(GenericKernelMixin, Kernel):
+    """Hyperparameter-free GP kernel over graph nodes, based on embedding similarity.
+
+    Inputs are node keys that ``wv`` can look up rather than numeric feature
+    vectors, which is why the class mixes in ``GenericKernelMixin``.
+    """
+
+    def __init__(self, wv):
+        # wv must provide ``similarity(a, b)``; genGprScores passes the ``.wv``
+        # attribute of the fitted Node2Vec model.
+        self.wv = wv
+
+    def _f(self, s1, s2):
+        """Return the kernel value between a pair of nodes.
+
+        The similarity is squared and mapped through ``s**2*0.99 + 0.01``, so
+        the result is never smaller than 0.01.
+        """
+        return self.wv.similarity(s1, s2)**2*0.99 + 0.01
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Return the kernel matrix k(X, Y); Y defaults to X.
+
+        With ``eval_gradient=True`` a ``(K, K_gradient)`` tuple is returned, as
+        the scikit-learn kernel API expects. This kernel defines no
+        hyperparameters, so the gradient has a trailing axis of length zero.
+        """
+        if Y is None:
+            Y = X
+        K = np.array([[self._f(x, y) for y in Y] for x in X])
+        if eval_gradient:
+            return K, np.empty((len(X), len(Y), 0))
+        return K
+
+    def diag(self, X):
+        """Return k(x, x) for every x in X as a 1-D array.
+
+        Only the diagonal entries are evaluated instead of building the full
+        kernel matrix.
+        """
+        return np.array([self._f(x, x) for x in X])
+
+    def is_stationary(self):
+        return False
+
+    def clone_with_theta(self, theta):
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
+def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
+    """Predict ratings for unrated nodes with a GP over Node2Vec embeddings.
+
+    Nodes of ``G`` whose 'rating' attribute is set (not None) are the training
+    data. Every other node gets the predicted mean stored under ``scoreName``
+    and the predictive standard deviation under ``stdName``. If there are no
+    rated or no unrated nodes there is nothing to fit or predict, and ``G`` is
+    left untouched.
+    """
+    # Split the nodes once so that fitting, predicting and writing back all
+    # use the very same node lists.
+    rated, unrated = [], []
+    for n in G.nodes:
+        if G.nodes[n].get('rating') is None:
+            unrated.append(n)
+        else:
+            rated.append(n)
+    if not rated or not unrated:
+        return
+    print('[\\] Constructing Feature-Space-Projector')
+    node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
+    print('[\\] Fitting Embeddings for Kernel')
+    model = node2vec.fit(window=8, min_count=1, batch_words=4)
+    wv = model.wv
+    print('[\\] Constructing Kernel')
+    kernel = BookKernel(wv)
+    print('[\\] Fitting GP')
+    y = [G.nodes[n]['rating'] for n in rated]
+    gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(rated, y)
+    print('[\\] Inferencing GP')
+    # BookKernel.diag returns a 1-D diagonal, so stds holds one entry per
+    # unrated node, in the same order as ``unrated``.
+    means, stds = gpr.predict(unrated, return_std=True)
+    for n, s, std in zip(unrated, means, stds):
+        node = G.nodes[n]
+        node[scoreName], node[stdName] = float(s), float(std)
Author	SHA1	Message	Date
Dominik Roth	06712ee027	GPR using Node2Vec embedding cosine similarity kernels done	2022-02-25 00:44:58 +01:00
Dominik Roth	02e912d4ff	More GNTM	2022-02-24 22:29:52 +01:00
Dominik Roth	e87288a927	GNTM gucken	2022-02-24 21:53:54 +01:00
Dominik Roth	85c800d39e	fixes	2022-02-24 20:18:31 +01:00
Dominik Roth	0dc40c5635	lol	2022-02-24 20:15:13 +01:00
Dominik Roth	7c16b8044e	Revert "implemented neuralBins (performance is bad...)" This reverts commit `bd53a83058`.	2022-02-24 20:14:13 +01:00
Dominik Roth	b10fcac016	WIP	2022-02-24 20:12:08 +01:00