Compare commits


7 Commits
master ... gp

SHA1 Message Date
06712ee027 GPR using Node2Vec embedding cosine similarity kernels done 2022-02-25 00:44:58 +01:00
02e912d4ff More GNTM 2022-02-24 22:29:52 +01:00
e87288a927 Watching GNTM 2022-02-24 21:53:54 +01:00
85c800d39e fixes 2022-02-24 20:18:31 +01:00
0dc40c5635 lol 2022-02-24 20:15:13 +01:00
7c16b8044e Revert "implemented neuralBins (performance is bad...)"
This reverts commit bd53a83058.
2022-02-24 20:14:13 +01:00
b10fcac016 WIP 2022-02-24 20:12:08 +01:00
2 changed files with 139 additions and 75 deletions


@@ -7,8 +7,6 @@ import copy
 import random
 import requests
-from collections import defaultdict
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
@@ -20,6 +18,8 @@ import plotly.graph_objects as go
 import wikipedia
+from py.gp import *

 def getAllAuthors(books):
     authors = set()
     for book in books:
@@ -389,8 +389,11 @@ def removeUselessSeries(G, minSco=0):
 def scoreOpinions(G, globMu, globStd):
     for n in list(G.nodes):
         node = G.nodes[n]
-        feedbacks = []
-        if node['t'] not in ['book']:
+        if node['t'] not in ['book', 'newBooks']:
+            if 'gpr_score' in node:
+                feedbacks = [node['gpr_score']]
+            else:
+                feedbacks = []
             adjacens = list(G.adj[n].keys())
             for adj in adjacens:
                 adjNode = G.nodes[adj]
@@ -399,16 +402,15 @@ def scoreOpinions(G, globMu, globStd):
             if len(feedbacks):
                 node['mean'], node['std'] = norm.fit(feedbacks)
                 node['se'] = globStd / math.sqrt(len(feedbacks))
-                ratio = len(feedbacks) / len(adjacens)
                 node['score'] = node['mean']
                 node['feedbacks'] = feedbacks
             else:
                 node['score'] = None

 def scoreUnread(G, globMu, globStd):
-    neuralBins = defaultdict(list)
-    feedbacks = [globMu-globStd, globMu+globStd]
     for n in list(G.nodes):
+        feedbacks = [globMu]
+        ws = [['mu']]
         node = G.nodes[n]
         if node['t'] == 'book':
             if node['rating'] == None:
@@ -416,41 +418,47 @@ def scoreUnread(G, globMu, globStd):
                 for adj in adjacens:
                     adjNode = G.nodes[adj]
                     if 'score' in adjNode and adjNode['score'] != None:
-                        w = adjNode['t']
+                        w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1]
                         for fb in adjNode['feedbacks']:
-                            neuralBins[w].append(fb)
                             feedbacks.append(fb)
-                node['mean'], node['std'] = norm.fit(feedbacks)
-                node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
-                node['se'] = globStd / math.sqrt(len(feedbacks))
-                neuralBins['mean'] = [node['mean']]
-                neuralBins['sigma'] = [node['std']]
-                neuralBins['median'] = [node['median']]
-                neuralBins['se'] = [node['se']]
-                neuralBins['pagerank'] = [node['pagerank_score']]
-                if 'tgb_rank' in node:
-                    neuralBins['tgbrank'] = [10/math.ln10(10+node['tgb_rank'])]
-                neuralBins['bias'] = [globMu]
-                score = 0
-                nb = dict(neuralBins)
-                act = {}
-                for b in nb:
-                    act[b] = sum(nb[b])/len(nb[b])
-                    score += act[b] * getWeightForType(b)
-                score /= sum([abs(getWeightForType(b)) for b in nb])
-                node['score'] = math.tanh(score/10)*10
-                node['_act'] = act
+                            ws.append(w)
+                if len(feedbacks):
+                    node['mean'], node['std'] = norm.fit(feedbacks)
+                    node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
+                    node['se'] = globStd / math.sqrt(len(feedbacks))
+                    feedbacks.append(node['pagerank_score'])
+                    ws.append(['pagerank'])
+                    #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0)
+                    #ws.append(['tgb_rank'])
+                    feedbacks.append(node['std'])
+                    ws.append(['sigma'])
+                    #feedbacks.append(node['median'])
+                    #ws.append(['median'])
+                    #feedbacks.append(node['se'])
+                    #ws.append(['se'])
+                    feedbacks.append(globMu)
+                    ws.append(['bias'])
+                    if 'gpr_score' in node:
+                        feedbacks.append(node['gpr_score'])
+                        ws.append(['gpr_score'])
+                        feedbacks.append(node['gpr_se'])
+                        ws.append(['gpr_se'])
+                    node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
+                    node['_act'] = feedbacks
+                    node['_wgh'] = ws
+                else:
+                    node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001
                 if 'series' in node:
                     if node['series_index'] == 1.0:
                         node['score'] += 0.000000001

-def getWeightForType(nodeType):
+def getWeightForType(nodeType, edgeWeight=1):
     global weights
-    if nodeType not in weights:
-        weights[nodeType] = 0.1
-        saveWeights(weights)
-        print('[i] neuralWeights-Vector extended with >'+nodeType+'<')
-    return weights[nodeType]
+    w = weights[nodeType]
+    if nodeType == 'topList':
+        return edgeWeight*w
+    else:
+        return w

 def printBestList(G, t='book', num=-1):
     bestlist = []
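The reworked scoreUnread drops the neuralBins/tanh aggregation in favour of a plain weighted mean: every collected feedback gets a parallel weight tag, and getWeightForType now scales only topList weights by the edge weight. A minimal standalone sketch of that aggregation (the weight values below are made up, not the project's trained neuralWeights):

# Standalone sketch of the weighted-mean scoring used by the new scoreUnread.
# The 'weights' values here are illustrative only.
weights = {"mu": 0.5, "author": 0.7, "topList": 0.15, "bias": 0.25}

def getWeightForType(nodeType, edgeWeight=1):
    w = weights[nodeType]
    # only topList edges are scaled by their edge weight, mirroring the diff
    return edgeWeight * w if nodeType == 'topList' else w

def weightedScore(feedbacks, ws):
    # ws entries are [type] or [type, edgeWeight], kept parallel to feedbacks
    num = sum(fb * getWeightForType(w[0], w[1] if len(w) > 1 else 1)
              for fb, w in zip(feedbacks, ws))
    den = sum(getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws)
    return num / den

print(weightedScore([7.5, 8.0, 6.0, 7.0], [['mu'], ['author'], ['topList', 2], ['bias']]))

Unlike the removed neuralBins version, which normalised by the sum of absolute weights and squashed the result with tanh, the new score is a plain signed weighted mean, so negative weights such as the new gpr_se default enter both numerator and denominator with their sign.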
@@ -808,6 +816,9 @@ def buildFullGraph(darkMode=False):
     graphAddTopLists(G, books, darkMode=darkMode)
     graphAddSeries(G, books, darkMode=darkMode)
     graphAddTags(G, books, darkMode=darkMode)
+    genGprScores(G, 'gpr_score', 'gpr_se')
     return G, books
@@ -1109,23 +1120,6 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
                 if node in bestlist or node in keeplist:
                     waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)

-def gensimTokensForLines(lines):
-    for i, line in enumerate(lines):
-        tokens = gensim.utils.simple_preprocess(line)
-        if tokens_only:
-            yield tokens
-        else:
-            # For training data, add tags
-            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
-
-def buildDoc2Vec(books):
-    import gensim
-    for n in list(G.nodes):
-        node = G.nodes[n]
-        if node['t'] == 'book':
-            pass
-    gensimTokensForLines(lines)
-
 def shell(G, books, mu, std):
     from ptpython.repl import embed
     embed(globals(), locals())
@@ -1199,7 +1193,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
 # while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
 # but might be necessary to enable later for a larger library for better training performance...
 # maybe try again for 128 books?
-def evaluateFitness(books, batchSize=16, debugPrint=False):
+def evaluateFitness(books, batchSize=-1, debugPrint=False, doGPR=True):
     global weights
     G = buildBookGraph(books)
     graphAddAuthors(G, books)
@@ -1208,19 +1202,20 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
     graphAddSeries(G, books)
     graphAddTags(G, books)
     runPagerank(G)
+    if doGPR:
+        genGprScores(G, 'gpr_score', 'gpr_se')
     ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
     boundsLoss = 0
     linSepLoss = []
     errSq = []
     gradient = {}
-    for w in weights:
-        gradient[w] = 0
+    for wt in weights:
+        gradient[wt] = 0
     mu, sigma = genScores(G, books)
-    batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
     for b in G.nodes:
-        if b in ratedBooks:
-            node = G.nodes[b]
+        batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
+        if b in batch:
             rating = G.nodes[b]['rating']
             G.nodes[b]['rating'] = None
             _, _ = genScores(G, books, calcPagerank=False)
@@ -1229,20 +1224,17 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
             else:
                 errSq.append((rating - G.nodes[b]['score'])**2)
             G.nodes[b]['rating'] = rating
-            if b in batch:
-                for wt in weights:
-                    scoreB = 0
-                    for w in node['_act']:
-                        scoreB += node['_act'][w] * (getWeightForType(w) + (0.001 if wt==w else 0))
-                    scoreB /= sum([abs(getWeightForType(w)) for w in node['_act']])
-                    scoreB = math.tanh(scoreB/10)*10
-                    gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
+            for wt in weights:
+                scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']])
+                gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
     regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
     for wt in weights:
         if abs(weights[wt]) > 1.0:
-            gradient[wt] -= weights[wt]*3
-        else:
-            gradient[wt] -= weights[wt]*1
+            gradient[wt] -= weights[wt]*10
     for g in gradient:
-        gradient[g] /= len(batch)
+        gradient[g] /= len(errSq)
     if debugPrint:
         print(sum(errSq)/len(errSq), 0.001*regressionLoss)
     fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
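The new gradient is a finite-difference estimate: for each weight type, the cached feedbacks (_act) and weight tags (_wgh) are recombined with that one weight scaled by 1.001, and the resulting change in squared error (times 1000) becomes the gradient entry; it is then averaged over len(errSq), i.e. over the books that actually contributed an error term. A standalone sketch of that estimate (toy numbers, hypothetical helper name combine):

# Standalone sketch of the finite-difference gradient used in evaluateFitness.
# 'weights', 'act' and 'wgh' contents are illustrative only.
def combine(act, wgh, weights, boosted_type=None, factor=1.001):
    # weighted mean of the cached feedbacks, with one weight type optionally scaled
    def w(entry):
        boost = factor if entry[0] == boosted_type else 1.0
        return boost * weights[entry[0]] * (entry[1] if len(entry) > 1 else 1)
    return sum(a * w(e) for a, e in zip(act, wgh)) / sum(w(e) for e in wgh)

weights = {"mu": 0.5, "author": 0.7}
act, wgh = [7.0, 9.0], [["mu"], ["author"]]   # feedbacks and their weight tags
rating = 8.5
score = combine(act, wgh, weights)            # unperturbed score
for wt in weights:
    score_b = combine(act, wgh, weights, boosted_type=wt)
    # positive entry means: nudging this weight up reduces the squared error
    grad = ((rating - score) ** 2 - (rating - score_b) ** 2) * 1000
    print(wt, round(grad, 4))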
@@ -1258,7 +1250,7 @@ def train(initGamma, full=True):
     books = loadBooksFromDB()
     bestWeights = copy.copy(weights)
     mse, gradient = evaluateFitness(books)
-    delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
+    delta = sum(gradient[g]**2 for g in gradient)
     best_mse = mse
     stagLen = 0
     goal = 1.0e-4
@@ -1271,11 +1263,8 @@ def train(initGamma, full=True):
         print({'mse': mse, 'gamma': gamma, 'delta': delta})
         delta = sum(gradient[g]**2 for g in gradient)
         for wt in weights:
-            if wt in gradient:
-                weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
-            #else:
-            #    del weights[wt]
-        mse, gradient = evaluateFitness(books)
+            weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
+        mse, gradient = evaluateFitness(books, doGPR=False)
         if mse < last_mse:
             gamma = gamma*1.25
         else:
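The update in train() is a normalised-gradient step: delta is the squared L2 norm of the gradient, each weight moves by gamma*gradient/sqrt(delta), and gamma is adapted multiplicatively after each evaluateFitness call. A compact standalone sketch (the weights, gradient and gamma values are invented):

import math

# Sketch of the normalised-gradient step from train(); illustrative numbers only.
def apply_step(weights, gradient, gamma):
    delta = sum(g * g for g in gradient.values())   # squared L2 norm, as in the diff
    for wt in weights:
        weights[wt] += gamma * gradient[wt] / math.sqrt(delta)
    return weights

weights = {"mu": 0.5, "author": 0.7}
gradient = {"mu": 0.02, "author": -0.05}
gamma = 0.1
print(apply_step(weights, gradient, gamma))
# in the diff, gamma then grows by 1.25x whenever the new MSE beats the previous one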
@@ -1312,7 +1301,7 @@ def loadWeights():
         with open('neuralWeights.json', 'r') as f:
             weights = json.loads(f.read())
     except IOError:
-        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10}
+        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "gpr_score": 1.00, "gpr_se": -0.50} #, "tgb_rank": 0.10}
     return weights

 def cliInterface():

py/gp.py (new file, 75 lines)

@@ -0,0 +1,75 @@
+import numpy as np
+from node2vec import Node2Vec
+from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
+from sklearn.gaussian_process.kernels import GenericKernelMixin
+from sklearn.gaussian_process import GaussianProcessRegressor
+#from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.base import clone
+
+
+class BookKernel(GenericKernelMixin, Kernel):
+    def __init__(self, wv):
+        self.wv = wv
+
+    def _f(self, s1, s2):
+        """
+        kernel value between a pair of sequences
+        """
+        s = self.wv.similarity(s1, s2)**2*0.99 + 0.01
+        if s <= 0:
+            print('bad!')
+        return s
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            Y = X
+        if eval_gradient:
+            return (
+                np.array([[self._f(x, y) for y in Y] for x in X])
+            )
+        else:
+            return np.array([[self._f(x, y) for y in Y] for x in X])
+        #return np.array(self.wv.n_similarity(X, Y))
+
+    def diag(self, X):
+        return self(X)
+
+    def is_stationary(self):
+        return False
+
+    def clone_with_theta(self, theta):
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
+
+def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
+    print('[\] Constructing Feature-Space-Projector')
+    node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
+    print('[\] Fitting Embeddings for Kernel')
+    model = node2vec.fit(window=8, min_count=1, batch_words=4)
+    wv = model.wv
+    print('[\] Constructing Kernel')
+    kernel = BookKernel(wv)
+    print('[\] Fitting GP')
+    X, y = [], []
+    for n in G.nodes:
+        node = G.nodes[n]
+        if 'rating' in node and node['rating']!=None:
+            X.append(n)
+            y.append(node['rating'])
+    gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y)
+    print('[\] Inferencing GP')
+    X = []
+    for n in G.nodes:
+        node = G.nodes[n]
+        if not 'rating' in node or node['rating']==None:
+            X.append(n)
+    y, stds = gpr.predict(X, return_std=True)
+    i=0
+    for n in G.nodes:
+        node = G.nodes[n]
+        if not 'rating' in node or node['rating']==None:
+            s, std = y[i], stds[i][i][0]
+            i+=1
+            node[scoreName], node[stdName] = float(s), float(std)
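For context, sklearn's GaussianProcessRegressor accepts a custom kernel like BookKernel as long as it is cloneable and returns a positive-definite matrix; predict(..., return_std=True) then yields a one-dimensional array of per-point standard deviations. Below is a self-contained toy version with a hard-coded similarity table standing in for the Node2Vec embeddings (all names and numbers are illustrative, not part of the project):

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import GenericKernelMixin, Kernel

# Toy similarity table standing in for wv.similarity() (made-up values).
SIM = {("a", "a"): 1.0, ("b", "b"): 1.0, ("c", "c"): 1.0,
       ("a", "b"): 0.6, ("b", "a"): 0.6,
       ("a", "c"): 0.2, ("c", "a"): 0.2,
       ("b", "c"): 0.3, ("c", "b"): 0.3}

class ToyKernel(GenericKernelMixin, Kernel):
    def __init__(self, sim):
        self.sim = sim          # stored as-is so sklearn can clone the kernel

    def _f(self, s1, s2):
        # same squashing as BookKernel: squared similarity, kept strictly positive
        return self.sim[(str(s1), str(s2))] ** 2 * 0.99 + 0.01

    def __call__(self, X, Y=None, eval_gradient=False):
        Y = X if Y is None else Y
        K = np.array([[self._f(x, y) for y in Y] for x in X])
        if eval_gradient:
            # no tunable hyperparameters, so the gradient part is empty
            return K, np.empty((len(X), len(X), 0))
        return K

    def diag(self, X):
        return np.array([self._f(x, x) for x in X])

    def is_stationary(self):
        return False

gpr = GaussianProcessRegressor(kernel=ToyKernel(SIM), alpha=1e-8)
gpr.fit(["a", "b"], [8.0, 6.0])                      # node ids with known ratings
mean, std = gpr.predict(["c"], return_std=True)      # std has shape (n_queries,)
print(float(mean[0]), float(std[0]))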