GPR using Node2Vec embedding cosine similarity kernels done

This commit is contained in:
Dominik Moritz Roth 2022-02-25 00:44:58 +01:00
parent 02e912d4ff
commit 06712ee027
2 changed files with 24 additions and 30 deletions

View File

@ -389,8 +389,11 @@ def removeUselessSeries(G, minSco=0):
def scoreOpinions(G, globMu, globStd): def scoreOpinions(G, globMu, globStd):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
feedbacks = []
if node['t'] not in ['book', 'newBooks']: if node['t'] not in ['book', 'newBooks']:
if 'gpr_score' in node:
feedbacks = [node['gpr_score']]
else:
feedbacks = []
adjacens = list(G.adj[n].keys()) adjacens = list(G.adj[n].keys())
for adj in adjacens: for adj in adjacens:
adjNode = G.nodes[adj] adjNode = G.nodes[adj]
@ -399,7 +402,6 @@ def scoreOpinions(G, globMu, globStd):
if len(feedbacks): if len(feedbacks):
node['mean'], node['std'] = norm.fit(feedbacks) node['mean'], node['std'] = norm.fit(feedbacks)
node['se'] = globStd / math.sqrt(len(feedbacks)) node['se'] = globStd / math.sqrt(len(feedbacks))
ratio = len(feedbacks) / len(adjacens)
node['score'] = node['mean'] node['score'] = node['mean']
node['feedbacks'] = feedbacks node['feedbacks'] = feedbacks
else: else:
@ -436,6 +438,11 @@ def scoreUnread(G, globMu, globStd):
#ws.append(['se']) #ws.append(['se'])
feedbacks.append(globMu) feedbacks.append(globMu)
ws.append(['bias']) ws.append(['bias'])
if 'gpr_score' in node:
feedbacks.append(node['gpr_score'])
ws.append(['gpr_score'])
feedbacks.append(node['gpr_se'])
ws.append(['gpr_se'])
node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws]) node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
node['_act'] = feedbacks node['_act'] = feedbacks
node['_wgh'] = ws node['_wgh'] = ws
@ -809,6 +816,9 @@ def buildFullGraph(darkMode=False):
graphAddTopLists(G, books, darkMode=darkMode) graphAddTopLists(G, books, darkMode=darkMode)
graphAddSeries(G, books, darkMode=darkMode) graphAddSeries(G, books, darkMode=darkMode)
graphAddTags(G, books, darkMode=darkMode) graphAddTags(G, books, darkMode=darkMode)
genGprScores(G, 'gpr_score', 'gpr_se')
return G, books return G, books
@ -816,9 +826,8 @@ def genScores(G, books, calcPagerank=True):
globMu, globStd = calcRecDist(G, books) globMu, globStd = calcRecDist(G, books)
if calcPagerank: if calcPagerank:
runPagerank(G) runPagerank(G)
genGprScores(G, globMu, globStd, 'score', 'se') scoreOpinions(G, globMu, globStd)
#scoreOpinions(G, globMu, globStd) scoreUnread(G, globMu, globStd)
#scoreUnread(G, globMu, globStd)
return globMu, globStd return globMu, globStd
def addImageToNode(node, cache, shape='circularImage'): def addImageToNode(node, cache, shape='circularImage'):
@ -1111,23 +1120,6 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
if node in bestlist or node in keeplist: if node in bestlist or node in keeplist:
waveFlow(G, node, m, dist, menge, firstEdge=firstEdge) waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
def gensimTokensForLines(lines):
for i, line in enumerate(lines):
tokens = gensim.utils.simple_preprocess(line)
if tokens_only:
yield tokens
else:
# For training data, add tags
yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def buildDoc2Vec(books):
import gensim
for n in list(G.nodes):
node = G.nodes[n]
if node['t'] == 'book':
pass
gensimTokensForLines(lines)
def shell(G, books, mu, std): def shell(G, books, mu, std):
from ptpython.repl import embed from ptpython.repl import embed
embed(globals(), locals()) embed(globals(), locals())
@ -1201,7 +1193,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1) # while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
# but might be necessary to enable later for a larger library for better training performance... # but might be necessary to enable later for a larger library for better training performance...
# maybe try again for 128 books? # maybe try again for 128 books?
def evaluateFitness(books, batchSize=-1, debugPrint=False): def evaluateFitness(books, batchSize=-1, debugPrint=False, doGPR=True):
global weights global weights
G = buildBookGraph(books) G = buildBookGraph(books)
graphAddAuthors(G, books) graphAddAuthors(G, books)
@ -1210,6 +1202,8 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
graphAddSeries(G, books) graphAddSeries(G, books)
graphAddTags(G, books) graphAddTags(G, books)
runPagerank(G) runPagerank(G)
if doGPR:
genGprScores(G, 'gpr_score', 'gpr_se')
ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None] ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
boundsLoss = 0 boundsLoss = 0
@ -1270,7 +1264,7 @@ def train(initGamma, full=True):
delta = sum(gradient[g]**2 for g in gradient) delta = sum(gradient[g]**2 for g in gradient)
for wt in weights: for wt in weights:
weights[wt] += gamma*gradient[wt]/math.sqrt(delta) weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
mse, gradient = evaluateFitness(books) mse, gradient = evaluateFitness(books, doGPR=False)
if mse < last_mse: if mse < last_mse:
gamma = gamma*1.25 gamma = gamma*1.25
else: else:
@ -1307,7 +1301,7 @@ def loadWeights():
with open('neuralWeights.json', 'r') as f: with open('neuralWeights.json', 'r') as f:
weights = json.loads(f.read()) weights = json.loads(f.read())
except IOError: except IOError:
weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10} weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "gpr_score": 1.00, "gpr_se": -0.50} #, "tgb_rank": 0.10}
return weights return weights
def cliInterface(): def cliInterface():

View File

@ -43,33 +43,33 @@ class BookKernel(GenericKernelMixin, Kernel):
cloned.theta = theta cloned.theta = theta
return cloned return cloned
def genGprScores(G, globMu, globStd, scoreName='gpr_score', stdName='gpr_std'): def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
print('[\] Constructing Vectorizer') print('[\] Constructing Feature-Space-Projector')
node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8) node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
print('[\] Fitting Embeddings for Kernel') print('[\] Fitting Embeddings for Kernel')
model = node2vec.fit(window=8, min_count=1, batch_words=4) model = node2vec.fit(window=8, min_count=1, batch_words=4)
wv = model.wv wv = model.wv
print('[\] Constructing Kernel') print('[\] Constructing Kernel')
kernel = BookKernel(wv) kernel = BookKernel(wv)
print('[\] Fitting GP')
X, y = [], [] X, y = [], []
for n in G.nodes: for n in G.nodes:
node = G.nodes[n] node = G.nodes[n]
if 'rating' in node and node['rating']!=None: if 'rating' in node and node['rating']!=None:
X.append(n) X.append(n)
y.append(node['rating']) y.append(node['rating'])
print('[\] Fitting GP')
gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y) gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y)
print('[\] Inferencing GP')
X = [] X = []
for n in G.nodes: for n in G.nodes:
node = G.nodes[n] node = G.nodes[n]
if not 'rating' in node or node['rating']==None: if not 'rating' in node or node['rating']==None:
X.append(n) X.append(n)
print('[\] Inferencing GP')
y, stds = gpr.predict(X, return_std=True) y, stds = gpr.predict(X, return_std=True)
i=0 i=0
for n in G.nodes: for n in G.nodes:
node = G.nodes[n] node = G.nodes[n]
if not 'rating' in node or node['rating']==None: if not 'rating' in node or node['rating']==None:
s, std = y[i], sum([val[0] for val in stds[i]]) s, std = y[i], stds[i][i][0]
i+=1 i+=1
node[scoreName], node[stdName] = float(s), float(std) node[scoreName], node[stdName] = float(s), float(std)