Compare commits
7 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 06712ee027 | |
| | 02e912d4ff | |
| | e87288a927 | |
| | 85c800d39e | |
| | 0dc40c5635 | |
| | 7c16b8044e | |
| | b10fcac016 | |
caliGraph.py (139 changed lines)
@@ -7,8 +7,6 @@ import copy
 import random
 import requests
 
-from collections import defaultdict
-
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
@@ -20,6 +18,8 @@ import plotly.graph_objects as go
 
 import wikipedia
 
+from py.gp import *
+
 def getAllAuthors(books):
     authors = set()
     for book in books:
@@ -389,8 +389,11 @@ def removeUselessSeries(G, minSco=0):
 def scoreOpinions(G, globMu, globStd):
     for n in list(G.nodes):
         node = G.nodes[n]
-        feedbacks = []
-        if node['t'] not in ['book']:
+        if node['t'] not in ['book', 'newBooks']:
+            if 'gpr_score' in node:
+                feedbacks = [node['gpr_score']]
+            else:
+                feedbacks = []
             adjacens = list(G.adj[n].keys())
             for adj in adjacens:
                 adjNode = G.nodes[adj]
@@ -399,16 +402,15 @@ def scoreOpinions(G, globMu, globStd):
             if len(feedbacks):
                 node['mean'], node['std'] = norm.fit(feedbacks)
                 node['se'] = globStd / math.sqrt(len(feedbacks))
-                ratio = len(feedbacks) / len(adjacens)
                 node['score'] = node['mean']
                 node['feedbacks'] = feedbacks
             else:
                 node['score'] = None
 
 def scoreUnread(G, globMu, globStd):
-    neuralBins = defaultdict(list)
-    feedbacks = [globMu-globStd, globMu+globStd]
     for n in list(G.nodes):
+        feedbacks = [globMu]
+        ws = [['mu']]
         node = G.nodes[n]
         if node['t'] == 'book':
             if node['rating'] == None:
@@ -416,41 +418,47 @@ def scoreUnread(G, globMu, globStd):
                 for adj in adjacens:
                     adjNode = G.nodes[adj]
                     if 'score' in adjNode and adjNode['score'] != None:
-                        w = adjNode['t']
+                        w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1]
                         for fb in adjNode['feedbacks']:
-                            neuralBins[w].append(fb)
                             feedbacks.append(fb)
-                node['mean'], node['std'] = norm.fit(feedbacks)
-                node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
-                node['se'] = globStd / math.sqrt(len(feedbacks))
-                neuralBins['mean'] = [node['mean']]
-                neuralBins['sigma'] = [node['std']]
-                neuralBins['median'] = [node['median']]
-                neuralBins['se'] = [node['se']]
-                neuralBins['pagerank'] = [node['pagerank_score']]
-                if 'tgb_rank' in node:
-                    neuralBins['tgbrank'] = [10/math.ln10(10+node['tgb_rank'])]
-                neuralBins['bias'] = [globMu]
-                score = 0
-                nb = dict(neuralBins)
-                act = {}
-                for b in nb:
-                    act[b] = sum(nb[b])/len(nb[b])
-                    score += act[b] * getWeightForType(b)
-                score /= sum([abs(getWeightForType(b)) for b in nb])
-                node['score'] = math.tanh(score/10)*10
-                node['_act'] = act
+                            ws.append(w)
+                if len(feedbacks):
+                    node['mean'], node['std'] = norm.fit(feedbacks)
+                    node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
+                    node['se'] = globStd / math.sqrt(len(feedbacks))
+                    feedbacks.append(node['pagerank_score'])
+                    ws.append(['pagerank'])
+                    #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0)
+                    #ws.append(['tgb_rank'])
+                    feedbacks.append(node['std'])
+                    ws.append(['sigma'])
+                    #feedbacks.append(node['median'])
+                    #ws.append(['median'])
+                    #feedbacks.append(node['se'])
+                    #ws.append(['se'])
+                    feedbacks.append(globMu)
+                    ws.append(['bias'])
+                    if 'gpr_score' in node:
+                        feedbacks.append(node['gpr_score'])
+                        ws.append(['gpr_score'])
+                        feedbacks.append(node['gpr_se'])
+                        ws.append(['gpr_se'])
+                    node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
+                    node['_act'] = feedbacks
+                    node['_wgh'] = ws
+                else:
+                    node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001
                 if 'series' in node:
                     if node['series_index'] == 1.0:
                         node['score'] += 0.000000001
 
-def getWeightForType(nodeType):
+def getWeightForType(nodeType, edgeWeight=1):
     global weights
-    if nodeType not in weights:
-        weights[nodeType] = 0.1
-        saveWeights(weights)
-        print('[i] neuralWeights-Vector extended with >'+nodeType+'<')
-    return weights[nodeType]
+    w = weights[nodeType]
+    if nodeType == 'topList':
+        return edgeWeight*w
+    else:
+        return w
 
 def printBestList(G, t='book', num=-1):
     bestlist = []
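The tanh-squashed bin average is gone: an unread book's score is now a plain weighted mean over (feedback, weight-tag) pairs, and `getWeightForType` folds an optional per-edge weight into the `topList` case. A minimal sketch of that arithmetic, with an illustrative weight table (the real values come from neuralWeights.json):

```python
# Minimal sketch of the weighted-mean scoring introduced above.
# The weight table is illustrative, not the trained neuralWeights.json.
weights = {'mu': 0.5, 'author': 0.7, 'topList': 0.15, 'bias': 0.25}

def getWeightForType(nodeType, edgeWeight=1):
    w = weights[nodeType]
    # only topList entries carry a meaningful per-edge weight in the new scheme
    return edgeWeight * w if nodeType == 'topList' else w

# feedbacks[i] is a raw score, ws[i] tags its origin (plus an optional edge weight)
feedbacks = [7.0, 8.5, 6.0, 7.2]
ws = [['mu'], ['author'], ['topList', 0.5], ['bias']]

num = sum(fb * getWeightForType(w[0], w[1] if len(w) > 1 else 1)
          for fb, w in zip(feedbacks, ws))
den = sum(getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws)
print(num / den)  # weighted mean, replacing the old math.tanh(score/10)*10
```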
@@ -808,6 +816,9 @@ def buildFullGraph(darkMode=False):
     graphAddTopLists(G, books, darkMode=darkMode)
     graphAddSeries(G, books, darkMode=darkMode)
     graphAddTags(G, books, darkMode=darkMode)
+
+    genGprScores(G, 'gpr_score', 'gpr_se')
+
     return G, books
 
 
@@ -1109,23 +1120,6 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
         if node in bestlist or node in keeplist:
             waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
 
-def gensimTokensForLines(lines):
-    for i, line in enumerate(lines):
-        tokens = gensim.utils.simple_preprocess(line)
-        if tokens_only:
-            yield tokens
-        else:
-            # For training data, add tags
-            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
-
-def buildDoc2Vec(books):
-    import gensim
-    for n in list(G.nodes):
-        node = G.nodes[n]
-        if node['t'] == 'book':
-            pass
-    gensimTokensForLines(lines)
-
 def shell(G, books, mu, std):
     from ptpython.repl import embed
     embed(globals(), locals())
@@ -1199,7 +1193,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
 # while batchSize is implemented, we only get a good gonvergence when we disable it (batchSize=-1)
 # but might be necessary to enable later for a larger libary for better training performance...
 # maybe try again for 128 books?
-def evaluateFitness(books, batchSize=16, debugPrint=False):
+def evaluateFitness(books, batchSize=-1, debugPrint=False, doGPR=True):
     global weights
     G = buildBookGraph(books)
     graphAddAuthors(G, books)
@@ -1208,19 +1202,20 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
     graphAddSeries(G, books)
     graphAddTags(G, books)
     runPagerank(G)
+    if doGPR:
+        genGprScores(G, 'gpr_score', 'gpr_se')
 
     ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
     boundsLoss = 0
     linSepLoss = []
     errSq = []
     gradient = {}
-    for w in weights:
-        gradient[w] = 0
+    for wt in weights:
+        gradient[wt] = 0
     mu, sigma = genScores(G, books)
-    batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
     for b in G.nodes:
-        if b in ratedBooks:
-            node = G.nodes[b]
+        batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
+        if b in batch:
            rating = G.nodes[b]['rating']
            G.nodes[b]['rating'] = None
            _, _ = genScores(G, books, calcPagerank=False)
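`evaluateFitness` measures the weights by leave-one-out: each rated book's rating is hidden in turn, the graph is re-scored without it, and the squared prediction error accumulates; `batchSize` can subsample the rated books, though per the comment above convergence was only good with `batchSize=-1`. A schematic sketch of that loop, with a hypothetical `predict` standing in for the real `genScores`:

```python
# Schematic sketch of the leave-one-out loop (G/genScores replaced by
# hypothetical stand-ins; the real versions live in caliGraph.py).
import random

ratings = {'bookA': 8, 'bookB': 6, 'bookC': 9}   # the rated books

def predict(book, visible):
    # stand-in for genScores: predict a hidden rating from the remaining ones
    others = [r for b, r in visible.items() if b != book]
    return sum(others) / len(others)

batchSize = -1  # -1 disables subsampling, as the comment above recommends
batch = (random.sample(list(ratings), batchSize)
         if batchSize != -1 and len(ratings) > batchSize else list(ratings))

errSq = []
for b in batch:
    hidden = {k: v for k, v in ratings.items() if k != b}  # hide b's rating
    errSq.append((ratings[b] - predict(b, hidden)) ** 2)
print(sum(errSq) / len(errSq))  # the MSE term of the fitness
```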
@@ -1229,20 +1224,17 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
             else:
                 errSq.append((rating - G.nodes[b]['score'])**2)
             G.nodes[b]['rating'] = rating
-            if b in batch:
-                for wt in weights:
-                    scoreB = 0
-                    for w in node['_act']:
-                        scoreB += node['_act'][w] * (getWeightForType(w) + (0.001 if wt==w else 0))
-                    scoreB /= sum([abs(getWeightForType(w)) for w in node['_act']])
-                    scoreB = math.tanh(scoreB/10)*10
-                    gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
+            for wt in weights:
+                scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']])
+                gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
     regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
     for wt in weights:
         if abs(weights[wt]) > 1.0:
-            gradient[wt] -= weights[wt]*3
+            gradient[wt] -= weights[wt]*10
+        else:
+            gradient[wt] -= weights[wt]*1
     for g in gradient:
-        gradient[g] /= len(batch)
+        gradient[g] /= len(errSq)
     if debugPrint:
         print(sum(errSq)/len(errSq), 0.001*regressionLoss)
     fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
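The weight gradient is still a finite-difference estimate, now computed from the cached `_act`/`_wgh` pairs: each weight is bumped by 0.1% (the 1.001 factor), the node's score is re-combined, and the change in squared error (scaled by 1000) becomes that weight's gradient entry. In miniature, with illustrative numbers:

```python
# Finite-difference gradient in miniature (numbers are illustrative).
weights = {'author': 0.7, 'bias': 0.25}
act = [8.5, 7.0]              # cached feedback values (node['_act'])
wgh = [['author'], ['bias']]  # their weight tags (node['_wgh'])
rating = 8.0

def score(bump=None):
    # weighted mean, with one weight multiplied by 1.001 when bump is set
    f = lambda w: (1.001 if w[0] == bump else 1) * weights[w[0]] * (w[1] if len(w) > 1 else 1)
    return sum(a * f(w) for a, w in zip(act, wgh)) / sum(f(w) for w in wgh)

base = score()
gradient = {wt: ((rating - base)**2 - (rating - score(bump=wt))**2) * 1000
            for wt in weights}
print(gradient)  # positive entry => increasing that weight reduces the error
```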
@@ -1258,7 +1250,7 @@ def train(initGamma, full=True):
     books = loadBooksFromDB()
     bestWeights = copy.copy(weights)
     mse, gradient = evaluateFitness(books)
-    delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
+    delta = sum(gradient[g]**2 for g in gradient)
     best_mse = mse
     stagLen = 0
     goal = 1.0e-4
@@ -1271,11 +1263,8 @@ def train(initGamma, full=True):
         print({'mse': mse, 'gamma': gamma, 'delta': delta})
         delta = sum(gradient[g]**2 for g in gradient)
         for wt in weights:
-            if wt in gradient:
-                weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
-            #else:
-            #    del weights[wt]
-        mse, gradient = evaluateFitness(books)
+            weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
+        mse, gradient = evaluateFitness(books, doGPR=False)
         if mse < last_mse:
             gamma = gamma*1.25
         else:
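`train` now normalizes the step by the gradient's L2 norm: `delta` is the sum of squared components, so `gradient[wt]/math.sqrt(delta)` is a step along a unit direction, with `gamma` adapted multiplicatively. The update in isolation (toy numbers; the shrink factor for the else branch is not visible in this diff, so 0.25 below is a placeholder):

```python
# The normalized gradient step in isolation (toy gradient; the 1.25 growth
# matches the diff above, the 0.25 shrink is an assumed placeholder).
import math

weights = {'author': 0.7, 'bias': 0.25}
gradient = {'author': 4.0, 'bias': -3.0}
gamma, mse, last_mse = 0.01, 1.0, 2.0

delta = sum(gradient[g]**2 for g in gradient)               # squared L2 norm
for wt in weights:
    weights[wt] += gamma * gradient[wt] / math.sqrt(delta)  # unit-length step

gamma = gamma * 1.25 if mse < last_mse else gamma * 0.25    # 0.25 is a guess
print(weights, gamma)
```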
@@ -1312,7 +1301,7 @@ def loadWeights():
         with open('neuralWeights.json', 'r') as f:
             weights = json.loads(f.read())
     except IOError:
-        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10}
+        weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "gpr_score": 1.00, "gpr_se": -0.50} #, "tgb_rank": 0.10}
     return weights
 
 def cliInterface():
py/gp.py (new file, 75 lines)

@@ -0,0 +1,75 @@
+import numpy as np
+
+from node2vec import Node2Vec
+from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
+from sklearn.gaussian_process.kernels import GenericKernelMixin
+from sklearn.gaussian_process import GaussianProcessRegressor
+#from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.base import clone
+
+class BookKernel(GenericKernelMixin, Kernel):
+    def __init__(self, wv):
+        self.wv = wv
+
+    def _f(self, s1, s2):
+        """
+        kernel value between a pair of sequences
+        """
+        s = self.wv.similarity(s1, s2)**2*0.99 + 0.01
+        if s <= 0:
+            print('bad!')
+        return s
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            Y = X
+
+        if eval_gradient:
+            return (
+                np.array([[self._f(x, y) for y in Y] for x in X])
+            )
+        else:
+            return np.array([[self._f(x, y) for y in Y] for x in X])
+        #return np.array(self.wv.n_similarity(X, Y))
+
+    def diag(self, X):
+        return self(X)
+
+    def is_stationary(self):
+        return False
+
+    def clone_with_theta(self, theta):
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
+def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
+    print('[\] Constructing Feature-Space-Projector')
+    node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
+    print('[\] Fitting Embeddings for Kernel')
+    model = node2vec.fit(window=8, min_count=1, batch_words=4)
+    wv = model.wv
+    print('[\] Constructing Kernel')
+    kernel = BookKernel(wv)
+    print('[\] Fitting GP')
+    X, y = [], []
+    for n in G.nodes:
+        node = G.nodes[n]
+        if 'rating' in node and node['rating']!=None:
+            X.append(n)
+            y.append(node['rating'])
+    gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(X, y)
+    print('[\] Inferencing GP')
+    X = []
+    for n in G.nodes:
+        node = G.nodes[n]
+        if not 'rating' in node or node['rating']==None:
+            X.append(n)
+    y, stds = gpr.predict(X, return_std=True)
+    i=0
+    for n in G.nodes:
+        node = G.nodes[n]
+        if not 'rating' in node or node['rating']==None:
+            s, std = y[i], stds[i][i][0]
+            i+=1
+            node[scoreName], node[stdName] = float(s), float(std)
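A minimal sketch exercising `BookKernel` on its own, assuming `node2vec`, `gensim`, and `scikit-learn` are installed (py/gp.py imports them at module load); the `FakeWV` class and toy vectors are hypothetical stand-ins for real node2vec embeddings:

```python
# Sketch: evaluate BookKernel against a stand-in embedding store.
import numpy as np
from py.gp import BookKernel

class FakeWV:
    """Stand-in for gensim KeyedVectors: cosine similarity over a dict of vectors."""
    def __init__(self, vecs):
        self.vecs = vecs
    def similarity(self, a, b):
        va, vb = self.vecs[a], self.vecs[b]
        return float(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb)))

wv = FakeWV({'book:A': np.array([1.0, 0.0]),
             'book:B': np.array([0.8, 0.6]),
             'author:X': np.array([0.0, 1.0])})
kernel = BookKernel(wv)
K = kernel(['book:A', 'book:B', 'author:X'])
print(K)  # 3x3 kernel matrix: similarities squared, scaled by 0.99, floored at 0.01
```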