implemented neuralBins (performance is bad...)

parent 880cb6ba7e
commit bd53a83058

caliGraph.py (86 changed lines)
@@ -7,6 +7,8 @@ import copy
 import random
 import requests
 
+from collections import defaultdict
+
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
@@ -404,9 +406,9 @@ def scoreOpinions(G, globMu, globStd):
             node['score'] = None
 
 def scoreUnread(G, globMu, globStd):
+    neuralBins = defaultdict(list)
+    feedbacks = [globMu-globStd, globMu+globStd]
     for n in list(G.nodes):
-        feedbacks = [globMu]
-        ws = [['mu']]
         node = G.nodes[n]
         if node['t'] == 'book':
             if node['rating'] == None:
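The neuralBins idea replaces the parallel feedbacks/ws lists with one bin of feedback values per neighbor type. Note that, as this hunk reads, neuralBins and feedbacks appear to be initialized once per scoreUnread call rather than once per node, so feedback accumulates across books; that may be part of what the commit title's "performance is bad" refers to. A minimal standalone sketch of the binning step (collect_bins and its input format are illustrative, not repo API):

from collections import defaultdict

def collect_bins(neighbors):
    # bin every feedback value under the type of the node it came from,
    # e.g. all ratings reached via 'author' nodes land in one bin
    bins = defaultdict(list)
    for node_type, feedback_values in neighbors:
        for fb in feedback_values:
            bins[node_type].append(fb)
    return bins

bins = collect_bins([('author', [7.5, 8.0]), ('topList', [9.0])])
assert bins['author'] == [7.5, 8.0]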
@@ -414,42 +416,41 @@ def scoreUnread(G, globMu, globStd):
                 for adj in adjacens:
                     adjNode = G.nodes[adj]
                     if 'score' in adjNode and adjNode['score'] != None:
-                        w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1]
+                        w = adjNode['t']
                         for fb in adjNode['feedbacks']:
+                            neuralBins[w].append(fb)
                             feedbacks.append(fb)
-                            ws.append(w)
-            if len(feedbacks):
             node['mean'], node['std'] = norm.fit(feedbacks)
             node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
             node['se'] = globStd / math.sqrt(len(feedbacks))
-                feedbacks.append(node['pagerank_score'])
-                ws.append(['pagerank'])
-                #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0)
-                #ws.append(['tgb_rank'])
-                feedbacks.append(node['std'])
-                ws.append(['sigma'])
-                #feedbacks.append(node['median'])
-                #ws.append(['median'])
-                #feedbacks.append(node['se'])
-                #ws.append(['se'])
-                feedbacks.append(globMu)
-                ws.append(['bias'])
-                node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws])
-                node['_act'] = feedbacks
-                node['_wgh'] = ws
-            else:
-                node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001
+            neuralBins['mean'] = [node['mean']]
+            neuralBins['sigma'] = [node['std']]
+            neuralBins['median'] = [node['median']]
+            neuralBins['se'] = [node['se']]
+            neuralBins['pagerank'] = [node['pagerank_score']]
+            if 'tgb_rank' in node:
+                neuralBins['tgbrank'] = [10/math.log10(10+node['tgb_rank'])]
+            neuralBins['bias'] = [globMu]
+            score = 0
+            nb = dict(neuralBins)
+            act = {}
+            for b in nb:
+                act[b] = sum(nb[b])/len(nb[b])
+                score += act[b] * getWeightForType(b)
+            score /= sum([abs(getWeightForType(b)) for b in nb])
+            node['score'] = math.tanh(score/10)*10
+            node['_act'] = act
             if 'series' in node:
                 if node['series_index'] == 1.0:
                     node['score'] += 0.000000001
 
-def getWeightForType(nodeType, edgeWeight=1):
+def getWeightForType(nodeType):
     global weights
-    w = weights[nodeType]
-    if nodeType == 'topList':
-        return edgeWeight*w
-    else:
-        return w
+    if nodeType not in weights:
+        weights[nodeType] = 0.1
+        saveWeights(weights)
+        print('[i] neuralWeights-Vector extended with >'+nodeType+'<')
+    return weights[nodeType]
 
 
 def printBestList(G, t='book', num=-1):
     bestlist = []
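Each bin is then collapsed to a single activation (its mean), combined through one learnable weight per bin name, and squashed with tanh onto the rating scale; getWeightForType now also grows the weight vector on demand instead of special-casing topList edge weights. A self-contained sketch of that scoring rule, with a plain weights dict standing in for the repo's global neuralWeights vector (persistence via saveWeights omitted):

import math

weights = {'author': 0.5, 'series': 0.3, 'bias': 1.0}

def weight_for(bin_name):
    # mirrors getWeightForType: unseen bin names get a small starter weight
    return weights.setdefault(bin_name, 0.1)

def score_bins(bins):
    act = {b: sum(vals) / len(vals) for b, vals in bins.items()}
    raw = sum(act[b] * weight_for(b) for b in act)
    raw /= sum(abs(weight_for(b)) for b in act)
    return math.tanh(raw / 10) * 10  # keeps scores inside (-10, 10)

print(score_bins({'author': [7.5, 8.0], 'bias': [6.9]}))

Because the number of weights now equals the number of bin names rather than the number of feedback entries, every book exposes a fixed set of activations in _act, which is what evaluateFitness perturbs below.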
@@ -1198,7 +1199,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
 # while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
 # but it might be necessary to enable it later for a larger library for better training performance...
 # maybe try again for 128 books?
-def evaluateFitness(books, batchSize=-1, debugPrint=False):
+def evaluateFitness(books, batchSize=16, debugPrint=False):
     global weights
     G = buildBookGraph(books)
     graphAddAuthors(G, books)
@@ -1213,12 +1214,13 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
     linSepLoss = []
     errSq = []
     gradient = {}
-    for wt in weights:
-        gradient[wt] = 0
+    for w in weights:
+        gradient[w] = 0
     mu, sigma = genScores(G, books)
-    for b in G.nodes:
     batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
-        if b in batch:
+    for b in G.nodes:
+        if b in ratedBooks:
+            node = G.nodes[b]
             rating = G.nodes[b]['rating']
             G.nodes[b]['rating'] = None
             _, _ = genScores(G, books, calcPagerank=False)
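evaluateFitness now scores every rated book for the error term, but with batchSize=16 only a random sample of them contributes gradient updates. A sketch of the sampling rule, assuming ratedBooks is a list of rated node ids:

import random

def pick_batch(rated_books, batch_size):
    # batch_size=-1 disables sampling and uses every rated book
    if batch_size != -1 and len(rated_books) > batch_size:
        return random.sample(rated_books, batch_size)
    return rated_books

rated = ['b1', 'b2', 'b3', 'b4']
batch = pick_batch(rated, 2)  # errors use all of rated,
assert len(batch) == 2        # gradients only these two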
@@ -1227,17 +1229,20 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
             else:
                 errSq.append((rating - G.nodes[b]['score'])**2)
             G.nodes[b]['rating'] = rating
+            if b in batch:
                 for wt in weights:
-                    scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']])
+                    scoreB = 0
+                    for w in node['_act']:
+                        scoreB += node['_act'][w] * (getWeightForType(w) + (0.001 if wt==w else 0))
+                    scoreB /= sum([abs(getWeightForType(w)) for w in node['_act']])
+                    scoreB = math.tanh(scoreB/10)*10
                     gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
     regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
     for wt in weights:
         if abs(weights[wt]) > 1.0:
             gradient[wt] -= weights[wt]*10
         else:
-            gradient[wt] -= weights[wt]*1
+            gradient[wt] -= weights[wt]*3
     for g in gradient:
-        gradient[g] /= len(errSq)
+        gradient[g] /= len(batch)
     if debugPrint:
         print(sum(errSq)/len(errSq), 0.001*regressionLoss)
     fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
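The gradient is a forward-difference estimate rather than an analytic one: for each weight wt, the book's score is recomputed from its cached activations with that single weight nudged by +0.001, and the resulting change in squared error, scaled by 1000 (i.e. 1/0.001), approximates the partial derivative. A sketch of the same estimate outside the graph code (finite_diff_gradient and its arguments are illustrative):

import math

def finite_diff_gradient(rating, act, weights, eps=0.001):
    def predict(bump=None):
        # weighted mean of activations, with one weight optionally nudged;
        # the normalizer uses the unnudged weights, as in the diff
        raw = sum(a * (weights[b] + (eps if b == bump else 0))
                  for b, a in act.items())
        raw /= sum(abs(weights[b]) for b in act)
        return math.tanh(raw / 10) * 10

    base_err = (rating - predict()) ** 2
    # (err(w) - err(w + eps)) / eps: positive if nudging w up helps
    return {b: (base_err - (rating - predict(bump=b)) ** 2) / eps
            for b in act}

grad = finite_diff_gradient(8.0, {'author': 7.75, 'bias': 6.9},
                            {'author': 0.5, 'bias': 1.0})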
@@ -1253,7 +1258,7 @@ def train(initGamma, full=True):
     books = loadBooksFromDB()
     bestWeights = copy.copy(weights)
     mse, gradient = evaluateFitness(books)
-    delta = sum(gradient[g]**2 for g in gradient)
+    delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
     best_mse = mse
     stagLen = 0
     goal = 1.0e-4
@@ -1266,7 +1271,10 @@ def train(initGamma, full=True):
         print({'mse': mse, 'gamma': gamma, 'delta': delta})
         delta = sum(gradient[g]**2 for g in gradient)
         for wt in weights:
+            if wt in gradient:
                 weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
+        #else:
+        #    del weights[wt]
         mse, gradient = evaluateFitness(books)
         if mse < last_mse:
             gamma = gamma*1.25
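train() then takes a normalized ascent step: inside the loop delta is the squared L2 norm of the gradient (the initial delta above is now its root mean square instead), so dividing by sqrt(delta) makes the step length depend mostly on gamma, and gamma itself grows by 1.25x whenever the MSE improves. A condensed sketch of one update, assuming plain weights/gradient dicts:

import math

def apply_gradient_step(weights, gradient, gamma):
    # squared L2 norm; dividing by its square root normalizes the step
    delta = sum(g * g for g in gradient.values())
    for wt in weights:
        if wt in gradient:  # weight types added mid-run have no gradient yet
            weights[wt] += gamma * gradient[wt] / math.sqrt(delta)

weights = {'author': 0.5, 'bias': 1.0}
apply_gradient_step(weights, {'author': 2.0, 'bias': -1.0}, gamma=0.1)
# on improvement the caller would then set gamma *= 1.25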