Compare commits
3 Commits
master
...
neuralBins
Author | SHA1 | Date | |
---|---|---|---|
cbb884b377 | |||
9a02bdc2a8 | |||
01d41f3a82 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,5 +2,5 @@ __pycache__
|
||||
*.html
|
||||
.venv
|
||||
neuralWeights.json
|
||||
neuralWeights.json.bak
|
||||
neuralWeights.json.*
|
||||
.imgLinkCache.json
|
||||
|
46
caliGraph.py
46
caliGraph.py
@@ -418,8 +418,8 @@ def scoreUnread(G, globMu, globStd):
|
||||
if 'score' in adjNode and adjNode['score'] != None:
|
||||
w = adjNode['t']
|
||||
for fb in adjNode['feedbacks']:
|
||||
neuralBins[w].append(fb)
|
||||
feedbacks.append(fb)
|
||||
neuralBins[w].append(adjNode['score'])
|
||||
node['mean'], node['std'] = norm.fit(feedbacks)
|
||||
node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
|
||||
node['se'] = globStd / math.sqrt(len(feedbacks))
|
||||
@@ -427,15 +427,17 @@ def scoreUnread(G, globMu, globStd):
|
||||
neuralBins['sigma'] = [node['std']]
|
||||
neuralBins['median'] = [node['median']]
|
||||
neuralBins['se'] = [node['se']]
|
||||
neuralBins['pagerank'] = [node['pagerank_score']]
|
||||
if 'pagerank_score' in node:
|
||||
neuralBins['pagerank'] = [node['pagerank_score']]
|
||||
if 'tgb_rank' in node:
|
||||
neuralBins['tgbrank'] = [10/math.ln10(10+node['tgb_rank'])]
|
||||
neuralBins['bias'] = [globMu]
|
||||
score = 0
|
||||
nb = dict(neuralBins)
|
||||
act = {}
|
||||
jig = {}
|
||||
for b in nb:
|
||||
act[b] = sum(nb[b])/len(nb[b])
|
||||
act[b], jig[b] = norm.fit(nb[b])
|
||||
score += act[b] * getWeightForType(b)
|
||||
score /= sum([abs(getWeightForType(b)) for b in nb])
|
||||
node['score'] = math.tanh(score/10)*10
|
||||
@@ -1199,7 +1201,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
|
||||
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
|
||||
# but might be necessary to enable later for a larger library for better training performance...
|
||||
# maybe try again for 128 books?
|
||||
def evaluateFitness(books, batchSize=16, debugPrint=False):
|
||||
def evaluateFitness(books, batchSize=16, debugPrint=False, calcPagerank=True):
|
||||
global weights
|
||||
G = buildBookGraph(books)
|
||||
graphAddAuthors(G, books)
|
||||
@@ -1207,7 +1209,8 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
|
||||
graphAddTopLists(G, books)
|
||||
graphAddSeries(G, books)
|
||||
graphAddTags(G, books)
|
||||
runPagerank(G)
|
||||
if calcPagerank:
|
||||
runPagerank(G)
|
||||
|
||||
ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
|
||||
boundsLoss = 0
|
||||
@@ -1216,7 +1219,7 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
|
||||
gradient = {}
|
||||
for w in weights:
|
||||
gradient[w] = 0
|
||||
mu, sigma = genScores(G, books)
|
||||
mu, sigma = genScores(G, books, calcPagerank=runPagerank)
|
||||
batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
|
||||
for b in G.nodes:
|
||||
if b in ratedBooks:
|
||||
@@ -1248,7 +1251,7 @@ def evaluateFitness(books, batchSize=16, debugPrint=False):
|
||||
fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
|
||||
return fit, gradient
|
||||
|
||||
def train(initGamma, full=True):
|
||||
def train(initGamma, full=True, noPagerank=False):
|
||||
global weights
|
||||
if full:
|
||||
for wt in weights:
|
||||
@@ -1257,7 +1260,7 @@ def train(initGamma, full=True):
|
||||
gamma = initGamma
|
||||
books = loadBooksFromDB()
|
||||
bestWeights = copy.copy(weights)
|
||||
mse, gradient = evaluateFitness(books)
|
||||
mse, gradient = evaluateFitness(books, calcPagerank=not noPagerank)
|
||||
delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
|
||||
best_mse = mse
|
||||
stagLen = 0
|
||||
@@ -1275,7 +1278,7 @@ def train(initGamma, full=True):
|
||||
weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
|
||||
#else:
|
||||
# del weights[wt]
|
||||
mse, gradient = evaluateFitness(books)
|
||||
mse, gradient = evaluateFitness(books, calcPagerank=not noPagerank)
|
||||
if mse < last_mse:
|
||||
gamma = gamma*1.25
|
||||
else:
|
||||
@@ -1331,6 +1334,8 @@ def cliInterface():
|
||||
parser.add_argument('--dark', action="store_true")
|
||||
parser.add_argument('--v3d', action="store_true")
|
||||
parser.add_argument('--imgs', action="store_true")
|
||||
parser.add_argument('--perf-test', action="store_true")
|
||||
parser.add_argument('--no-pagerank', action="store_true")
|
||||
cmds = parser.add_subparsers(required=True, dest='cmd')
|
||||
|
||||
p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
|
||||
@@ -1370,14 +1375,33 @@ def cliInterface():
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.perf_test:
|
||||
perfTestCLI(args)
|
||||
else:
|
||||
mainCLI(args)
|
||||
|
||||
def perfTestCLI(args):
|
||||
import time
|
||||
from pycallgraph import PyCallGraph
|
||||
from pycallgraph import Config
|
||||
from pycallgraph import GlobbingFilter
|
||||
from pycallgraph.output import GraphvizOutput
|
||||
config = Config()
|
||||
config.trace_filter = GlobbingFilter(exclude=[
|
||||
"pycallgraph.*",
|
||||
])
|
||||
with PyCallGraph(output=GraphvizOutput(output_file='perfTests/' + str(int(time.time())) + '.png'), config=config):
|
||||
mainCLI(args)
|
||||
|
||||
def mainCLI(args):
|
||||
if args.cmd=="train":
|
||||
train(args.g, args.full)
|
||||
train(args.g, args.full, args.no_pagerank)
|
||||
exit()
|
||||
|
||||
bestListT = 'book'
|
||||
|
||||
G, books = buildFullGraph(darkMode=args.dark)
|
||||
mu, std = genScores(G, books)
|
||||
mu, std = genScores(G, books, calcPagerank=not args.no_pagerank)
|
||||
|
||||
if not args.keep_whitepapers:
|
||||
removeWhitepapers(G)
|
||||
|
Loading…
Reference in New Issue
Block a user