2021-12-05 19:56:26 +01:00
|
|
|
#!./.venv/bin/python3.10
|
2021-06-14 22:20:36 +02:00
|
|
|
import os
|
|
|
|
import json
|
|
|
|
import math
|
2021-09-24 16:13:55 +02:00
|
|
|
import copy
|
2021-06-14 22:20:36 +02:00
|
|
|
import random
|
|
|
|
|
2023-01-17 23:18:26 +01:00
|
|
|
import argcomplete, pyzshcomplete
|
|
|
|
# Enable autocomplete in global completion mode for bash & zsh:
|
|
|
|
# PYTHON_ARGCOMPLETE_OK
|
|
|
|
# PYZSHCOMPLETE_OK
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2023-01-17 23:18:26 +01:00
|
|
|
# performance hack; only load these after validating cmd args
|
|
|
|
def defered_imports():
    """Import the heavy third-party dependencies and publish them as module globals.

    Every name imported here is declared ``global`` so that the rest of the
    module can use it after this function has run.
    """
    global re, requests, np, pd, norm, plt, nx, Network, go, wikipedia

    # text processing / HTTP
    import re
    import requests

    # numerics
    import numpy as np
    import pandas as pd
    from scipy.stats import norm

    # plotting and graph handling
    import matplotlib.pyplot as plt
    import networkx as nx
    from pyvis.network import Network
    import plotly.graph_objects as go

    import wikipedia
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-03-19 11:35:30 +01:00
|
|
|
class Error(Exception):
    """Application-specific error type."""
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
def getAllAuthors(books):
    """Return the distinct author names found across *books* as a list.

    The order of the result is unspecified (it is produced from a set).
    """
    unique = set()
    for book in books:
        unique.update(getAuthors(book))
    return list(unique)
|
|
|
|
|
|
|
|
|
|
|
|
def getAuthors(book):
    """Split the ``' & '``-joined ``book['authors']`` string into a list of names."""
    joined = book['authors']
    return joined.split(' & ')
|
|
|
|
|
|
|
|
|
|
|
|
def getRecommenders(book):
    """Return the distinct recommender names encoded in ``book['tags']``.

    A tag names a recommender when it contains one of the markers below; the
    recommender name is the tag with that marker removed. Only the first
    marker that occurs in a tag (in the order listed) is applied.
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    found = set()
    for tag in book['tags']:
        for marker in markers:
            if marker in tag:
                found.add(tag.replace(marker, ""))
                break
    return list(found)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
def getTags(book):
    """Yield the tags of *book* that carry no recommender or top-list marker."""
    markers = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for tag in book['tags']:
        if not any(marker in tag for marker in markers):
            yield tag
|
|
|
|
|
|
|
|
|
|
|
|
def getAllRecommenders(books):
    """Return the distinct recommender names found across *books* as a list.

    The order of the result is unspecified (it is produced from a set).
    """
    unique = set()
    for book in books:
        unique.update(getRecommenders(book))
    return list(unique)
|
|
|
|
|
|
|
|
|
|
|
|
def getTopLists(book):
    """Return the distinct top-list names of *book*.

    A tag containing ``" Top "`` denotes a top list; its name is everything
    before the first ``" Top "``.
    """
    names = {tag.split(" Top ")[0] for tag in book['tags'] if " Top " in tag}
    return list(names)
|
|
|
|
|
|
|
|
|
|
|
|
def getAllTopLists(books):
    """Return the distinct top-list names found across *books* as a list.

    The order of the result is unspecified (it is produced from a set).
    """
    unique = set()
    for book in books:
        unique.update(getTopLists(book))
    return list(unique)
|
|
|
|
|
|
|
|
|
|
|
|
def getAllSeries(books):
    """Return the distinct ``book['series']`` values of all books that have one."""
    return list({book['series'] for book in books if 'series' in book})
|
|
|
|
|
|
|
|
|
|
|
|
def getAllTags(books):
    """Return the distinct plain tags (as yielded by ``getTags``) across *books*.

    The order of the result is unspecified (it is produced from a set).
    """
    unique = set()
    for book in books:
        unique.update(getTags(book))
    return list(unique)
|
|
|
|
|
|
|
|
|
2021-09-24 16:13:55 +02:00
|
|
|
def getTopListWeight(book, topList):
    """Return the edge weight for *book*'s membership in the top list *topList*.

    The book's tags of the form ``"<topList> Top <N>"`` are inspected and the
    smallest ``N`` (the most exclusive list) determines the weight:
    10 -> 1, 25 -> 0.85, 100 -> 0.5, anything else -> ``50 / N``.

    Raises:
        ValueError: if the book carries no ``"<topList> Top <N>"`` tag.
    """
    prefix = topList + " Top "
    # Match on the tag prefix so that e.g. "Times" does not also pick up
    # "NY Times Top 10".
    scopes = [int(tag[len(prefix):])
              for tag in book['tags'] if tag.startswith(prefix)]
    if not scopes:
        raise ValueError(
            "Book has no '" + topList + " Top <N>' tag; cannot derive a weight")

    minScope = min(scopes)
    fixedWeights = {10: 1, 25: 0.85, 100: 0.5}
    if minScope in fixedWeights:
        return fixedWeights[minScope]
    return 50 / minScope
|
2021-06-14 22:20:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
def removeRead(G):
    """Remove every book node whose ``rating`` is set (not ``None``) from *G*."""
    rated = [n for n in list(G.nodes)
             if G.nodes[n]['t'] == 'book' and G.nodes[n]['rating'] is not None]
    for n in rated:
        G.remove_node(n)
|
|
|
|
|
|
|
|
|
|
|
|
def removeUnread(G):
    """Remove every book node whose ``rating`` is ``None`` from *G*."""
    unrated = [n for n in list(G.nodes)
               if G.nodes[n]['t'] == 'book' and G.nodes[n]['rating'] is None]
    for n in unrated:
        G.remove_node(n)
|
|
|
|
|
|
|
|
|
|
|
|
def removePriv(G):
    """Remove every book node that carries the tag ``'priv'`` from *G*."""
    private = [n for n in list(G.nodes)
               if G.nodes[n]['t'] == 'book' and 'priv' in G.nodes[n]['tags']]
    for n in private:
        G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-10-17 15:47:37 +02:00
|
|
|
def removeWhitepapers(G):
    """Remove every book node tagged ``'whitepaper'`` or ``'Lernzettel'`` from *G*."""
    def isPaper(attrs):
        if attrs['t'] != 'book':
            return False
        return 'whitepaper' in attrs['tags'] or 'Lernzettel' in attrs['tags']

    papers = [n for n in list(G.nodes) if isPaper(G.nodes[n])]
    for n in papers:
        G.remove_node(n)
|
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
|
|
|
|
def removeDangling(G, alsoBooks=False):
    """Remove nodes of *G* that have no neighbours.

    Book nodes are only considered when *alsoBooks* is true. Nodes are visited
    one by one in graph order, so the neighbour count is always the current one.
    """
    for n in list(G.nodes):
        if G.nodes[n]['t'] == 'book' and not alsoBooks:
            continue
        if len(G.adj[n]) == 0:
            G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-06 17:59:21 +01:00
|
|
|
def removeThinRecs(G, minCons=3):
    """Remove recommender nodes of *G* that have fewer than *minCons* neighbours."""
    for n in list(G.nodes):
        if G.nodes[n]['t'] != 'recommender':
            continue
        if len(G.adj[n]) < minCons:
            G.remove_node(n)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
def removeEdge(G):
    """Remove non-book nodes of *G* that have fewer than two neighbours."""
    for n in list(G.nodes):
        isBook = G.nodes[n]['t'] == 'book'
        if not isBook and len(G.adj[n]) < 2:
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
|
|
|
def removeBad(G, threshold, groups=('book', 'topList', 'recommender', 'author', 'series', 'tag')):
    """Remove nodes whose score is ``None`` or below *threshold*.

    Only nodes whose type ``t`` is listed in *groups* and that have a
    ``'score'`` attribute are considered; nodes without the attribute are kept.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in groups or 'score' not in node:
            continue
        if node['score'] is None or node['score'] < threshold:
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
2022-02-06 17:59:21 +01:00
|
|
|
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
    """Keep only the *num* best-scored nodes of type *forType*.

    Removal candidates are nodes of type *forType* outside the top *num*, plus
    any node (of any type) whose ``'score'`` attribute is ``None``. A candidate
    survives only if it has a rating that is not more than *maxDistForRead*
    below the lowest kept score. If no node was kept at all there is no such
    cutoff and every rated candidate survives.
    """
    ranked = [n for n in list(G.nodes)
              if G.nodes[n]['t'] == forType
              and G.nodes[n].get('score') is not None]
    # list.sort is stable, so ties keep graph order
    ranked.sort(key=lambda n: G.nodes[n]['score'], reverse=True)
    kept = ranked[:num]
    keptKeys = set(kept)
    cutoff = G.nodes[kept[-1]]['score'] - maxDistForRead if kept else None

    for n in list(G.nodes):
        node = G.nodes[n]
        surplus = node['t'] == forType and n not in keptKeys
        unscored = 'score' in node and node['score'] is None
        if not (surplus or unscored):
            continue
        rating = node.get('rating')
        if rating is None or (cutoff is not None and rating < cutoff):
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
|
|
|
def removeTags(G):
    """Remove every node of type ``'tag'`` from *G*."""
    tagNodes = [n for n in list(G.nodes) if G.nodes[n]['t'] == 'tag']
    for n in tagNodes:
        G.remove_node(n)
|
|
|
|
|
|
|
|
|
2022-11-19 16:18:58 +01:00
|
|
|
def pruneTags(G, minCons=2, forceKeepLabels=()):
    """Remove tag nodes whose books are already well connected without them.

    Nodes are visited in ascending order of ``score + len(feedbacks)/5``
    (0 when either attribute is missing). For each tag, every neighbour of
    every adjacent book contributes to a connection count: recommenders 0.5,
    tags and series 0.25, top lists nothing, everything else 1. The tag is
    removed when the count exceeds *minCons*, unless its label is listed in
    *forceKeepLabels*.
    """
    def visitOrder(key):
        attrs = G.nodes[key]
        if 'score' in attrs and 'feedbacks' in attrs:
            return attrs['score'] + len(attrs['feedbacks'])/5
        return 0

    for n in sorted(G.nodes, key=visitOrder):
        node = G.nodes[n]
        if node['t'] != 'tag':
            continue
        foundCon = 0
        for book in G.adj[n]:
            for con in G.adj[book]:
                conType = G.nodes[con]['t']
                if conType == 'topList':
                    continue
                if conType == 'recommender':
                    foundCon += 0.5
                elif conType in ('tag', 'series'):
                    foundCon += 0.25
                else:
                    foundCon += 1
        if foundCon > minCons and node['label'] not in forceKeepLabels:
            G.remove_node(n)
|
|
|
|
|
2021-09-25 00:54:09 +02:00
|
|
|
|
|
|
|
def pruneRecommenders(G, minCons=2):
    """Remove recommender nodes whose books are already well connected without them.

    Nodes are visited in ascending ``score`` order (0 when the attribute is
    missing). Every neighbour of every adjacent book contributes to a
    connection count: recommenders 0.5, tags and series 0.25, top lists
    nothing, everything else 1. The recommender is removed when the count
    exceeds *minCons*.
    """
    contribution = {'topList': 0, 'recommender': 0.5, 'tag': 0.25, 'series': 0.25}

    def visitOrder(key):
        attrs = G.nodes[key]
        return attrs['score'] if 'score' in attrs else 0

    for n in sorted(list(G.nodes), key=visitOrder):
        if G.nodes[n]['t'] != 'recommender':
            continue
        foundCon = 0
        for book in G.adj[n]:
            for con in G.adj[book]:
                foundCon += contribution.get(G.nodes[con]['t'], 1)
        if foundCon > minCons:
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
2021-06-15 13:52:41 +02:00
|
|
|
def pruneRecommenderCons(G, maxCons=5):
    """Thin out recommenders that have more than *maxCons* neighbours.

    For each such recommender the *maxCons* best-scored adjacent books are
    protected. Any other adjacent book (and any neighbour whose ``'score'`` is
    ``None``) is removed from the graph if it has no rating and fewer than two
    neighbours that are not top lists.
    """
    for n in list(G.nodes):
        if n not in G.nodes:
            # This node was removed earlier in the same pass; looking it up
            # would raise KeyError.
            continue
        if G.nodes[n]['t'] != 'recommender' or len(G.adj[n]) <= maxCons:
            continue

        scored = [m for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book'
                  and G.nodes[m].get('score') is not None]
        scored.sort(key=lambda m: G.nodes[m]['score'], reverse=True)
        protected = set(scored[:maxCons])

        for m in list(G.adj[n]):
            book = G.nodes[m]
            surplus = book['t'] == 'book' and m not in protected
            unscored = 'score' in book and book['score'] is None
            if not (surplus or unscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(1 for con in G.adj[m]
                           if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-15 14:23:49 +02:00
|
|
|
def pruneAuthorCons(G, maxCons=3):
    """Thin out authors that have more than *maxCons* neighbours.

    For each such author the *maxCons* best-scored adjacent books are
    protected. Any other adjacent book (and any neighbour whose ``'score'`` is
    ``None``) is removed from the graph if it has no rating and fewer than two
    neighbours that are not top lists.
    """
    for n in list(G.nodes):
        if n not in G.nodes:
            # This node was removed earlier in the same pass; looking it up
            # would raise KeyError.
            continue
        if G.nodes[n]['t'] != 'author' or len(G.adj[n]) <= maxCons:
            continue

        scored = [m for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book'
                  and G.nodes[m].get('score') is not None]
        scored.sort(key=lambda m: G.nodes[m]['score'], reverse=True)
        protected = set(scored[:maxCons])

        for m in list(G.adj[n]):
            book = G.nodes[m]
            surplus = book['t'] == 'book' and m not in protected
            unscored = 'score' in book and book['score'] is None
            if not (surplus or unscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(1 for con in G.adj[m]
                           if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-11-19 16:18:58 +01:00
|
|
|
def removeHighSpanTags(G, maxCons=5, forceKeepLabels=()):
    """Remove tag nodes that have more than *maxCons* neighbours.

    Tags whose label is listed in *forceKeepLabels* are never removed.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'tag':
            continue
        if len(G.adj[n]) > maxCons and node['label'] not in forceKeepLabels:
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
2021-09-24 18:25:37 +02:00
|
|
|
def removeHighSpanReadBooks(G, maxCons=8):
    """Remove rated book nodes that have more than *maxCons* recommender neighbours."""
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is None:
            continue
        recommenderCount = sum(
            1 for adj in G.adj[n] if G.nodes[adj]['t'] == 'recommender')
        if recommenderCount > maxCons:
            G.remove_node(n)
|
|
|
|
|
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
def removeTopLists(G):
    """Remove every node of type ``'topList'`` from *G*."""
    topListNodes = [n for n in list(G.nodes) if G.nodes[n]['t'] == 'topList']
    for n in topListNodes:
        G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 23:39:55 +02:00
|
|
|
def removeRecommenders(G):
    """Remove every node of type ``'recommender'`` from *G*."""
    recommenderNodes = [n for n in list(G.nodes)
                        if G.nodes[n]['t'] == 'recommender']
    for n in recommenderNodes:
        G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-05 19:56:26 +01:00
|
|
|
def removeAuthors(G):
    """Remove every node of type ``'author'`` from *G*."""
    authorNodes = [n for n in list(G.nodes) if G.nodes[n]['t'] == 'author']
    for n in authorNodes:
        G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-05 19:56:26 +01:00
|
|
|
def removeSeries(G):
    """Remove every node of type ``'series'`` from *G*."""
    seriesNodes = [n for n in list(G.nodes) if G.nodes[n]['t'] == 'series']
    for n in seriesNodes:
        G.remove_node(n)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
def removeRestOfSeries(G):
    """Remove series entries that lie beyond the next unread volume.

    For every series node, the highest ``series_index`` (truncated to int)
    among its rated neighbours is determined (0 if none is rated). Neighbours
    whose ``series_index`` exceeds that value plus one (with a small tolerance)
    are removed from the graph.
    """
    for n in list(G.nodes):
        if n not in G.nodes:
            # This node was removed earlier in the same pass; looking it up
            # would raise KeyError.
            continue
        if G.nodes[n]['t'] != 'series':
            continue

        seriesState = 0
        for adj in G.adj[n]:
            adjNode = G.nodes[adj]
            if adjNode['rating'] is not None:
                seriesState = max(seriesState, int(adjNode['series_index']))

        for adj in list(G.adj[n]):
            if G.nodes[adj]['series_index'] > seriesState + 1.0001:
                G.remove_node(adj)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-06 16:17:54 +02:00
|
|
|
def removeUnusedRecommenders(G):
    """Remove recommender nodes that are not adjacent to any book carrying a ``'score'`` attribute."""
    for n in list(G.nodes):
        if G.nodes[n]['t'] != 'recommender':
            continue
        hasScoredBook = any(
            G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj]
            for adj in G.adj[n])
        if not hasScoredBook:
            G.remove_node(n)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-08 22:54:49 +02:00
|
|
|
def removeUselessReadBooks(G):
    """Remove rated books that contribute little to scoring other books.

    For each rated book, ``contacts`` is its number of neighbours and
    ``force`` sums, over every neighbour, the number of that neighbour's own
    neighbours that are books with a ``'score'`` attribute or nodes of type
    ``'newBook'``; links through a recommender count 0.5, all others 1. The
    book is removed when ``force`` is below 1.5 or ``contacts`` is below 2.
    """
    minForce = 1.5
    minContact = 2
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is None:
            continue

        contacts = len(G.adj[n])
        force = 0
        for adj in G.adj[n]:
            step = 0.5 if G.nodes[adj]['t'] == 'recommender' else 1
            for cousin in G.adj[adj]:
                cousinNode = G.nodes[cousin]
                if (cousinNode['t'] == 'book' and 'score' in cousinNode) \
                        or cousinNode['t'] == 'newBook':
                    force += step

        if force < minForce or contacts < minContact:
            G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 18:25:37 +02:00
|
|
|
def removeUselessTags(G, minUnread=1):
    """Remove tag nodes adjacent to fewer than *minUnread* books that carry a ``'score'`` attribute."""
    for n in list(G.nodes):
        if G.nodes[n]['t'] != 'tag':
            continue
        scoredBooks = [adj for adj in G.adj[n]
                       if G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj]]
        if len(scoredBooks) < minUnread:
            G.remove_node(n)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-11-30 17:44:12 +01:00
|
|
|
def curiosityReward(G, coeff=1, dTan=False):
    """Raise the score of nodes in proportion to their standard error ``se``.

    Only nodes that have both ``'score'`` and ``'se'`` are touched. The bonus
    is ``se * coeff`` damped by a tanh-based factor of the current score
    (``1 - tanh(...)**2`` centred on 5 when *dTan* is true, ``1 - tanh(score/5)``
    otherwise). The resulting score is clamped to the range 0..10.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if not ('score' in node and 'se' in node):
            continue
        bonus = node['se'] * coeff
        if dTan:
            damping = (1 - math.tanh((node['score']/10-0.5)*7)**2)
        else:
            damping = (1 - math.tanh(node['score']/5))
        bonus *= damping
        raised = node['score'] + bonus
        node['score'] = max(0.0, min(10.0, raised))
|
|
|
|
|
2021-09-26 12:46:30 +02:00
|
|
|
def removeUselessSeries(G, minSco=0):
    """Remove series nodes with fewer than two neighbours or a score below *minSco*."""
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'series':
            continue
        # NOTE(review): for a series with two or more neighbours this compares
        # node['score'] directly; a missing or None score would raise here.
        tooSmall = len(G.adj[n]) < 2
        if tooSmall or node['score'] < minSco:
            G.remove_node(n)
|
2021-09-24 16:13:55 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 18:25:37 +02:00
|
|
|
def scoreOpinions(G, globMu, globStd):
    """Score every non-book node from the ratings of its neighbours.

    For a node with at least one rated neighbour this sets ``mean`` and
    ``std`` (normal fit of the ratings), ``se`` (``globStd / sqrt(count)``),
    ``feedbacks`` (the ratings) and
    ``score = mean + globStd/3 - se``. Nodes without rated neighbours get
    ``score = None``. Book nodes are left untouched. *globMu* is accepted for
    signature symmetry but not used here.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            continue

        feedbacks = [G.nodes[adj]['rating'] for adj in list(G.adj[n].keys())
                     if G.nodes[adj]['rating'] is not None]
        if not feedbacks:
            node['score'] = None
            continue

        node['mean'], node['std'] = norm.fit(feedbacks)
        node['se'] = globStd / math.sqrt(len(feedbacks))
        node['score'] = node['mean'] + globStd/3 - node['se']
        node['feedbacks'] = feedbacks
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 18:25:37 +02:00
|
|
|
def scoreUnread(G, globMu, globStd):
    """Compute a predicted ``score`` for every unrated book node of *G*.

    The score is a weighted average of "feedback" values: the global mean
    *globMu*, every rating stored in the ``feedbacks`` of scored neighbours,
    and a few aggregate features of the book itself (pagerank score, standard
    deviation, median, and *globMu* again as a bias term). The weight of each
    value is looked up through ``getWeightForType``.

    Besides ``score`` the node also receives ``mean``, ``std``, ``median``,
    ``se`` and the debugging attributes ``_act`` (values) and ``_wgh``
    (weight descriptors).
    """
    for n in list(G.nodes):
        # feedbacks[i] is weighted according to ws[i]; a ws entry is
        # [type] or [type, edgeWeight].
        feedbacks = [globMu]
        ws = [['mu']]
        node = G.nodes[n]
        if node['t'] == 'book':
            if node['rating'] == None:
                adjacens = list(G.adj[n].keys())
                for adj in adjacens:
                    adjNode = G.nodes[adj]
                    if 'score' in adjNode and adjNode['score'] != None:
                        # weight descriptor: neighbour type plus the edge
                        # weight (1 when the edge has none)
                        w = [adjNode['t'], G[n][adj]['weight']
                             if 'weight' in G[n][adj] else 1]
                        for fb in adjNode['feedbacks']:
                            feedbacks.append(fb)
                            ws.append(w)
                # feedbacks always contains globMu at this point, so this
                # condition is always true.
                if len(feedbacks):
                    # statistics are taken before the aggregate features
                    # below are appended
                    node['mean'], node['std'] = norm.fit(feedbacks)
                    node['median'] = np.percentile(
                        feedbacks, [50], method='linear')[0]
                    node['se'] = globStd / math.sqrt(len(feedbacks))
                    feedbacks.append(node['pagerank_score'])
                    ws.append(['pagerank'])
                    #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0)
                    # ws.append(['tgb_rank'])
                    feedbacks.append(node['std'])
                    ws.append(['sigma'])
                    feedbacks.append(node['median'])
                    ws.append(['median'])
                    # feedbacks.append(node['se'])
                    # ws.append(['se'])
                    feedbacks.append(globMu)
                    ws.append(['bias'])
                    # weighted average: sum(value * weight) / sum(weight)
                    node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w) > 1 else 1) for fb, w in zip(
                        feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws])
                    node['_act'] = feedbacks
                    node['_wgh'] = ws
                else:
                    # NOTE(review): unreachable (see above); `errorFac` is not
                    # defined anywhere in the visible part of this file.
                    node['score'] = globMu + errorFac * \
                        globStd + len(feedbacks)*0.0000000001
                if 'series' in node:
                    if node['series_index'] == 1.0:
                        # tiny bonus for entries with series_index 1.0
                        node['score'] += 0.000000001
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-24 20:19:00 +01:00
|
|
|
def getWeightForType(nodeType, edgeWeight=1):
    """Return the weight configured for *nodeType* in the global ``weights`` mapping.

    For ``'topList'`` the configured weight is multiplied by *edgeWeight*;
    for every other type *edgeWeight* is ignored.
    """
    base = weights[nodeType]
    return edgeWeight*base if nodeType == 'topList' else base
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-06 18:22:46 +01:00
|
|
|
def printBestList(G, t='book', num=-1):
    """Print the best-scored nodes of type *t*, best first.

    Nodes without a score (missing or ``None``) are skipped. Ties in score
    are broken in favour of the larger ``se``. *num* limits the number of
    printed lines; ``-1`` prints all of them, and ``0`` (or any other
    non-positive value) prints nothing. Books are printed as
    ``Title (Author & Author): score``, other types by their label.
    """
    bestlist = [G.nodes[n] for n in list(G.nodes)
                if G.nodes[n]['t'] == t and G.nodes[n].get('score') is not None]
    bestlist.sort(key=lambda node: node['score'] + 0.00001 * node.get('se', 0),
                  reverse=True)
    if num != -1:
        bestlist = bestlist[:max(num, 0)]

    # Width of the zero-padded rank; computed once, and only via log10 when
    # num is positive (log10 is undefined for 0 and negatives).
    width = int(math.log10(num) + 1) if num > 0 else 4

    for rank, entry in enumerate(bestlist, start=1):
        if t == 'book':
            line = entry['title'] + " ("+" & ".join(entry['authors'])+")" + \
                ": {:.5f}".format(entry['score'])
        else:
            line = entry['label']
        print("["+str(rank).zfill(width)+"] "+line)
|
|
|
|
|
|
|
|
|
|
|
|
def readColor(book):
    """Return ``'green'`` if *book* has a ``'rating'`` key, otherwise ``'gray'``."""
    return 'green' if 'rating' in book else 'gray'
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
def loadBooksFromDB():
    """Fetch all books through ``calibreDB.getBooks()`` and enrich them with MRB data."""
    library = calibreDB.getBooks()
    infuseDataFromMRB(library)
    # infuseDataFromTGB(library)
    return library
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-06 17:59:21 +01:00
|
|
|
def mrbGetBook(mrbdf, title, authors):
    """Look up a book in the MRB data frame by title and author.

    Args:
        mrbdf: DataFrame with at least the columns ``title`` and ``author``.
        title: Book title; anything from the first ``(`` on is ignored, as
            are ``*`` characters and surrounding whitespace.
        authors: List of author names, or a single ``' & '``-joined string.

    Returns:
        The first row (as a dict) whose title contains *title* literally and
        whose author field contains both the first and the last word of at
        least one of *authors*; ``False`` if there is none.
    """
    if isinstance(authors, str):
        # Iterating a plain string would compare single characters.
        authors = authors.split(' & ')

    title = title.split('(')[0].replace('*', '').strip()
    if not title:
        # An empty needle would match every row.
        return False

    # regex=False: titles such as "C++ ..." are not valid/meaningful patterns.
    # na=False: rows without a title never match.
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        known = str(d['author'])
        for author in authors:
            parts = author.split(" ")
            if parts[0] in known and parts[-1] in known:
                return d
    return False
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-15 19:54:14 +01:00
|
|
|
def tgbGetBook(df, title, authors):
    """Look up a book in a TGB data frame by title and author.

    Args:
        df: DataFrame with at least the columns ``title`` and ``author``.
        title: Book title; anything from the first ``(`` on is ignored, as
            are ``*`` characters and surrounding whitespace.
        authors: List of author names, or a single ``' & '``-joined string.

    Returns:
        The first row (as a dict) whose title contains *title* literally and
        whose author field contains both the first and the last word of at
        least one of *authors*; ``False`` if there is none.
    """
    if isinstance(authors, str):
        # Iterating a plain string would compare single characters.
        authors = authors.split(' & ')

    title = title.split('(')[0].replace('*', '').strip()
    if not title:
        # An empty needle would match every row.
        return False

    # regex=False: titles such as "C++ ..." are not valid/meaningful patterns.
    # na=False: rows without a title never match.
    pot = df[df['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        known = str(d['author'])
        for author in authors:
            parts = author.split(" ")
            if parts[0] in known and parts[-1] in known:
                return d
    return False
|
2022-02-06 17:59:21 +01:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-06 17:59:21 +01:00
|
|
|
def infuseDataFromMRB(books):
    """Add ``"<recommender>:MRB"`` tags to *books* from ``rec_dbs/mrb_db.csv``.

    Each book is looked up with ``mrbGetBook``; on a match, every entry of the
    ``|``-separated ``recommender`` field is appended to ``book['tags']`` in
    place.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    for book in books:
        # book['authors'] is a single ' & '-joined string; the lookup needs
        # the individual names.
        mrb = mrbGetBook(mrbdf, book['title'], getAuthors(book))
        if not mrb:
            continue
        recommenders = mrb['recommender']
        if pd.isna(recommenders):
            # An empty CSV cell would otherwise become the tag "nan:MRB".
            continue
        for rec in str(recommenders).split('|'):
            book['tags'].append(rec + ':MRB')
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-15 19:54:14 +01:00
|
|
|
def infuseDataFromTGB(books):
    """Set ``book['tgb_rank']`` from ``rec_dbs/tgb_1.csv`` and ``rec_dbs/tgb_2.csv``.

    Each book is looked up with ``tgbGetBook`` in both files in turn; a match
    stores the row's ``id`` as an int (a match in the second file overwrites
    one from the first).
    """
    for i in range(1, 3):
        df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv')
        for book in books:
            # book['authors'] is a single ' & '-joined string; the lookup
            # needs the individual names.
            tgb = tgbGetBook(df, book['title'], getAuthors(book))
            if tgb:
                book['tgb_rank'] = int(tgb['id'])
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-03-19 11:35:30 +01:00
|
|
|
class calibreDB:
    """Thin wrapper around the ``calibredb`` command line tool.

    All methods are class methods; the class holds no state.
    """

    @classmethod
    def _getTxt(cls, request):
        """Run ``calibredb <request>`` in a shell and return its standard output.

        Raises:
            Error: if the command produced no output.
        """
        # The context manager closes the pipe deterministically instead of
        # leaving it to garbage collection.
        with os.popen("calibredb "+request) as pipe:
            ret = pipe.read()
        if not ret:
            raise Error(
                'Unable to connect to CalibreDB. Please close all open instances of Calibre.')
        return ret

    @classmethod
    def _getJson(cls, request):
        """Run ``calibredb <request>`` and parse its output as JSON."""
        return json.loads(cls._getTxt(request))

    @classmethod
    def getBooks(cls):
        """Return the parsed JSON output of ``calibredb list --for-machine -f all``."""
        return cls._getJson('list --for-machine -f all')

    @classmethod
    def getCustomColumns(cls):
        """Return the first space-separated word of every line of ``calibredb custom_columns``."""
        lines = cls._getTxt('custom_columns').split('\n')
        cols = [line.split(' ')[0] for line in lines]
        return cols

    @classmethod
    def _requireCaliceColumn(cls):
        """Return ``[has_calice_score, has_calice_rating]``.

        Raises:
            Error: if neither custom column exists.
        """
        cols = cls.getCustomColumns()
        avai = ['calice_score' in cols, 'calice_rating' in cols]
        if not any(avai):
            raise Error(
                'Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.')
        return avai

    @classmethod
    def createCaliceRatingColumn(cls):
        """Create the ``calice_rating`` custom column.

        Raises:
            Error: if the column already exists.
        """
        if 'calice_rating' in cls.getCustomColumns():
            raise Error('Custom Column already exists.')
        cls._getTxt("add_custom_column calice_rating 'Calice Rating' rating")

    @classmethod
    def createCaliceScoreColumn(cls):
        """Create the ``calice_score`` custom column.

        Raises:
            Error: if the column already exists.
        """
        if 'calice_score' in cls.getCustomColumns():
            raise Error('Custom Column already exists.')
        cls._getTxt("add_custom_column calice_score 'Calice Score' float")

    @classmethod
    def writeCaliceColumn(cls, bookId, score):
        """Write a single score; see ``writeCaliceColumnMultiple``."""
        cls.writeCaliceColumnMultiple({bookId: score})

    @classmethod
    def writeCaliceColumnMultiple(cls, scores):
        """Write scores to the Calice custom columns, showing a progress bar.

        Args:
            scores: Mapping of book id to a numeric score, or to the string
                ``'<clear>'`` to empty the book's ``calice_score``.

        Numeric scores are written to ``calice_score`` (rounded to 5 decimals)
        and to ``calice_rating`` (rounded to an int), for whichever of the two
        columns exists.
        """
        from tqdm.auto import tqdm
        sco, rat = cls._requireCaliceColumn()
        for bookId in tqdm(scores):
            score = scores[bookId]
            if score == '<clear>':
                # Only address calice_score when that column exists; writing
                # to a missing column cannot succeed.
                # NOTE(review): calice_rating is not cleared here - confirm
                # whether that is intended.
                if sco:
                    cls._getTxt('set_custom calice_score '+str(bookId)+' ""')
            else:
                if sco:
                    cls._getTxt('set_custom calice_score ' +
                                str(bookId)+' '+str(round(score, 5)))
                if rat:
                    cls._getTxt('set_custom calice_rating ' +
                                str(bookId)+' '+str(int(round(score))))
|
|
|
|
|
2022-03-19 11:35:30 +01:00
|
|
|
|
|
|
|
def calice(G):
    """Write the scores of all book nodes of *G* back to Calibre.

    Books with a score (not ``None``) are written with that score; other
    books that have a ``'rating'`` key are written as ``'<clear>'``. The
    mapping is keyed by ``calibreID`` and handed to
    ``calibreDB.writeCaliceColumnMultiple``.
    """
    scores = {}
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book':
            continue
        if node.get('score') is not None:
            scores[node['calibreID']] = node['score']
        elif 'rating' in node:
            scores[node['calibreID']] = '<clear>'
    calibreDB.writeCaliceColumnMultiple(scores)
    print('Done.')
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-11 13:58:01 +01:00
|
|
|
def remove_html_tags(text):
    """Return *text* with every ``<...>`` span (shortest match) removed."""
    tagPattern = re.compile('<.*?>')
    return tagPattern.sub('', text)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
|
|
|
def getKeywords(txt, rake):
    """Extract the strongest keyword phrases from *txt*.

    Args:
        txt: Text to analyse; HTML tags are stripped first.
        rake: Object providing ``extract_keywords_from_text`` and
            ``get_ranked_phrases_with_scores`` (a RAKE instance).

    Returns:
        A list of ``(score, phrase)`` tuples, best first, with title-cased
        phrases. Scores are adjusted by phrase length
        (``score ** (1 / (words * 0.4))``) and only phrases scoring at least
        two thirds of the best one are returned. Empty if nothing usable
        was found.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)

    # phrases that are mostly markup residue or otherwise meaningless
    blocked = ['p', 'die', 'best', 'known', 'fk', 'p pp', 'one']
    k = []
    for score, kw in rake.get_ranked_phrases_with_scores():
        lowered = kw.lower()
        if lowered in blocked or len(kw) <= 3:
            continue
        if 'div' in kw or 'p p' in lowered:
            continue
        words = len(kw.split(' '))
        k.append((score**(1/(words*0.4)), kw))
    k.sort(key=lambda x: x[0], reverse=True)

    if not k:
        return []
    minSco = k[0][0]/3*2
    # k is sorted descending, so this keeps exactly the leading run of strong
    # phrases. Phrases are title-cased on every path so that the same
    # keyword always yields the same tag.
    return [(sco, word.title()) for sco, word in k if sco >= minSco]
|
2021-12-11 13:58:01 +01:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-01-31 13:45:26 +01:00
|
|
|
def runPagerank(G):
    """Store the PageRank of every node of *G* in its ``pagerank_score`` attribute.

    If the power iteration does not converge, a warning is printed and every
    node gets a ``pagerank_score`` of 0.
    """
    try:
        ranks = nx.pagerank(G=G)
    except nx.exception.PowerIterationFailedConvergence:
        print('[!] Could not calculate pagerank-scores: Power iteration of the eigenvector calculation did not converge')
        print('[ ] Recommendations will be of slighly lower quality')
        ranks = {}
    for n in list(G.nodes):
        G.nodes[n]['pagerank_score'] = ranks.get(n, 0)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-11 13:58:01 +01:00
|
|
|
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    """Create a graph containing one ``'book'`` node per entry of *books*.

    Args:
        books: Book dicts with at least ``id``, ``title``, ``tags``,
            ``cover``, ``isbn``, ``formats`` and ``authors``; ``rating``,
            ``comments`` and ``series``/``series_index`` are optional.
        darkMode: Not used by this function.
        extractKeywords: If true, keywords are extracted from the book's
            ``comments`` with RAKE.
        mergeTags: If true, the extracted keyword phrases are appended to the
            node's tags.

    Returns:
        The new ``nx.Graph``; no edges are added here.
    """
    G = nx.Graph()
    if extractKeywords:
        from rake_nltk.rake import Rake
        rake = Rake()

    for book in books:
        hasComments = 'comments' in book
        inSeries = 'series' in book

        keywords = []
        if hasComments and extractKeywords:
            # drop everything but letters, digits, whitespace and dots
            sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+',
                               '', book['comments']).replace('\n', ' ')
            keywords = getKeywords(sanitized, rake)

        tags = book['tags']
        if mergeTags:
            # builds a new list; the book's own tag list is left untouched
            tags = tags + [word for (_, word) in keywords]

        G.add_node(
            book['id'],
            t='book',
            label=book['title'],
            title=book['title'],
            shape='image',
            image=book['cover'],
            rating=book.get('rating'),
            tags=tags,
            keywords=keywords,
            desc=book['comments'] if hasComments else '',
            isbn=book['isbn'],
            files=book['formats'],
            authors=getAuthors(book),
            series=book['series'] if inSeries else None,
            series_index=book['series_index'] if inSeries else None,
            calibreID=book['id'],
        )

    return G
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-11 12:14:24 +01:00
|
|
|
def getWikiImage(search_term):
    """Return the URL of the main image of the Wikipedia article for *search_term*.

    The best search hit must be reasonably similar to *search_term* (fuzzy
    ratio of at least 50). Any failure along the way - no hit, no image,
    network or parsing errors - is reported on stdout and yields ``None``.
    """
    from fuzzywuzzy import fuzz
    WIKI_API = 'http://en.wikipedia.org/w/api.php'
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        result = wikipedia.search(search_term, results=1)
        # search() returns a list; compare against the hit itself, not the list
        if not result or fuzz.ratio(search_term, result[0]) < 50:
            raise LookupError('no sufficiently similar article found')
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=result[0])
        # Passing the title via params URL-encodes it (spaces, '&', ...).
        response = requests.get(WIKI_API, params={
            'action': 'query',
            'prop': 'pageimages',
            'format': 'json',
            'piprop': 'original',
            'titles': wkpage.title,
        }, timeout=10)
        json_data = json.loads(response.text)
        img_link = list(json_data['query']['pages'].values())[
            0]['original']['source']
        return img_link
    except Exception:
        # Best effort: a missing image must not abort the caller. Unlike a
        # bare except, this lets KeyboardInterrupt/SystemExit through.
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def graphAddAuthors(G, books, darkMode=False):
    """Add one ``'author'`` node per author (id ``'a/<name>'``) and connect each to its books.

    Edges are coloured with ``readColor(book)``. *darkMode* is not used.
    Returns *G*.
    """
    for name in getAllAuthors(books):
        G.add_node('a/' + name, color='green', t='author', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getAuthors(book):
            G.add_edge('a/' + name, book['id'], color=edgeColor)
    return G
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def graphAddRecommenders(G, books, darkMode=False):
    """Add one ``'recommender'`` node per recommender (id ``'r/<name>'``) and connect each to its books.

    Edges are coloured with ``readColor(book)``. *darkMode* is not used.
    Returns *G*.
    """
    for name in getAllRecommenders(books):
        G.add_node('r/' + name, color='orange', t='recommender', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getRecommenders(book):
            G.add_edge('r/' + name, book['id'], color=edgeColor)
    return G
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def graphAddTopLists(G, books, darkMode=False):
    """Add one yellow 'topList' node per top list and link it to its books.

    Top-list nodes get the id 't/<name>'. Each edge carries a 'weight' taken
    from getTopListWeight(book, list) and a colour from readColor(book).

    Args:
        G: Graph to extend in place.
        books: Iterable of book dicts.
        darkMode: Accepted for signature parity with the other graphAdd*
            helpers; not used here.

    Returns:
        The same graph object G.
    """
    for listName in getAllTopLists(books):
        G.add_node('t/'+listName, color='yellow', t='topList', label=listName)

    for entry in books:
        for listName in getTopLists(entry):
            G.add_edge('t/'+listName, entry['id'],
                       weight=getTopListWeight(entry, listName),
                       color=readColor(entry))

    return G
|
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def graphAddSeries(G, books, darkMode=False):
    """Add one red, triangular 'series' node per series and link member books.

    Series nodes get the id 's/<name>'. Only books that have a 'series' key
    are linked; the edge colour comes from readColor(book).

    Args:
        G: Graph to extend in place.
        books: Iterable of book dicts.
        darkMode: Accepted for signature parity with the other graphAdd*
            helpers; not used here.

    Returns:
        The same graph object G.
    """
    for seriesName in getAllSeries(books):
        G.add_node('s/'+seriesName, color='red', t='series',
                   label=seriesName, shape='triangle')

    for entry in books:
        if 'series' not in entry:
            continue
        G.add_edge('s/'+entry['series'], entry['id'], color=readColor(entry))

    return G
|
|
|
|
|
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def graphAddTags(G, books, darkMode=False):
    """Add one box-shaped 'tag' node per tag and link it to the tagged books.

    Tag nodes get the id 't/<tag>' and are drawn in 'darkgray' when darkMode
    is truthy, otherwise in 'lightGray'. Edges are coloured with
    readColor(book).

    Args:
        G: Graph to extend in place.
        books: Iterable of book dicts.
        darkMode: Selects the tag colour; any truthy value means dark mode.

    Returns:
        The same graph object G.
    """
    # NOTE(review): tag ids use the same 't/' prefix as the top-list ids built
    # by graphAddTopLists, so a tag and a top list with the same name would
    # share one node -- confirm whether that can happen in practice.

    # Chosen once with a conditional instead of indexing a list with the flag,
    # which only worked for bool/0/1 values.
    tagColor = 'darkgray' if darkMode else 'lightGray'
    for tag in getAllTags(books):
        G.add_node('t/'+tag, color=tagColor, t='tag', label=tag, shape='box')
    for book in books:
        for tag in getTags(book):
            G.add_edge('t/'+tag, book['id'], color=readColor(book))
    return G
|
|
|
|
|
|
|
|
|
|
|
|
def calcRecDist(G, books):
    """Fit a normal distribution to the ratings of all rated books.

    For every book the 'rating' attribute of its graph node (looked up via
    book['id']) is collected; nodes whose rating is None are skipped. A rating
    of 0 counts as a rating.

    Args:
        G: Graph whose nodes carry a 'rating' attribute for every book id.
        books: Iterable of book dicts providing 'id'.

    Returns:
        The result of scipy.stats.norm.fit on the collected ratings, i.e. the
        (mean, standard deviation) pair.
    """
    ratings = (G.nodes[book['id']]['rating'] for book in books)
    globRatings = [rating for rating in ratings if rating is not None]
    return norm.fit(globRatings)
|
|
|
|
|
|
|
|
|
|
|
|
def scaleBooksByRating(G):
    """Set the display size ('value') of every node from its rating or score.

    Despite the name this touches all nodes (the original type filter was an
    always-true ``not in []`` test):

    * nodes with a non-None 'rating' get ``20 + 5 * int(rating)``;
    * otherwise nodes with a non-None 'score' get ``20 + int(5 * score)``;
    * all remaining nodes get 15.

    Args:
        G: Graph whose node attribute dicts are updated in place.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node.get('rating') is not None:
            node['value'] = 20 + 5 * int(node['rating'])
        elif node.get('score') is not None:
            # Multiply before truncating so fractional scores still change the size.
            node['value'] = 20 + int(5 * float(node['score']))
        else:
            node['value'] = 15
|
|
|
|
|
|
|
|
|
|
|
|
def scaleOpinionsByRating(G):
    """Set the display size ('value') of opinion nodes from their score.

    Only nodes of type 'topList', 'recommender', 'author' or 'series' are
    changed. A non-None 'score' yields ``20 + 5 * int(score)`` (the score is
    truncated before scaling); nodes without a score get 20.

    Args:
        G: Graph whose node attribute dicts are updated in place.
    """
    opinionTypes = ('topList', 'recommender', 'author', 'series')
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in opinionTypes:
            continue
        if node.get('score') is not None:
            node['value'] = 20 + 5 * int(node['score'])
        else:
            node['value'] = 20
|
|
|
|
|
|
|
|
|
|
|
|
def addScoreToLabels(G, labelTags=False):
    """Append the rating or estimated score to each node's label.

    For every node that is neither a 'tag' nor a 'newBook':

    * a non-None 'rating' is appended as `` (<rating>)``;
    * otherwise a non-None 'score' together with an 'se' entry is appended
      as `` (<score>±<se>)``;
    * otherwise `` (0±∞)`` is appended.

    'newBook' nodes are never touched here. 'tag' nodes are only labelled when
    labelTags is true (this was a hard-disabled branch before), using
    `` (<score>)`` or `` (0)``.

    Args:
        G: Graph whose node labels are extended in place.
        labelTags: Opt-in switch for labelling tag nodes; off by default,
            which keeps the previous behaviour.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        kind = node['t']
        hasEstimate = node.get('score') is not None and 'se' in node
        if kind == 'tag':
            if labelTags:
                if hasEstimate:
                    node['label'] += " ({:.1f})".format(node['score'])
                else:
                    node['label'] += " (0)"
        elif kind != 'newBook':
            if node.get('rating') is not None:
                node['label'] += " ("+str(node['rating'])+")"
            elif hasEstimate:
                node['label'] += " ({:.2f}±{:.1f})".format(
                    node['score'], node['se'])
            else:
                node['label'] += " (0±∞)"
|
2021-06-14 22:20:36 +02:00
|
|
|
|
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
    """Render the graph with pyvis and show it as 'nx.html'.

    Args:
        G: networkx graph to display.
        showButtons: If true, enable the pyvis configuration panels for
            'configure', 'layout', 'interaction', 'physics' and 'edges'.
        darkMode: Any truthy value selects the dark background '#181818',
            otherwise white is used.
        arrows: Passed to pyvis as ``directed``.
    """
    # A conditional instead of indexing a two-element list with the flag, so
    # any truthy/falsy value works rather than only bool/0/1.
    background = '#181818' if darkMode else '#FFFFFF'
    net = Network('1050px', '1900px',
                  directed=arrows,
                  bgcolor=background)
    if showButtons:
        net.show_buttons(filter_=['configure', 'layout',
                                  'interaction', 'physics', 'edges'])
    net.from_nx(G)
    net.show('nx.html')
|
|
|
|
|
|
|
|
|
2022-02-11 12:14:24 +01:00
|
|
|
def genAndShow3D(G, darkMode=False):
    """Render the graph as an interactive 3D plot with plotly and show it.

    Node positions come from a 3D networkx spring layout with a random seed,
    so every call produces a different arrangement. Node size is derived from
    the node's 'value' attribute, the hover text from its 'label'.

    Node colours:
        * 'tag' nodes are gray;
        * 'book' nodes are light blue when they carry a 'score' key
          (unread book) and magenta otherwise;
        * any other node uses its own 'color' attribute, or black if absent.

    Args:
        G: networkx graph; every node needs 't', 'label' and 'value'.
        darkMode: Any truthy value selects the dark background '#181818'.
    """
    background = '#181818' if darkMode else '#FFFFFF'

    node_sizes = []
    node_labels = []
    node_cols = []
    for n in G.nodes:
        node = G.nodes[n]
        if node['t'] == 'tag':
            colour = 'gray'
        elif node['t'] == 'book':
            colour = 'lightblue' if 'score' in node else 'magenta'
        elif 'color' in node:
            colour = node['color']
        else:
            colour = 'black'
        node_cols.append(colour)
        node_labels.append(node['label'])
        # Clamp at zero: a negative base raised to 1.5 would yield a complex
        # number, which is not a usable marker size.
        node_sizes.append((max(node['value'], 0)/8)**1.5)

    spring = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536))
    # Walk G.nodes (not the layout dict) so the coordinates are guaranteed to
    # line up with the size/label/colour lists built above.
    x_nodes = [spring[n][0] for n in G.nodes]
    y_nodes = [spring[n][1] for n in G.nodes]
    z_nodes = [spring[n][2] for n in G.nodes]

    # Each edge contributes "start, end, None"; the None breaks the line so
    # that all edges can live in a single trace.
    x_edges = []
    y_edges = []
    z_edges = []
    for edge in G.edges():
        start, end = spring[edge[0]], spring[edge[1]]
        x_edges += [start[0], end[0], None]
        y_edges += [start[1], end[1], None]
        z_edges += [start[2], end[2], None]

    # NOTE(review): edges stay black in dark mode as before -- confirm that
    # this is readable on the '#181818' background.
    trace_edges = go.Scatter3d(x=x_edges,
                               y=y_edges,
                               z=z_edges,
                               mode='lines',
                               line=dict(color='black', width=2),
                               hoverinfo='none')

    trace_nodes = go.Scatter3d(x=x_nodes,
                               y=y_nodes,
                               z=z_nodes,
                               mode='markers',
                               marker=dict(symbol='circle',
                                           size=node_sizes,
                                           color=node_cols,
                                           line=dict(color='gray', width=0.5)),
                               text=node_labels,
                               hoverinfo='text')

    # Shared settings that hide every axis decoration.
    axis = dict(showbackground=False,
                showline=False,
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title='')

    layout = go.Layout(title="",
                       width=1920,
                       height=1080,
                       plot_bgcolor=background,
                       paper_bgcolor=background,
                       showlegend=False,
                       scene=dict(xaxis=dict(axis),
                                  yaxis=dict(axis),
                                  zaxis=dict(axis),
                                  ),
                       margin=dict(l=0, r=0, b=0, t=0),
                       hovermode='closest')

    data = [trace_edges, trace_nodes]
    fig = go.Figure(data=data, layout=layout)

    fig.show()
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-10-05 18:25:27 +02:00
|
|
|
def buildFullGraph(darkMode=False):
    """Load all books and build the complete graph around them.

    The books come from loadBooksFromDB(); buildBookGraph() creates the base
    graph, and the author, recommender, top-list, series and tag layers are
    then added in that order.

    Args:
        darkMode: Forwarded to buildBookGraph and every graphAdd* helper.

    Returns:
        A (G, books) tuple: the populated graph and the loaded book list.
    """
    books = loadBooksFromDB()
    G = buildBookGraph(books, darkMode=darkMode)

    layers = (graphAddAuthors, graphAddRecommenders, graphAddTopLists,
              graphAddSeries, graphAddTags)
    for addLayer in layers:
        addLayer(G, books, darkMode=darkMode)

    return G, books
|
|
|
|
|
|
|
|
|
2022-02-04 20:34:59 +01:00
|
|
|
def genScores(G, books, calcPagerank=True):
    """Compute the global rating distribution and run the scoring passes.

    Steps, in order: calcRecDist, optionally runPagerank, scoreOpinions, then
    scoreUnread (the latter two receive the fitted mean and deviation).

    Args:
        G: Graph to score in place.
        books: Book dicts, used for the rating distribution.
        calcPagerank: When false, the runPagerank(G) step is skipped.

    Returns:
        The (mean, standard deviation) pair obtained from calcRecDist.
    """
    fitted = calcRecDist(G, books)
    globMu, globStd = fitted

    if calcPagerank:
        runPagerank(G)

    for scorer in (scoreOpinions, scoreUnread):
        scorer(G, globMu, globStd)

    return globMu, globStd
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-11 12:14:24 +01:00
|
|
|
def addImageToNode(node, cache, shape='circularImage'):
    """Attach a Wikipedia image to a node, using and updating a link cache.

    The lookup name is the node's label up to the first ' (' with all '*'
    characters removed. Names missing from the cache are looked up with
    getWikiImage; names cached as a miss (False) are retried with a
    probability of 5% per call. The cache stores the image URL on success and
    False on failure.

    Args:
        node: Node attribute dict; 'image' and 'shape' are set when an image
            is available.
        cache: Dict mapping names to an image URL or False; updated in place.
        shape: Value written to node['shape'] when an image is found.
    """
    name = node['label'].split(' (')[0].replace('*', '')
    if name not in cache or (cache[name] is False and random.random() < 0.05):
        img = getWikiImage(name)
        # Store False (not None) for misses so they can be told apart and
        # serialised to JSON as an explicit "looked up, nothing found".
        cache[name] = img if img else False
    else:
        img = cache[name]
    if img:
        node['image'] = img
        node['shape'] = shape
|
|
|
|
|
2022-02-11 12:14:24 +01:00
|
|
|
|
|
|
|
def addImagesToNodes(G):
    """Add Wikipedia images to all recommender and author nodes.

    Image links are cached in '.imgLinkCache.json' in the working directory.
    A missing, unreadable or corrupt cache file is treated as an empty cache.
    Authors are drawn with shape 'image', recommenders with 'circularImage'.
    The cache is written back even if a lookup raises, so links fetched up to
    that point are kept.

    Args:
        G: Graph whose node attribute dicts are updated in place.
    """
    try:
        with open('.imgLinkCache.json', 'r') as cf:
            cache = json.load(cf)
    except (IOError, ValueError):
        # ValueError covers json.JSONDecodeError for a damaged cache file.
        cache = {}
    try:
        for n in list(G.nodes):
            node = G.nodes[n]
            if node['t'] in ['recommender', 'author']:
                shape = 'image' if node['t'] == 'author' else 'circularImage'
                addImageToNode(node, cache, shape)
    finally:
        with open('.imgLinkCache.json', 'w') as cf:
            json.dump(cache, cf)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-25 00:54:09 +02:00
|
|
|
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
    """Reduce the graph to about n recommendations, keeping recommender links.

    This is a fixed sequence of pruning helpers applied to G in place; the
    order of the calls is significant. Afterwards node sizes and labels are
    updated.

    Args:
        G: Scored graph, modified in place.
        mu: Mean of the rating distribution (used for the score cut-offs).
        std: Standard deviation of the rating distribution.
        n: Number of books to keep in the end.
        removeTopListsB: If true, removeTopLists(G) is applied.
        removeUselessRecommenders: If true, removeUnusedRecommenders(G) is
            applied near the end.
    """
    scoreFloor = mu-std*2-1
    wideKeep = int(n*2) + 5
    narrowKeep = n+math.ceil(n/20)

    # Coarse reduction.
    removeRestOfSeries(G)
    removeBad(G, scoreFloor)
    removeKeepBest(G, wideKeep, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 6)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 10)

    # Quality filtering and connection pruning.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 6)
    pruneRecommenderCons(G, int(n/7)+1)
    pruneAuthorCons(G, int(n/15))
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down to the final selection.
    removeKeepBest(G, narrowKeep, maxDistForRead=1.5)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    removeUselessReadBooks(G)
    if removeUselessRecommenders:
        removeUnusedRecommenders(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
2021-09-24 23:39:55 +02:00
|
|
|
|
|
|
|
def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True):
    """Reduce the graph to about n recommendations, keeping tag links.

    This is a fixed sequence of pruning helpers applied to G in place; the
    order of the calls is significant. Recommender nodes are dropped via
    removeRecommenders(G). Afterwards node sizes and labels are updated.

    Args:
        G: Scored graph, modified in place.
        mu: Mean of the rating distribution (used for the score cut-offs).
        std: Standard deviation of the rating distribution.
        n: Number of books to keep in the end.
        removeTopListsB: If true, removeTopLists(G) is applied.
    """
    scoreFloor = mu-std*2-1
    wideKeep = int(n*2) + 5
    narrowKeep = n+math.ceil(n/20)

    # Coarse reduction.
    removeRestOfSeries(G)
    removeBad(G, scoreFloor)
    removeKeepBest(G, wideKeep, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 12)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 24)

    # Quality filtering and connection pruning.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 16)
    pruneAuthorCons(G, int(n/5))
    removeRecommenders(G)
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down to the final selection.
    removeKeepBest(G, narrowKeep, maxDistForRead=1.5)
    removeUselessReadBooks(G)
    removeUselessTags(G)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-11 18:12:49 +01:00
|
|
|
def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False):
    """Reduce the graph to about n recommendations (combined strategy).

    This is a fixed sequence of pruning helpers applied to G in place; the
    order of the calls is significant. Afterwards node sizes and labels are
    updated.

    Args:
        G: Scored graph, modified in place.
        mu: Mean of the rating distribution (used for the score cut-offs).
        std: Standard deviation of the rating distribution.
        n: Number of books to keep in the end.
        removeTopListsB: If true, removeTopLists(G) is applied.
        removeUselessRecommenders: Accepted but not read by this function.
        v3d: When true, the final removeThinRecs threshold stays at 2 even
            for n > 20.
    """
    wideKeep = int(n*2) + 5
    narrowKeep = n+math.ceil(n/20)+3
    # 3 only for larger, non-3D selections; otherwise 2.
    finalThinLimit = 3 if (n > 20 and not v3d) else 2

    # Coarse reduction.
    removeRestOfSeries(G)
    removeBad(G, mu-std-0.5)
    removeBad(G, mu+std/2, groups=['recommender'])
    removeThinRecs(G, 3)
    removeKeepBest(G, wideKeep, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 8)
    pruneTags(G, 7)
    removeHighSpanReadBooks(G, 14)
    removeDangling(G, alsoBooks=False)

    # Quality filtering and connection pruning.
    pruneRecommenders(G, 12)
    removeThinRecs(G, 3)
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneAuthorCons(G, int(n/5)+3)
    pruneRecommenders(G, 12 - min(4, n/20))
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    pruneTags(G, 6)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down to the final selection.
    removeKeepBest(G, narrowKeep, maxDistForRead=1.5)
    removeEdge(G)
    removeKeepBest(G, n+1, maxDistForRead=1.25)
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    removeUselessReadBooks(G)
    removeThinRecs(G, finalThinLimit)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
2021-09-24 23:39:55 +02:00
|
|
|
|
2021-10-13 15:10:12 +02:00
|
|
|
def listScores(G, mu, std, n):
    """Prepare the graph for a plain listing of the n best books.

    Applies removeRestOfSeries, keeps the best n via removeKeepBest with
    maxDistForRead=10, then updates node sizes and labels.

    Args:
        G: Scored graph, modified in place.
        mu: Not read by this function.
        std: Not read by this function.
        n: Number of books to keep.
    """
    removeRestOfSeries(G)
    removeKeepBest(G, n, maxDistForRead=10)

    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
|
|
|
|
2021-10-04 12:27:12 +02:00
|
|
|
def fullGraph(G, removeTopListsB=True):
    """Lightly prune the complete graph for display.

    Applies, in order: removeEdge, removeHighSpanTags(G, 7),
    removeDangling(alsoBooks=False), optionally removeTopLists,
    pruneTags(G, 3) and removeDangling(alsoBooks=True); then updates node
    sizes and labels.

    Args:
        G: Scored graph, modified in place.
        removeTopListsB: If true, removeTopLists(G) is applied.
    """
    removeEdge(G)
    removeHighSpanTags(G, 7)
    removeDangling(G, alsoBooks=False)

    if removeTopListsB:
        removeTopLists(G)

    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)

    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-05 19:56:26 +01:00
|
|
|
def recommenderCompetence(G):
    """Reduce the graph to read books and their recommenders, then penalise
    uncertain recommender scores.

    Unread books, tags, authors, series and top lists are removed, followed
    by removeEdge and removeDangling. Sizes and labels are generated next, so
    they reflect the unadjusted scores. Finally each recommender's stored
    'score' is lowered:

    * with an 'se' entry: ``score -= se``;
    * without one: a missing/None/zero score becomes 0, and the score is
      halved.

    Args:
        G: Scored graph, modified in place.
    """
    removeUnread(G)
    removeTags(G)
    removeAuthors(G)
    removeSeries(G)
    removeTopLists(G)

    removeEdge(G)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'recommender':
            continue
        if 'se' in node:
            node['score'] -= node['se']
        else:
            # .get() so a recommender that never received a 'score' key is
            # treated like one with an empty score instead of raising KeyError.
            node['score'] = (node.get('score') or 0) / 2
|
2021-12-05 19:56:26 +01:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-06 16:17:54 +02:00
|
|
|
def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True):
    """Reduce the graph to the books that have been read, for analysis.

    Applies, in order: removeUnread, removeBad(minRating), removeEdge (only
    when showAllTags is false), removeHighSpanTags(G, 15), removeDangling,
    optionally removeTopLists, and pruneTags(G, 8); then updates node sizes
    and labels.

    Args:
        G: Scored graph, modified in place.
        minRating: Threshold handed to removeBad.
        showAllTags: When false, removeEdge(G) is applied as well.
        removeUnconnected: Passed to removeDangling as alsoBooks.
        removeTopListsB: If true, removeTopLists(G) is applied.
    """
    removeUnread(G)
    removeBad(G, minRating)

    hideSomeTags = not showAllTags
    if hideSomeTags:
        removeEdge(G)

    removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=removeUnconnected)

    if removeTopListsB:
        removeTopLists(G)

    pruneTags(G, 8)

    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-15 19:35:03 +01:00
|
|
|
def progress(G, books, mu, minimum=3.5):
    """Print reading-progress statistics for the library.

    New book suggestions are first added with findNewBooks. Then all 'book'
    and 'newBook' nodes are counted:

    * library books: nodes of type 'book';
    * read books: nodes with a non-None 'rating';
    * recommended books: unrated nodes with a 'score' key whose score is at
      least `minimum`, or whose 'std' entry equals 0.0.

    Progress is read / (read + recommended) in percent; it is reported as 0.0
    when both counts are zero.

    Args:
        G: Scored graph; extended in place by findNewBooks.
        books: Book dicts, forwarded to findNewBooks.
        mu: Mean rating, forwarded to findNewBooks.
        minimum: Score from which an unread book counts as recommended.
    """
    findNewBooks(G, books, mu, -1, minRecSco=minimum)
    bookCount = 0
    libCount = 0
    readCount = 0
    toReadCount = 0
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ['book', 'newBook']:
            continue
        if node['t'] == 'book':
            libCount += 1
        bookCount += 1
        if node.get('rating') is not None:
            readCount += 1
        elif 'score' in node:
            score = node['score']
            # Guard against a None score, which cannot be compared to a number.
            goodScore = score is not None and score >= minimum
            if goodScore or ('std' in node and node['std'] == 0.0):
                toReadCount += 1
    relevant = toReadCount + readCount
    # Avoid dividing by zero when nothing is read and nothing is recommended.
    perc = round(readCount / relevant * 100, 2) if relevant else 0.0
    print('Books in library: '+str(libCount))
    print('Books in CaliGraph: '+str(bookCount))
    print('Read Books: '+str(readCount))
    print('Unread Books: '+str(bookCount-readCount))
    print('Recommended Books (score > '+str(round(minimum, 2))+'): '+str(toReadCount))
    print('Progress: '+str(perc)+'%')
|
|
|
|
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2022-02-11 17:50:07 +01:00
|
|
|
def analyze(G, books, mu, type_name, name, dist=2.1):
    """Reduce the graph to the neighbourhood of one named node.

    The node is searched among nodes of type `type_name` (or all nodes for
    "any"): an exact label match wins immediately, otherwise the label with
    the highest fuzzywuzzy ratio is used. If the best ratio is below 70 the
    chosen label is printed. New book suggestions are then added with
    findNewBooks, waveFlow collects the nodes to keep starting from the
    match, and everything else is removed. The matched node is marked with a
    'star' shape (unless it already has a shape) and its label is wrapped in
    asterisks.

    Args:
        G: Scored graph, modified in place.
        books: Book dicts, forwarded to findNewBooks.
        mu: Mean rating, forwarded to findNewBooks.
        type_name: Node type to search in, or "any".
        name: Label to look for.
        dist: Travel budget handed to waveFlow; also controls which clean-up
            steps run afterwards.

    Raises:
        Error: If no candidate node could be matched at all.
    """
    from fuzzywuzzy import fuzz
    type_ident = type_name[0]
    full_name = type_ident + "/" + name
    bestRatio, match, n = 0, None, 0
    for ni in list(G.nodes):
        node = G.nodes[ni]
        if node['t'] == type_name or type_name == "any":
            if name == node['label'] or full_name == node['label']:
                # An exact hit is a perfect match; recording that keeps the
                # "Best Match" hint below from being printed for it.
                bestRatio, match, n = 100, node, ni
                break
            ratio = fuzz.ratio(node['label'], name)
            if ratio > bestRatio:
                bestRatio, match, n = ratio, node, ni
    if match is None:
        # Previously this surfaced as a TypeError when subscripting None.
        raise Error("No "+type_name+" node matching '"+name+"' found")
    if bestRatio < 70:
        print("Best Match: "+match['label'])

    findNewBooks(G, books, mu, num=-1, minRecSco=1)

    menge = set()
    waveFlow(G, match, n, dist, menge)
    for other in list(G.nodes):
        if other not in menge:
            G.remove_node(other)
    if dist >= 2:
        removeThinRecs(G, 2)
    removeHighSpanTags(G, 12, forceKeepLabels=[match['label']])
    if dist > 1:
        removeDangling(G, True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    if 'shape' not in match:
        match['shape'] = 'star'
    addScoreToLabels(G)
    # Highlight the analysed node after the score suffix has been appended.
    match['label'] = "*"+match['label']+"*"
|
2021-06-16 15:12:26 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-06-23 15:45:32 +02:00
|
|
|
def waveFlow(G, node, n, dist, menge, firstEdge=False):
    """Recursively collect the nodes reachable from n within a travel budget.

    Every visited node key is added to `menge`. Each step costs 1 unit of
    `dist`; recursion stops once the budget is used up. At every node the
    neighbours are split into those ranked by 'score' and those that only
    have a 'rating'; only the best few (depending on the remaining budget)
    are followed. Neighbours with neither get a 'score' of 0 assigned.

    Special cases:
        * a 'topList' node is never expanded and is only recorded when
          firstEdge is set;
        * a 'tag' node is recorded, but only expanded when firstEdge is set
          (at an additional cost of 0.1).

    NOTE(review): firstEdge is forced to True when `menge` is empty and then
    handed on unchanged to every recursive call, so for a walk started with
    an empty set it stays True throughout -- confirm this is intended.

    Args:
        G: Graph providing `nodes` (attribute dicts) and `adj` (neighbours).
        node: Attribute dict of the node to start from.
        n: Key of that node in G.
        dist: Remaining travel budget.
        menge: Set of collected node keys; updated in place.
        firstEdge: See the special cases above.
    """
    if dist <= 0:
        return
    dist -= 1
    if not menge:
        firstEdge = True
    if node['t'] in ['topList']:
        if firstEdge:
            menge.add(n)
        return
    menge.add(n)
    if node['t'] in ['tag']:
        if firstEdge:
            dist -= 0.1
        else:
            return

    # Candidates are kept as (key, attributes) pairs so the final selection
    # can be made by node key instead of by comparing attribute dicts, which
    # treated distinct nodes with equal attributes as the same node.
    scored = []
    rated = []
    for m in list(G.adj[n]):
        neighbour = G.nodes[m]
        if neighbour['t'] not in ['NOTHING']:
            if neighbour.get('score') is not None:
                scored.append((m, neighbour))
            elif neighbour.get('rating') is not None:
                rated.append((m, neighbour))
            else:
                neighbour['score'] = 0
                scored.append((m, neighbour))
    scored.sort(key=lambda pair: pair[1]['score'], reverse=True)
    toKeep = min(int(dist*10),
                 math.ceil(len(scored) * dist - len(rated)*0.5))
    if toKeep <= 0:
        # No room for scored neighbours: follow only the best rated ones.
        rated.sort(key=lambda pair: pair[1]['rating'], reverse=True)
        rated = rated[:min(int(dist*10), int(len(rated) * dist))]
        scored = []
    else:
        scored = scored[:toKeep]

    selected = {m for m, _ in scored} | {m for m, _ in rated}
    for m in list(G.adj[n]):
        if m in selected:
            waveFlow(G, G.nodes[m], m, dist, menge, firstEdge=firstEdge)
|
2021-06-16 15:12:26 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-11 13:58:01 +01:00
|
|
|
def gensimTokensForLines(lines, tokens_only=False):
    """Yield gensim-preprocessed tokens for each line.

    Args:
        lines: Iterable of text lines.
        tokens_only: If true, yield the plain token list for each line.
            Otherwise (the default, meant for training data) yield a
            gensim TaggedDocument tagged with the line's index.

    Yields:
        One token list or TaggedDocument per input line.
    """
    # gensim is not imported at module level, and `tokens_only` used to be an
    # undefined name; both made the generator fail on its first item.
    import gensim
    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-11 13:58:01 +01:00
|
|
|
def buildDoc2Vec(books):
    """Unfinished sketch of a Doc2Vec model builder.

    As written, the function walks the graph's 'book' nodes without doing
    anything with them and then calls gensimTokensForLines, discarding the
    result. The `books` argument is not read.
    """
    import gensim
    # FIXME: `G` is neither a parameter nor a local of this function; unless
    # a module-level `G` exists, this loop raises NameError.
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            # Placeholder: nothing is collected from the book nodes yet.
            pass
    # FIXME: `lines` is never defined in this function, and the generator
    # returned here is neither consumed nor stored.
    gensimTokensForLines(lines)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-12-11 13:58:01 +01:00
|
|
|
def shell(G, books, mu, std):
    """Open an interactive ptpython REPL.

    The REPL is given the module globals and this function's locals, so the
    arguments G, books, mu and std are available by name inside the session.
    """
    from ptpython.repl import embed
    # locals() captures this frame: do not introduce extra local variables
    # here, they would show up in the interactive namespace.
    embed(globals(), locals())
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-06 17:59:21 +01:00
|
|
|
def newBooks(G, books, num, mu, std):
    """Reduce the graph to suggested new books and their recommenders.

    Applies removeBad(mu - 2*std), adds suggestions with findNewBooks (limit
    num, minRecSco = mu - std), applies removeThinRecs(G, 2) and then a fixed
    series of removal helpers; finally node sizes and labels are updated.

    Args:
        G: Scored graph, modified in place.
        books: Book dicts, forwarded to findNewBooks.
        num: Number of new books to keep (forwarded to findNewBooks).
        mu: Mean of the rating distribution.
        std: Standard deviation of the rating distribution.
    """
    removeBad(G, mu-std*2)
    findNewBooks(G, books, mu, num, minRecSco=mu-std)
    removeThinRecs(G, 2)

    cleanups = (removeUnread, removeUselessReadBooks, removeTags,
                removeTopLists, removeSeries, removeEdge)
    for cleanup in cleanups:
        cleanup(G)
    removeDangling(G, alsoBooks=True)

    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
|
|
|
|
|
|
|
|
|
2022-02-11 17:50:07 +01:00
|
|
|
def findNewBooks(G, books, mu, num=-1, minRecSco=5):
    """Add books from the recommendation database that are not in the library.

    'rec_dbs/mrb_db.csv' is read and, for every recommender node that has a
    'score' key, all rows whose 'recommender' column contains the node's
    label are collected. Rows whose title is not among `books` become blue
    'newBook' nodes ('n/<title>'), linked to the recommender and to an author
    node ('a/<author>').

    Each new book then gets a score computed from the scores and 'se' values
    of its adjacent recommenders, plus `mu` as one extra sample. New books
    without any usable recommender are removed again. The score and the
    author are appended to the label.

    Args:
        G: Scored graph, modified in place.
        books: Book dicts of the existing library (only 'title' is read).
        mu: Mean rating, mixed into every new book's score.
        num: If not -1, only the best `num` new books are kept
            (via removeKeepBest).
        minRecSco: Not read by this function.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    # Built once; it used to be rebuilt as a list for every database row.
    knownTitles = {b['title'] for b in books}
    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            newBooks = []
            # regex=False: the label is a name, not a pattern ('.', '(' or '+'
            # must match literally). na=False: rows without a recommender
            # must not turn the mask into a non-boolean one.
            mask = mrbdf['recommender'].str.contains(
                node['label'], regex=False, na=False)
            recBooks = mrbdf[mask].to_dict(orient='records')
            for book in recBooks:
                entry = {'title': book['title'], 'author': book['author']}
                if book['title'] in knownTitles:
                    oldBooks.append(entry)
                else:
                    newBooks.append(entry)
            recs.append({'name': node['label'], 'rec': node,
                         'newBooks': newBooks, 'oldBooks': oldBooks})
    for rec in recs:
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook',
                       label=book['title'], author=book['author'])

            G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender',
                       label=rec['rec']['label'], score=rec['rec']['score'])
            G.add_edge('r/'+rec['rec']['label'], 'n/' +
                       book['title'], color='blue')

            G.add_node('a/'+book['author'], color='green',
                       t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'newBook':
            ses = []
            scores = []
            for m in list(G.adj[n]):
                adj = G.nodes[m]
                # Recommenders lacking a score or an 'se' value cannot
                # contribute; they are skipped instead of raising KeyError.
                if (adj['t'] == 'recommender'
                        and adj.get('score') is not None
                        and adj.get('se') is not None):
                    scores.append(adj['score'])
                    ses.append(adj['se'])
            if not scores:
                G.remove_node(n)
            else:
                ses.append(min(ses))
                scores.append(mu)
                # After adding mu, two samples mean exactly one recommender.
                singleRec = len(scores) == 2
                # This is not how SE works. DILLIGAF?
                node['fake_se'] = sum(ses)/(len(ses)**1.2) + \
                    0.5 + (0.5 if singleRec else 0.0)
                node['score'] = sum(
                    scores)/len(scores)*1.2 - node['fake_se']*1.4 + 0.5 - 0.1/math.sqrt(len(scores))
                if singleRec:
                    node['score'] *= 0.85
                node['value'] = 20 + 5 * float(node['score'])
                node['label'] += " ({:.2f}±{:.1f})".format(node['score'],
                                                           node['fake_se'])
                node['label'] += '\n ' + node['author']
    if num != -1:
        removeKeepBest(G, num, 10, 'newBook')
|
2022-02-06 17:59:21 +01:00
|
|
|
|
2022-02-04 20:34:59 +01:00
|
|
|
# While batchSize is implemented, we only get good convergence when it is disabled (batchSize=-1),
# but it might be necessary to enable it later, for a larger library, to get better training performance...
# Maybe try again with 128 books?
|
2022-09-11 18:56:47 +02:00
|
|
|
|
|
|
|
|
2022-02-24 20:19:00 +01:00
|
|
|
def evaluateFitness(books, batchSize=-1, debugPrint=False):
    """Measure how well the current weights predict the known ratings.

    A fresh graph is built and scored. Then, for every rated book in the
    batch, its rating is hidden, the scores are regenerated (without
    re-running pagerank) and the predicted 'score' is compared with the real
    rating. Over-estimates count double in the squared error. For every
    weight a numeric gradient is estimated by bumping that weight by 0.1% in
    the node's '_act'/'_wgh' weighted average.

    Args:
        books: Book dicts to build the graph from.
        batchSize: Number of rated books to evaluate, drawn at random once
            per call; -1 (or a value not smaller than the number of rated
            books) evaluates all of them.
        debugPrint: If true, print the mean squared error and the scaled
            regularisation term.

    Returns:
        A (fit, gradient) tuple: the mean squared error plus 0.001 times the
        regularisation loss, and a dict mapping each weight name to its
        gradient estimate.

    Raises:
        ZeroDivisionError: If no rated book was evaluated.
    """
    global weights
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)

    ratedBooks = [n for n in list(G.nodes)
                  if G.nodes[n].get('rating') is not None]
    errSq = []
    gradient = {wt: 0 for wt in weights}
    genScores(G, books)

    # Draw the mini-batch once. It used to be re-sampled for every node in
    # the loop, so the number of evaluated books was itself random.
    if batchSize != -1 and len(ratedBooks) > batchSize:
        chosen = set(random.sample(ratedBooks, batchSize))
        batch = [b for b in ratedBooks if b in chosen]
    else:
        batch = ratedBooks

    for b in batch:
        rating = G.nodes[b]['rating']
        # Hide the rating so the book is scored like an unread one.
        G.nodes[b]['rating'] = None
        genScores(G, books, calcPagerank=False)
        if G.nodes[b]['score'] > rating:  # over estimated
            errSq.append(((rating - G.nodes[b]['score'])**2)*2)
        else:
            errSq.append((rating - G.nodes[b]['score'])**2)
        G.nodes[b]['rating'] = rating
        acts = G.nodes[b]['_act']
        wghs = G.nodes[b]['_wgh']
        for wt in weights:
            # Weighted average of the activations with weight `wt` bumped by
            # 0.1%; the change in squared error approximates the gradient.
            numer = sum([a*(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1)
                         for a, w in zip(acts, wghs)])
            denom = sum([(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1)
                         for w in wghs])
            scoreB = numer/denom
            gradient[wt] += ((rating - G.nodes[b]['score'])
                             ** 2 - (rating - scoreB)**2)*1000
    # no punishment if w within -1 and 1
    regressionLoss = sum([max(0, abs(w)-1)**2 for w in weights.values()])
    for wt in weights:
        if abs(weights[wt]) > 1.0:
            gradient[wt] -= weights[wt]*10
        else:
            gradient[wt] -= weights[wt]*1
    for g in gradient:
        gradient[g] /= len(errSq)
    if debugPrint:
        print(sum(errSq)/len(errSq), 0.001*regressionLoss)
    fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
    return fit, gradient
|
2021-09-24 16:13:55 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
|
|
|
def calcDissonance(books):
    """Compute, for every rated book, how far the predicted score is from
    the actual rating.

    A fresh graph is built and scored. For each rated book the rating is
    hidden, the scores are regenerated (without re-running pagerank) and the
    prediction is stored on the node:

    * '_test_score': the predicted score;
    * 'dissonance_off': rating minus predicted score;
    * 'dissonance_abs': absolute value of that difference.

    Args:
        books: Book dicts to build the graph from.

    Returns:
        The graph, with the attributes above set on every rated book node.
    """
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)

    # Collected in graph order, which is the order the books are evaluated in.
    ratedBooks = [n for n in list(G.nodes)
                  if G.nodes[n].get('rating') is not None]
    genScores(G, books)
    for b in ratedBooks:
        rating = G.nodes[b]['rating']
        # Hide the rating so the book is scored like an unread one.
        G.nodes[b]['rating'] = None
        genScores(G, books, calcPagerank=False)
        G.nodes[b]['_test_score'] = G.nodes[b]['score']
        G.nodes[b]['rating'] = rating
        G.nodes[b]['dissonance_off'] = rating - G.nodes[b]['score']
        G.nodes[b]['dissonance_abs'] = abs(rating - G.nodes[b]['score'])

    return G
|
|
|
|
|
|
|
|
|
|
|
|
def describeDissonance(books, num=-1, sortKey='dissonance_abs', sortDir=True):
    """Print the rated books ordered by how much the prediction disagrees
    with the actual rating.

    Args:
        books: Book dicts, forwarded to calcDissonance.
        num: Maximum number of lines to print; -1 (or any other value below
            1) prints all books.
        sortKey: Node attribute to sort by.
        sortDir: Passed to sort() as `reverse`; True lists the largest values
            first.
    """
    G = calcDissonance(books)
    bestlist = [G.nodes[n] for n in list(G.nodes)
                if 'dissonance_abs' in G.nodes[n]]
    bestlist.sort(key=lambda entry: entry[sortKey], reverse=sortDir)

    limited = num > 0
    # Width of the line counter: the digits of `num`, or 4 when unlimited.
    # Computed without log10, which failed for num <= 0.
    width = len(str(num)) if limited else 4
    if limited:
        bestlist = bestlist[:num]

    for i, book in enumerate(bestlist):
        authors = book['authors']
        # NOTE(review): 'authors' is presumably a list of names; a plain
        # string is passed through unchanged because joining it would
        # interleave ' & ' between its characters -- confirm against
        # buildBookGraph.
        if not isinstance(authors, str):
            authors = " & ".join(authors)
        line = book['title'] + " ("+authors+")" + \
            ": You: {:.5f}, AI: {:.5f}, Delta: {:.5f}".format(
                book['rating'], book['_test_score'], book['dissonance_off'])
        print("["+str(i+1).zfill(width)+"] "+line)
|
|
|
|
|
|
|
|
|
2021-11-23 20:51:24 +01:00
|
|
|
def train(initGamma, full=True):
    """Fit the global ``weights`` by normalised gradient steps.

    initGamma -- initial step size; also the value gamma is reset to on restart
    full      -- when True the weights are randomised before training, a
                 tighter convergence goal is used, and stagnation always
                 triggers a restart instead of ending the run

    Each improvement over the best fitness seen so far is written to disk
    through saveWeights.
    """
    global weights

    def squaredNorm(grad):
        return sum(grad[g]**2 for g in grad)

    if full:
        for wt in weights:
            weights[wt] = random.random()
    saveWeights(weights)

    books = loadBooksFromDB()
    gamma = initGamma
    bestWeights = copy.copy(weights)
    mse, gradient = evaluateFitness(books)
    delta = squaredNorm(gradient)
    best_mse = mse
    stagLen = 0
    goal = 1.0e-5 if full else 1.0e-4

    # Precedence spelled out: the loop keeps running while best_mse > 15,
    # regardless of gamma and delta.
    while (gamma > goal and delta > goal) or best_mse > 15:
        goal *= 1.1
        last_mse = mse
        print({'mse': mse, 'gamma': gamma, 'delta': delta})

        # NOTE(review): an all-zero gradient makes `norm` 0 and the division
        # below raises ZeroDivisionError.
        delta = squaredNorm(gradient)
        norm = math.sqrt(delta)
        for wt in weights:
            weights[wt] += gamma*0.1*gradient[wt]/norm

        mse, gradient = evaluateFitness(books)

        # Grow the step after an improvement, shrink it sharply otherwise.
        gamma = gamma*1.25 if mse < last_mse else gamma*0.25

        if mse < best_mse:
            saveWeights(weights)
            bestWeights = copy.copy(weights)
            best_mse = mse

        stagLen = stagLen + 1 if mse > last_mse else 0

        if not (stagLen == 4 or mse > 50):
            continue
        if not (full or mse > 10):
            break

        # Restart: either draw completely new weights or jitter the best
        # weights found so far by up to +/-2.5%.
        stagLen = 0
        gamma = initGamma
        if random.random() < 0.50:
            for wt in weights:
                weights[wt] = random.random()*2-0.5
        else:
            weights = copy.copy(bestWeights)
            for wt in weights:
                weights[wt] *= 0.975+0.05*random.random()

    print('Done.')
|
2021-09-24 16:13:55 +02:00
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 16:13:55 +02:00
|
|
|
def saveWeights(weights):
    """Write the weight mapping as JSON to ``neuralWeights.json`` in the
    current working directory, replacing any previous content."""
    with open('neuralWeights.json', 'w') as handle:
        payload = json.dumps(weights)
        handle.write(payload)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2021-09-24 16:13:55 +02:00
|
|
|
def loadWeights():
    """Return the weights stored in ``neuralWeights.json``.

    If the file cannot be opened or read (IOError) a built-in default
    weight table is returned instead.
    """
    # A "tgb_rank" entry (0.10) was disabled in the original defaults.
    defaults = {
        "topList": 0.15,
        "recommender": 0.30,
        "author": 0.70,
        "series": 0.05,
        "tag": 0.05,
        "pagerank": 0.05,
        "mu": 0.50,
        "sigma": 0.30,
        "bias": 0.25,
        "median": 0.10,
    }
    try:
        with open('neuralWeights.json', 'r') as src:
            return json.loads(src.read())
    except IOError:
        return defaults
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-03-07 13:26:29 +01:00
|
|
|
def _buildArgParser(imgDef):
    """Create the argument parser with all global flags and sub-commands."""
    import argparse

    parser = argparse.ArgumentParser(description='TODO: Write Description.')
    for flag in ('--keep-priv', '--keep-whitepapers', '--remove-read',
                 '--remove-unread', '--no-web', '--no-list', '--remove-edge',
                 '--keep-top-lists', '--keep-useless-recommenders', '--dark'):
        parser.add_argument(flag, action="store_true")
    parser.add_argument('--curiosity', type=float, default=0.0,
                        help='curiosity coefficient (higher = more speculative)')
    parser.add_argument('--v3d', action="store_true")
    # Only the flag that deviates from the default is offered.
    parser.add_argument('--no-imgs' if imgDef else '--imgs',
                        action="store_true")
    parser.add_argument('--perf-test', action="store_true")
    parser.add_argument('--train', action="store_true")

    cmds = parser.add_subparsers(required=True, dest='cmd')

    p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
    p_rec.add_argument('-n', type=int, default=20,
                       help='number of books to recommend')
    p_rec.add_argument('--tag-based', action="store_true")
    p_rec.add_argument('--recommender-based', action="store_true")
    p_rec.add_argument('--new', type=int, default=-1,
                       help='number of new books to recommend')

    p_ls = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
    p_ls.add_argument('-n', type=int, default=50,
                      help='number of books to recommend')

    p_read = cmds.add_parser('read', description="TODO", aliases=[])
    p_read.add_argument('--min-rating', type=int, default=0)
    p_read.add_argument('--all-tags', action="store_true")
    p_read.add_argument('--only-connected', action="store_true")

    p_show = cmds.add_parser('analyze', description="TODO", aliases=[])
    p_show.add_argument(
        'type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag'])
    p_show.add_argument('name', type=str)
    p_show.add_argument('-d', type=float, default=2.1,
                        help='depth of expansion')

    p_train = cmds.add_parser('train', description="TODO", aliases=[])
    p_train.add_argument('-g', type=float, default=0.2,
                         help='learning rate gamma')
    p_train.add_argument('--full', action="store_true")

    p_prog = cmds.add_parser('progress', description="TODO", aliases=[])
    p_prog.add_argument('-m', type=float, default=7,
                        help='Minimum Score to read')

    cmds.add_parser('competence', description="TODO", aliases=[])

    cmds.add_parser('shell', description="TODO", aliases=[])

    p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
    p_new.add_argument('-n', type=int, default=10,
                       help='number of books to recommend')

    cmds.add_parser('calice', description="TODO", aliases=[])

    p_dis = cmds.add_parser('dissonance', description="TODO", aliases=['dis'])
    p_dis.add_argument('-n', type=int, default=-1,
                       help='Maximum number of books to list')
    p_dis.add_argument(
        '--sort', choices=['dissonance_abs', 'dissonance_off', 'score'],
        default='dissonance_abs', const='dissonance_abs', nargs='?')
    p_dis.add_argument('--reversed', action="store_true")

    p_createCol = cmds.add_parser(
        'createCaliceColumn', description="TODO", aliases=[])
    p_createCol.add_argument('type', choices=['score', 'rating', 'both'])

    cmds.add_parser('full', description="TODO", aliases=[])

    return parser


def cliInterface(imgDef=False):
    """Parse the command line and hand the arguments to the selected runner.

    imgDef -- default for ``args.imgs``: when True a ``--no-imgs`` flag is
              offered and ``args.imgs`` is derived from it, otherwise an
              ``--imgs`` flag is offered.

    With ``--perf-test`` the arguments go to perfTestCLI, otherwise to mainCLI.
    """
    import argcomplete

    parser = _buildArgParser(imgDef)

    # Register shell completion hooks before the arguments are parsed.
    pyzshcomplete.autocomplete(parser)
    argcomplete.autocomplete(parser)

    args = parser.parse_args()

    if imgDef:
        args.imgs = not args.no_imgs

    if args.perf_test:
        perfTestCLI(args)
    else:
        mainCLI(args)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-22 15:02:48 +01:00
|
|
|
def perfTestCLI(args):
    """Run mainCLI under pycallgraph and render the call graph to a PNG.

    args -- parsed command-line namespace, forwarded unchanged to mainCLI

    The image is written to ``perfTests/<unix timestamp>.png``; the directory
    is created if missing so the output step, which runs only after the
    profiled call, has somewhere to write.
    """
    import time
    from pycallgraph import Config, GlobbingFilter, PyCallGraph
    from pycallgraph.output import GraphvizOutput

    os.makedirs('perfTests', exist_ok=True)
    outFile = 'perfTests/' + str(int(time.time())) + '.png'

    config = Config()
    # Exclude the profiler itself and numpy from the trace.
    config.trace_filter = GlobbingFilter(exclude=[
        "pycallgraph.*",
        "numpy.*"
    ])

    with PyCallGraph(output=GraphvizOutput(output_file=outFile), config=config):
        mainCLI(args)
|
|
|
|
|
2022-09-11 18:56:47 +02:00
|
|
|
|
2022-02-22 15:02:48 +01:00
|
|
|
def mainCLI(args):
    """Execute the sub-command selected on the command line.

    args -- namespace produced by the parser set up in cliInterface

    Loads the heavy imports and the weights, optionally trains, builds and
    scores the full graph, runs the selected command, then filters the graph
    and emits the list / web output unless the command has already exited or
    returned.

    Raises Error for conflicting recommend flags or an unknown command.
    """
    global weights
    defered_imports()
    weights = loadWeights()

    # argparse stores the sub-command exactly as typed, so an alias would not
    # match any comparison below; map aliases back to the canonical name.
    aliases = {'rec': 'recommend', 'ls': 'listScores', 'dis': 'dissonance'}
    cmd = aliases.get(args.cmd, args.cmd)

    # Reject the invalid flag combination before any expensive work is done.
    if cmd == "recommend" and args.tag_based and args.recommender_based:
        raise Error('tag-based and recommender-based can not be combined')

    if cmd == "train":
        train(args.g, args.full)
        exit()

    if args.train:
        train(0.2, False)

    bestListT = 'book'

    G, books = buildFullGraph(darkMode=args.dark)
    mu, std = genScores(G, books)

    curiosityReward(G, args.curiosity)

    if not args.keep_whitepapers:
        removeWhitepapers(G)

    if cmd == "recommend":
        if args.new == -1:
            # Default: one new book for every five recommendations.
            args.new = int(args.n / 5)
        if args.new != 0:
            findNewBooks(G, books, mu, args.new, minRecSco=mu-std)
        if args.tag_based:
            recommendNBooksTagBased(
                G, mu, std, args.n, not args.keep_top_lists)
        elif args.recommender_based:
            recommendNBooksRecommenderBased(
                G, mu, std, args.n, not args.keep_top_lists,
                not args.keep_useless_recommenders)
        else:
            recommendNBooks(G, mu, std, args.n, not args.keep_top_lists,
                            not args.keep_useless_recommenders, args.v3d)
    elif cmd == "listScores":
        listScores(G, mu, std, args.n)
    elif cmd == "read":
        readBooksAnalysis(G, args.min_rating, args.all_tags,
                          args.only_connected, not args.keep_top_lists)
    elif cmd == "analyze":
        analyze(G, books, mu, args.type, args.name, args.d)
    elif cmd == "full":
        fullGraph(G, not args.keep_top_lists)
    elif cmd == "competence":
        bestListT = 'recommender'
        recommenderCompetence(G)
    elif cmd == "shell":
        shell(G, books, mu, std)
    elif cmd == "progress":
        progress(G, books, mu, args.m)
        return
    elif cmd == "newBooks":
        bestListT = 'newBook'
        newBooks(G, books, args.n, mu, std)
    elif cmd == "calice":
        calice(G)
        exit()
    elif cmd == "dissonance":
        describeDissonance(books, args.n, args.sort, not args.reversed)
        exit()
    elif cmd == "createCaliceColumn":
        if args.type in ['score', 'both']:
            calibreDB.createCaliceScoreColumn()
            print('[*] Column "Calice Score" was created.')
        if args.type in ['rating', 'both']:
            calibreDB.createCaliceRatingColumn()
            print('[*] Column "Calice Rating" was created.')
        print(
            '[i] To allow displaying half-stars, please activate them manually in the calibre-settings.')
        exit()
    else:
        raise Error("Unknown command: {0}".format(args.cmd))

    # Post-processing shared by every command that reaches this point.
    if not args.keep_priv:
        removePriv(G)
    if args.remove_read:
        removeRead(G)
    elif args.remove_unread:
        removeUnread(G)

    removeDangling(G, alsoBooks=True)

    if args.remove_edge:
        removeEdge(G)

    if not args.no_list:
        printBestList(G, t=bestListT)
    if not args.no_web and cmd != 'listScores':
        if args.v3d:
            genAndShow3D(G, darkMode=args.dark)
        else:
            if args.imgs:
                addImagesToNodes(G)
            genAndShowHTML(G, darkMode=args.dark)
|
2021-06-14 22:20:36 +02:00
|
|
|
|
2021-09-24 16:13:55 +02:00
|
|
|
|
2023-01-17 23:18:26 +01:00
|
|
|
# Module-level weight table. When run as a script it is filled in by
# mainCLI(); when imported as a module it is loaded right below.
weights = None

if __name__ != "__main__":
    # Imported as a library: load weights and the deferred imports eagerly.
    weights = loadWeights()
    defered_imports()
else:
    try:
        cliInterface(imgDef=True)
    except Error as err:
        # Expected, user-facing failures are reported without a traceback.
        print("[!] {0}".format(err))
|