CaliGraph/caliGraph.py

1465 lines
49 KiB
Python
Raw Normal View History

#!./.venv/bin/python3.10
2021-06-14 22:20:36 +02:00
import os
import re
2021-06-14 22:20:36 +02:00
import json
import math
2021-09-24 16:13:55 +02:00
import copy
2021-06-14 22:20:36 +02:00
import random
import requests
2021-06-14 22:20:36 +02:00
import numpy as np
import pandas as pd
2021-06-14 22:20:36 +02:00
from scipy.stats import norm
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network
import plotly.graph_objects as go
2021-06-14 22:20:36 +02:00
import wikipedia
2021-06-14 22:20:36 +02:00
def getAllAuthors(books):
    """Return every distinct author name found across ``books``.

    Each book is split into its authors via ``getAuthors``. The returned
    list contains no duplicates and has no defined order.
    """
    return list({author for book in books for author in getAuthors(book)})
def getAuthors(book):
    """Split the book's ``authors`` string on ``' & '`` and return the names."""
    authorField = book['authors']
    return authorField.split(' & ')
def getRecommenders(book):
    """Return the distinct recommender names encoded in the book's tags.

    A tag names a recommender when it contains one of the markers
    ``" Recommendation"``, ``"s Literature Club"`` or ``":MRB"`` (checked in
    that order). The first matching marker is stripped from the tag to
    obtain the name.
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    found = set()
    for tag in book['tags']:
        for marker in markers:
            if marker in tag:
                found.add(tag.replace(marker, ""))
                break  # only the first matching marker is applied
    return list(found)
2021-06-14 22:20:36 +02:00
def getTags(book):
    """Yield the book's plain tags.

    Tags that contain a recommender marker (``" Recommendation"``,
    ``"s Literature Club"``, ``":MRB"``) or the top-list marker ``" Top "``
    are skipped.
    """
    markers = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for tag in book['tags']:
        if not any(marker in tag for marker in markers):
            yield tag
def getAllRecommenders(books):
    """Return the distinct recommender names found across all ``books``.

    Names are gathered per book through ``getRecommenders``. The returned
    list has no duplicates and no defined order.
    """
    return list({rec for book in books for rec in getRecommenders(book)})
def getTopLists(book):
    """Return the distinct top-list names the book appears on.

    A top-list tag contains ``" Top "``; the list name is the text before
    the first occurrence of that marker.
    """
    marker = " Top "
    return list({tag.split(marker)[0] for tag in book['tags'] if marker in tag})
def getAllTopLists(books):
    """Return the distinct top-list names found across all ``books``.

    Relies on ``getTopLists`` for the per-book extraction; the result is an
    unordered list without duplicates.
    """
    return list({top for book in books for top in getTopLists(book)})
def getAllSeries(books):
    """Return the distinct ``series`` values of all books that have one."""
    return list({book['series'] for book in books if 'series' in book})
def getAllTags(books):
    """Return the distinct plain tags (as yielded by ``getTags``) of all books."""
    return list({tag for book in books for tag in getTags(book)})
2021-09-24 16:13:55 +02:00
def getTopListWeight(book, topList):
    """Return the edge weight of ``book`` on the top list ``topList``.

    The book's tags are searched for entries of the form
    ``"<topList> Top <N>"``; the smallest ``N`` found decides the weight:
    10 -> 1, 25 -> 0.85, 100 -> 0.5, anything else -> ``50 / N``.

    Only tags that *start* with ``"<topList> Top "`` are considered, so a
    list called ``"Guardian"`` no longer picks up tags of a list called
    ``"The Guardian"``.

    Raises:
        ValueError: if the book has no tag for ``topList``.
    """
    prefix = topList + " Top "
    scopes = [
        int(tag.split(" Top ")[1])
        for tag in book['tags']
        if tag.startswith(prefix)
    ]
    if not scopes:
        raise ValueError(
            "Book has no '{} Top <N>' tag".format(topList))
    minScope = min(scopes)
    fixedWeights = {10: 1, 25: 0.85, 100: 0.5}
    if minScope in fixedWeights:
        return fixedWeights[minScope]
    return 50 / minScope
2021-06-14 22:20:36 +02:00
def removeRead(G):
    """Remove every book node whose ``rating`` is not ``None``."""
    ratedBooks = [
        key for key, attrs in G.nodes(data=True)
        if attrs['t'] == 'book' and attrs['rating'] is not None
    ]
    G.remove_nodes_from(ratedBooks)
def removeUnread(G):
    """Remove every book node whose ``rating`` is ``None``."""
    for key, attrs in list(G.nodes(data=True)):
        if attrs['t'] == 'book' and attrs['rating'] is None:
            G.remove_node(key)
def removePriv(G):
    """Remove every book node that carries the tag ``'priv'``."""
    tagged = [
        key for key, attrs in G.nodes(data=True)
        if attrs['t'] == 'book' and 'priv' in attrs['tags']
    ]
    G.remove_nodes_from(tagged)
def removeWhitepapers(G):
    """Remove every book node that carries the tag ``'whitepaper'``."""
    tagged = [
        key for key, attrs in G.nodes(data=True)
        if attrs['t'] == 'book' and 'whitepaper' in attrs['tags']
    ]
    G.remove_nodes_from(tagged)
2021-06-14 22:20:36 +02:00
def removeDangling(G, alsoBooks=False):
    """Remove nodes that have no neighbours at all.

    Book nodes are only considered when ``alsoBooks`` is true.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book' and not alsoBooks:
            continue
        if len(G.adj[key]) == 0:
            G.remove_node(key)
def removeThinRecs(G, minCons=3):
    """Remove recommender nodes that have fewer than ``minCons`` neighbours."""
    for key in list(G.nodes):
        isRecommender = G.nodes[key]['t'] == 'recommender'
        if isRecommender and len(G.adj[key]) < minCons:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
def removeEdge(G):
    """Remove non-book nodes that have fewer than two neighbours.

    Nodes are checked one after another, so a removal can lower the
    neighbour count seen for nodes examined later in the same pass.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book':
            continue
        if len(G.adj[key]) < 2:
            G.remove_node(key)
def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author', 'series', 'tag']):
    """Remove low-scoring nodes of the given node types.

    A node is removed when its type ``t`` is in ``groups``, it has a
    ``score`` attribute, and that score is ``None`` or below ``threshold``.
    Nodes without a ``score`` attribute are left alone.
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] not in groups or 'score' not in attrs:
            continue
        score = attrs['score']
        if score is None or score < threshold:
            G.remove_node(key)
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
    """Keep only the ``num`` best-scored nodes of type ``forType``.

    Candidates for removal are nodes of type ``forType`` that are not among
    the ``num`` highest scores, plus nodes of any type whose ``score`` is
    ``None``. A candidate survives only if it has a rating that is at least
    the lowest kept score minus ``maxDistForRead``.
    """
    best = []
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] == forType and attrs.get('score') is not None:
            best.append(attrs)
    best.sort(key=lambda attrs: attrs['score'], reverse=True)
    best = best[:num]

    for key in list(G.nodes):
        attrs = G.nodes[key]
        isSurplus = attrs['t'] == forType and attrs not in best
        isUnscored = 'score' in attrs and attrs['score'] is None
        if not (isSurplus or isUnscored):
            continue
        rating = attrs.get('rating')
        # The cutoff is only looked up for rated nodes.
        if rating is None or rating < best[-1]['score'] - maxDistForRead:
            G.remove_node(key)
def removeTags(G):
    """Remove every node of type ``'tag'``."""
    G.remove_nodes_from(
        [key for key, attrs in G.nodes(data=True) if attrs['t'] == 'tag'])
def pruneTags(G, minCons=2):
    """Remove tag nodes whose books are already well connected.

    For each tag, every neighbour of every adjacent book contributes to a
    connection count: recommenders 0.5, tags and series 0.25, top lists
    nothing, everything else 1. The tag is removed when the count exceeds
    ``minCons``. Nodes are visited in ascending order of
    ``score + len(feedbacks) / 5`` (0 when either attribute is missing), and
    removals take effect immediately for the nodes visited afterwards.
    """
    def priority(key):
        attrs = G.nodes[key]
        if 'score' in attrs and 'feedbacks' in attrs:
            return attrs['score'] + len(attrs['feedbacks']) / 5
        return 0

    for key in sorted(G.nodes, key=priority):
        if G.nodes[key]['t'] != 'tag':
            continue
        foundCon = 0
        for book in G.adj[key]:
            for con in G.adj[book]:
                conType = G.nodes[con]['t']
                if conType == 'topList':
                    continue
                if conType == 'recommender':
                    foundCon += 0.5
                elif conType in ('tag', 'series'):
                    foundCon += 0.25
                else:
                    foundCon += 1
        if foundCon > minCons:
            G.remove_node(key)
def pruneRecommenders(G, minCons=2):
    """Remove recommender nodes whose books are already well connected.

    For each recommender, every neighbour of every adjacent book contributes
    to a connection count: recommenders 0.5, tags and series 0.25, top lists
    nothing, everything else 1. The recommender is removed when the count
    exceeds ``minCons``. Nodes are visited in ascending ``score`` order
    (0 when the attribute is missing), and removals take effect immediately.
    """
    def priority(key):
        attrs = G.nodes[key]
        return attrs['score'] if 'score' in attrs else 0

    for key in sorted(G.nodes, key=priority):
        if G.nodes[key]['t'] != 'recommender':
            continue
        foundCon = 0
        for book in G.adj[key]:
            for con in G.adj[book]:
                conType = G.nodes[con]['t']
                if conType == 'topList':
                    continue
                if conType == 'recommender':
                    foundCon += 0.5
                elif conType in ('tag', 'series'):
                    foundCon += 0.25
                else:
                    foundCon += 1
        if foundCon > minCons:
            G.remove_node(key)
def pruneRecommenderCons(G, maxCons=5):
    """Thin out the books of recommenders with more than ``maxCons`` neighbours.

    For such a recommender the ``maxCons`` best-scored adjacent books are
    protected. Any other adjacent book (or neighbour whose ``score`` is
    ``None``) is removed from the graph if it is unrated and has fewer than
    two neighbours that are not top lists.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        if len(G.adj[key]) <= maxCons:
            continue
        neighbours = list(G.adj[key])
        keep = [
            G.nodes[m] for m in neighbours
            if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None
        ]
        keep.sort(key=lambda attrs: attrs['score'], reverse=True)
        keep = keep[:maxCons]
        for m in neighbours:
            book = G.nodes[m]
            isSurplus = book['t'] == 'book' and book not in keep
            isUnscored = 'score' in book and book['score'] is None
            if not (isSurplus or isUnscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(
                1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
def pruneAuthorCons(G, maxCons=3):
    """Thin out the books of authors with more than ``maxCons`` neighbours.

    For such an author the ``maxCons`` best-scored adjacent books are
    protected. Any other adjacent book (or neighbour whose ``score`` is
    ``None``) is removed from the graph if it is unrated and has fewer than
    two neighbours that are not top lists.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'author':
            continue
        if len(G.adj[key]) <= maxCons:
            continue
        neighbours = list(G.adj[key])
        keep = [
            G.nodes[m] for m in neighbours
            if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None
        ]
        keep.sort(key=lambda attrs: attrs['score'], reverse=True)
        keep = keep[:maxCons]
        for m in neighbours:
            book = G.nodes[m]
            isSurplus = book['t'] == 'book' and book not in keep
            isUnscored = 'score' in book and book['score'] is None
            if not (isSurplus or isUnscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(
                1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
2021-06-14 22:20:36 +02:00
def removeHighSpanTags(G, maxCons=5):
    """Remove tag nodes that have more than ``maxCons`` neighbours."""
    for key in list(G.nodes):
        isTag = G.nodes[key]['t'] == 'tag'
        if isTag and len(G.adj[key]) > maxCons:
            G.remove_node(key)
2021-09-24 18:25:37 +02:00
def removeHighSpanReadBooks(G, maxCons=8):
    """Remove rated books that are linked to more than ``maxCons`` recommenders."""
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'book' or attrs['rating'] is None:
            continue
        recCount = sum(
            1 for adj in G.adj[key] if G.nodes[adj]['t'] == 'recommender')
        if recCount > maxCons:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
def removeTopLists(G):
    """Remove every node of type ``'topList'``."""
    G.remove_nodes_from(
        [key for key, attrs in G.nodes(data=True) if attrs['t'] == 'topList'])
def removeRecommenders(G):
    """Remove every node of type ``'recommender'``."""
    G.remove_nodes_from(
        [key for key, attrs in G.nodes(data=True) if attrs['t'] == 'recommender'])
def removeAuthors(G):
    """Remove every node of type ``'author'``."""
    G.remove_nodes_from(
        [key for key, attrs in G.nodes(data=True) if attrs['t'] == 'author'])
def removeSeries(G):
    """Remove every node of type ``'series'``."""
    G.remove_nodes_from(
        [key for key, attrs in G.nodes(data=True) if attrs['t'] == 'series'])
2021-06-14 22:20:36 +02:00
def removeRestOfSeries(G):
    """Drop series volumes that lie beyond the next one after the last rated.

    For every series node the highest (integer-truncated) ``series_index``
    among its rated neighbours is determined (0 if none is rated). Every
    neighbour whose ``series_index`` exceeds that value by more than 1.0001
    is removed from the graph.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'series':
            continue
        ratedIndices = [
            int(G.nodes[adj]['series_index'])
            for adj in G.adj[key]
            if G.nodes[adj]['rating'] is not None
        ]
        seriesState = max([0] + ratedIndices)
        for adj in list(G.adj[key]):
            if G.nodes[adj]['series_index'] > seriesState + 1.0001:
                G.remove_node(adj)
def removeUnusedRecommenders(G):
    """Remove recommenders that are not linked to any book carrying a ``score``."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        hasScoredBook = any(
            G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj]
            for adj in G.adj[key]
        )
        if not hasScoredBook:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
def removeUselessReadBooks(G):
    """Remove rated books that contribute little to the remaining graph.

    For each rated book, ``contacts`` counts its neighbours and ``force``
    sums, over every neighbour, the number of that neighbour's own
    neighbours that are either books with a ``score`` or ``newBook`` nodes
    (each counted 0.5 via a recommender, 1 otherwise). The book is removed
    when ``force`` is below 1.5 or ``contacts`` is below 2.
    """
    minForce = 1.5
    minContact = 2
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'book' or attrs['rating'] is None:
            continue
        force = 0
        contacts = 0
        for adj in G.adj[key]:
            contacts += 1
            step = 0.5 if G.nodes[adj]['t'] == 'recommender' else 1
            for cousin in G.adj[adj]:
                cousinNode = G.nodes[cousin]
                isScoredBook = cousinNode['t'] == 'book' and 'score' in cousinNode
                if isScoredBook or cousinNode['t'] == 'newBook':
                    force += step
        if force < minForce or contacts < minContact:
            G.remove_node(key)
2021-09-24 18:25:37 +02:00
def removeUselessTags(G, minUnread=1):
    """Remove tags linked to fewer than ``minUnread`` books that carry a ``score``."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        scoredBooks = sum(
            1 for adj in G.adj[key]
            if G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj]
        )
        if scoredBooks < minUnread:
            G.remove_node(key)
def removeUselessSeries(G, minSco=0):
    """Remove series nodes with fewer than two neighbours or a score below ``minSco``.

    The score is only compared when the series has at least two neighbours.
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'series':
            continue
        tooSmall = len(G.adj[key]) < 2
        if tooSmall or attrs['score'] < minSco:
            G.remove_node(key)
2021-09-24 16:13:55 +02:00
2021-09-24 18:25:37 +02:00
def scoreOpinions(G, globMu, globStd):
    """Score every non-book node from the ratings of its neighbours.

    For each node whose type is not ``'book'`` the non-``None`` ratings of
    its neighbours are collected. If there are any, the node gets:

    * ``mean`` / ``std`` – normal-distribution fit of the ratings,
    * ``se`` – ``globStd / sqrt(number of ratings)``,
    * ``score`` – equal to ``mean``,
    * ``feedbacks`` – the list of collected ratings.

    Otherwise ``score`` is set to ``None``. ``globMu`` is accepted for
    signature symmetry with ``scoreUnread`` and is not used here.

    Neighbours without a ``rating`` attribute are treated as unrated instead
    of raising ``KeyError``.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            continue
        feedbacks = []
        for adj in G.adj[n]:
            rating = G.nodes[adj].get('rating')
            if rating is not None:
                feedbacks.append(rating)
        if feedbacks:
            node['mean'], node['std'] = norm.fit(feedbacks)
            node['se'] = globStd / math.sqrt(len(feedbacks))
            node['score'] = node['mean']
            node['feedbacks'] = feedbacks
        else:
            node['score'] = None
2021-09-24 18:25:37 +02:00
def scoreUnread(G, globMu, globStd):
    """Compute a ``score`` for every unrated book node.

    The inputs to the score are: the global mean ``globMu`` (once as
    ``'mu'``, once as ``'bias'``), every feedback of every scored neighbour
    (weighted by the neighbour's type and, for top lists, the edge weight),
    the node's pagerank score, and the standard deviation and median of the
    collected feedbacks. The score is the weighted average of these inputs
    using ``getWeightForType``.

    Besides ``score`` the node receives ``mean``, ``std``, ``median``,
    ``se`` (``globStd / sqrt(n)``) and the debugging attributes ``_act``
    (inputs) and ``_wgh`` (their weight descriptors). Books with
    ``series_index == 1.0`` get a tiny bonus.

    The former fallback branch for an empty feedback list was removed: the
    list always starts with ``globMu``, so that branch could never run. A
    missing ``pagerank_score`` now counts as 0 instead of raising
    ``KeyError``.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is not None:
            continue

        feedbacks = [globMu]
        ws = [['mu']]
        for adj in G.adj[n]:
            adjNode = G.nodes[adj]
            if adjNode.get('score') is None:
                continue
            edge = G[n][adj]
            w = [adjNode['t'], edge['weight'] if 'weight' in edge else 1]
            for fb in adjNode['feedbacks']:
                feedbacks.append(fb)
                ws.append(w)

        # Statistics are taken before the derived inputs below are appended.
        node['mean'], node['std'] = norm.fit(feedbacks)
        node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
        node['se'] = globStd / math.sqrt(len(feedbacks))

        feedbacks.append(node.get('pagerank_score', 0))
        ws.append(['pagerank'])
        feedbacks.append(node['std'])
        ws.append(['sigma'])
        feedbacks.append(node['median'])
        ws.append(['median'])
        feedbacks.append(globMu)
        ws.append(['bias'])

        factors = [
            getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws
        ]
        weighted = sum(fb * factor for fb, factor in zip(feedbacks, factors))
        node['score'] = weighted / sum(factors)
        node['_act'] = feedbacks
        node['_wgh'] = ws

        if 'series' in node:
            if node['series_index'] == 1.0:
                node['score'] += 0.000000001
def getWeightForType(nodeType, edgeWeight=1):
    """Return the weight for ``nodeType`` from the module-level ``weights``.

    For ``'topList'`` the looked-up weight is multiplied by ``edgeWeight``;
    for every other type ``edgeWeight`` is ignored.
    """
    base = weights[nodeType]
    return edgeWeight * base if nodeType == 'topList' else base
2021-06-14 22:20:36 +02:00
def printBestList(G, t='book', num=-1):
    """Print the scored nodes of type ``t``, best first.

    Nodes are ordered by ``score`` (with ``se`` as a tiny tie-breaker).
    Books are printed as ``title (authors): score``, other types by their
    ``label``. At most ``num`` lines are printed; ``-1`` prints all.
    """
    ranked = []
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] == t and attrs.get('score') is not None:
            ranked.append(attrs)
    ranked.sort(
        key=lambda attrs: attrs['score'] + 0.00001 * attrs.get('se', 0),
        reverse=True)

    for i, entry in enumerate(ranked):
        if t == 'book':
            line = entry['title'] + " (" + " & ".join(entry['authors']) + ")" \
                + ": {:.5f}".format(entry['score'])
        else:
            line = entry['label']
        # Zero-pad the rank to the number of digits of ``num``.
        digits = int((math.log10(num) if num != -1 else 3) + 1)
        print("[" + str(i + 1).zfill(digits) + "] " + line)
        if num != -1 and i == num - 1:
            break
def readColor(book):
    """Return ``'green'`` if ``book`` has a ``rating`` key, else ``'gray'``."""
    return 'green' if 'rating' in book else 'gray'
def loadBooksFromDB():
    """Load the library from Calibre and enrich it with MRB recommendations.

    The TGB enrichment (``infuseDataFromTGB``) is currently disabled.
    """
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    # infuseDataFromTGB(library)
    return library
def mrbGetBook(mrbdf, title, authors):
2022-02-06 18:30:55 +01:00
title = title.split('(')[0]
title = title.replace('*','')
pot = mrbdf[mrbdf['title'].str.contains(title)]
dic = pot.to_dict(orient='records')
for d in dic:
for author in authors:
parts = author.split(" ")
for part in [parts[0], parts[-1]]:
if d['author'].find(part)==-1:
break
else:
return d
return False
def tgbGetBook(df, title, authors):
    """Look up a book in a TGB table.

    Args:
        df: DataFrame with at least a ``title`` and an ``author`` column.
        title: Book title. Everything from the first ``(`` on and every
            ``*`` is dropped before matching.
        authors: Author names, either as a list or as one string joined
            with ``' & '``.

    Returns:
        The first row (as a dict) whose title contains ``title`` and whose
        ``author`` field contains both the first and the last word of one of
        the given authors; ``False`` if no row matches.

    The title is matched literally (not as a regular expression) and rows
    with a missing title are skipped.
    """
    if isinstance(authors, str):
        # A bare string would otherwise be iterated character by character.
        authors = authors.split(' & ')
    title = title.split('(')[0].replace('*', '')
    hits = df[df['title'].str.contains(title, regex=False, na=False)]
    for row in hits.to_dict(orient='records'):
        rowAuthor = str(row['author'])
        for author in authors:
            parts = author.split(" ")
            if parts[0] in rowAuthor and parts[-1] in rowAuthor:
                return row
    return False
def infuseDataFromMRB(books):
    """Add ``<recommender>:MRB`` tags to books found in ``rec_dbs/mrb_db.csv``.

    Each book is looked up with ``mrbGetBook``. On a match the row's
    ``recommender`` field is split on ``'|'`` and one tag per entry is
    appended to the book's ``tags`` list in place.

    The authors are passed as a list (via ``getAuthors``) rather than as the
    raw ``' & '``-joined string, which would be iterated per character.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    for book in books:
        mrb = mrbGetBook(mrbdf, book['title'], getAuthors(book))
        if not mrb:
            continue
        for rec in str(mrb['recommender']).split('|'):
            book['tags'].append(rec + ':MRB')
def infuseDataFromTGB(books):
    """Set ``tgb_rank`` on books found in ``rec_dbs/tgb_1.csv`` / ``tgb_2.csv``.

    Each book is looked up with ``tgbGetBook``; on a match the row's ``id``
    is stored as an int. A match in the second file overwrites one from the
    first.

    The authors are passed as a list (via ``getAuthors``) rather than as the
    raw ``' & '``-joined string, which would be iterated per character.
    """
    for i in range(1, 3):
        df = pd.read_csv('rec_dbs/tgb_' + str(i) + '.csv')
        for book in books:
            tgb = tgbGetBook(df, book['title'], getAuthors(book))
            if tgb:
                book['tgb_rank'] = int(tgb['id'])
def loadBooksFromCalibreDB():
    """Return the Calibre library as parsed JSON.

    Runs ``calibredb list --for-machine -f all`` and parses its standard
    output. The pipe is closed explicitly once the output has been read.
    """
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.loads(pipe.read())
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', '', text)
def getKeywords(txt, rake):
    """Extract the strongest keyword phrases from ``txt``.

    Args:
        txt: Text to analyse; HTML tags are stripped first.
        rake: Object providing ``extract_keywords_from_text`` and
            ``get_ranked_phrases_with_scores`` (a ``rake_nltk`` ``Rake``
            instance in this file).

    Returns:
        A list of ``(score, phrase)`` tuples, best first, limited to phrases
        whose adjusted score is at least two thirds of the best one. Phrases
        are always title-cased (previously only when the cutoff actually
        trimmed the list). Empty list if nothing usable was found.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)
    blocked = ['p', 'die', 'best', 'known', 'fk', 'p pp', 'one']

    k = []
    for score, kw in rake.get_ranked_phrases_with_scores():
        lowered = kw.lower()
        if lowered in blocked or len(kw) <= 3:
            continue
        if 'div' in kw or 'p p' in lowered:
            continue
        wordCount = len(kw.split(' '))
        # Adjust the raw score by phrase length (exponent 1 / (0.4 * words)).
        k.append((score ** (1 / (wordCount * 0.4)), kw))
    k.sort(key=lambda entry: entry[0], reverse=True)
    if not k:
        return []

    minSco = k[0][0] / 3 * 2
    cutoff = len(k)
    for i, entry in enumerate(k):
        if entry[0] < minSco:
            cutoff = i
            break
    return [(sco, word.title()) for sco, word in k[:cutoff]]
2022-01-31 13:45:26 +01:00
def runPagerank(G):
    """Store a ``pagerank_score`` on every node of ``G``.

    If the power iteration does not converge, a warning is printed and every
    node gets a score of 0.
    """
    try:
        ranks = nx.pagerank(G=G)
    except nx.exception.PowerIterationFailedConvergence:
        print('[!] Could not calculate pagerank-scores: Power iteration of the eigenvector calculation did not converge')
        print('[ ] Recommendations will be of slighly lower quality')
        ranks = {}
    for key in list(G.nodes):
        G.nodes[key]['pagerank_score'] = ranks.get(key, 0)
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    """Create a graph with one ``'book'`` node per entry of ``books``.

    Args:
        books: Book dicts (as loaded from Calibre).
        darkMode: Accepted for signature symmetry; not used here.
        extractKeywords: Extract keywords from the book's ``comments`` with
            RAKE and store them in the node's ``keywords`` attribute.
        mergeTags: Append the extracted keyword phrases to the node's
            ``tags`` (the book dict's own tag list is not modified).

    Returns:
        The new ``networkx.Graph``; node ids are the books' ``id`` values.
    """
    G = nx.Graph()
    if extractKeywords:
        # Imported lazily so the dependency is only needed when used.
        from rake_nltk.rake import Rake
        rake = Rake()

    for book in books:
        rating = book['rating'] if 'rating' in book else None
        desc = book['comments'] if 'comments' in book else ''

        keywords = []
        if 'comments' in book and extractKeywords:
            sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n', ' ')
            keywords = getKeywords(sanitized, rake)

        tags = book['tags']
        if mergeTags:
            tags = tags + [word for (score, word) in keywords]

        series, series_index = None, None
        if 'series' in book:
            series = book['series']
            series_index = book['series_index']

        G.add_node(
            book['id'],
            t='book',
            label=book['title'],
            title=book['title'],
            shape='image',
            image=book['cover'],
            rating=rating,
            tags=tags,
            keywords=keywords,
            desc=desc,
            isbn=book['isbn'],
            files=book['formats'],
            authors=getAuthors(book),
            series=series,
            series_index=series_index,
        )
    return G
def getWikiImage(search_term):
    """Return the URL of the main Wikipedia image for ``search_term``.

    The best Wikipedia search hit is accepted only if its title has a fuzzy
    match ratio of at least 50 with ``search_term``. Returns ``None`` (after
    printing a notice) when there is no acceptable hit, the page has no
    image, or any lookup step fails.

    Changes from the previous version: the ratio is computed against the
    hit's title instead of the whole result list, the page title is passed
    as a properly encoded query parameter, the HTTP request has a timeout,
    and only ``Exception`` is caught so Ctrl-C is no longer swallowed.
    """
    from fuzzywuzzy import fuzz
    WIKI_API = 'https://en.wikipedia.org/w/api.php'
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        result = wikipedia.search(search_term, results=1)
        if not result or fuzz.ratio(search_term, result[0]) < 50:
            raise LookupError('no sufficiently similar page title')
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=result[0])
        response = requests.get(
            WIKI_API,
            params={
                'action': 'query',
                'prop': 'pageimages',
                'format': 'json',
                'piprop': 'original',
                'titles': wkpage.title,
            },
            timeout=10,
        )
        json_data = json.loads(response.text)
        pages = list(json_data['query']['pages'].values())
        return pages[0]['original']['source']
    except Exception:
        # Best effort: any failure just means "no image".
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None
2021-06-14 22:20:36 +02:00
2021-10-05 18:25:27 +02:00
def graphAddAuthors(G, books, darkMode=False):
    """Add one ``'author'`` node per author and link it to the author's books.

    Author node ids are ``'a/' + name``. Edge colour comes from
    ``readColor``. ``darkMode`` is not used. Returns ``G``.
    """
    for name in getAllAuthors(books):
        G.add_node('a/' + name, color='green', t='author', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getAuthors(book):
            G.add_edge('a/' + name, book['id'], color=edgeColor)
    return G
2021-10-05 18:25:27 +02:00
def graphAddRecommenders(G, books, darkMode=False):
    """Add one ``'recommender'`` node per recommender and link it to its books.

    Recommender node ids are ``'r/' + name``. Edge colour comes from
    ``readColor``. ``darkMode`` is not used. Returns ``G``.
    """
    for name in getAllRecommenders(books):
        G.add_node('r/' + name, color='orange', t='recommender', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getRecommenders(book):
            G.add_edge('r/' + name, book['id'], color=edgeColor)
    return G
2021-10-05 18:25:27 +02:00
def graphAddTopLists(G, books, darkMode=False):
    """Add one ``'topList'`` node per top list and link it to its books.

    Top-list node ids are ``'t/' + name``. Each edge carries the ``weight``
    returned by ``getTopListWeight`` and a colour from ``readColor``.
    ``darkMode`` is not used. Returns ``G``.
    """
    for name in getAllTopLists(books):
        G.add_node('t/' + name, color='yellow', t='topList', label=name)
    for book in books:
        for name in getTopLists(book):
            G.add_edge(
                't/' + name,
                book['id'],
                weight=getTopListWeight(book, name),
                color=readColor(book),
            )
    return G
2021-10-05 18:25:27 +02:00
def graphAddSeries(G, books, darkMode=False):
    """Add one ``'series'`` node per series and link it to its books.

    Series node ids are ``'s/' + name``. Books without a ``series`` key are
    skipped. ``darkMode`` is not used. Returns ``G``.
    """
    for name in getAllSeries(books):
        G.add_node('s/' + name, color='red', t='series', label=name, shape='triangle')
    for book in books:
        if 'series' not in book:
            continue
        G.add_edge('s/' + book['series'], book['id'], color=readColor(book))
    return G
2021-10-05 18:25:27 +02:00
def graphAddTags(G, books, darkMode=False):
    """Add one ``'tag'`` node per plain tag and link it to its books.

    Tag node ids are ``'t/' + tag``. The node colour is ``'darkgray'`` in
    dark mode and ``'lightGray'`` otherwise. Returns ``G``.
    """
    tagColor = ['lightGray', 'darkgray'][darkMode]
    for tag in getAllTags(books):
        G.add_node('t/' + tag, color=tagColor, t='tag', label=tag, shape='box')
    for book in books:
        for tag in getTags(book):
            G.add_edge('t/' + tag, book['id'], color=readColor(book))
    return G
def calcRecDist(G, books):
    """Fit a normal distribution to all existing ratings.

    Ratings are read from the graph nodes of ``books``. Returns the
    ``(mean, std)`` tuple produced by ``scipy.stats.norm.fit``.
    """
    nodeRatings = (G.nodes[book['id']]['rating'] for book in books)
    globRatings = [rating for rating in nodeRatings if rating is not None]
    return norm.fit(globRatings)
def scaleBooksByRating(G):
    """Set the display size ``value`` of every node from its rating or score.

    * rated nodes: ``20 + 5 * int(rating)``
    * otherwise, scored nodes: ``20 + int(5 * score)``
    * otherwise: ``15``

    The previous ``node['t'] not in []`` guard was always true and has been
    dropped; despite the name, this applies to every node type.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node.get('rating') is not None:
            node['value'] = 20 + 5 * int(node['rating'])
        elif node.get('score') is not None:
            node['value'] = 20 + int(5 * float(node['score']))
        else:
            node['value'] = 15
def scaleOpinionsByRating(G):
    """Set ``value`` for top-list, recommender, author and series nodes.

    Scored nodes get ``20 + 5 * int(score)``; unscored ones get ``20``.
    """
    opinionTypes = ['topList', 'recommender', 'author', 'series']
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] not in opinionTypes:
            continue
        score = attrs.get('score')
        attrs['value'] = 20 if score is None else 20 + 5 * int(score)
def addScoreToLabels(G):
    """Append the rating or score to the label of every non-tag, non-newBook node.

    Rated nodes get ``" (<rating>)"``; nodes with both a score and an ``se``
    get ``" (<score>±<se>)"``; all others get ``" (0±∞)"``.
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] in ['tag', 'newBook']:
            continue
        if attrs.get('rating') is not None:
            suffix = " (" + str(attrs['rating']) + ")"
        elif attrs.get('score') is not None and 'se' in attrs:
            suffix = " ({:.2f}±{:.1f})".format(attrs['score'], attrs['se'])
        else:
            suffix = " (0±∞)"
        attrs['label'] += suffix
2021-06-14 22:20:36 +02:00
2021-10-05 18:25:27 +02:00
def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
    """Render ``G`` with pyvis into ``nx.html`` and show it.

    ``showButtons`` enables the pyvis configuration panels, ``darkMode``
    selects the background colour and ``arrows`` makes the network directed.
    """
    background = ['#FFFFFF', '#181818'][darkMode]
    net = Network('1050px', '1900px', directed=arrows, bgcolor=background)
    if showButtons:
        panels = ['configure', 'layout', 'interaction', 'physics', 'edges']
        net.show_buttons(filter_=panels)
    net.from_nx(G)
    net.show('nx.html')
def genAndShow3D(G, darkMode=False):
    """Render ``G`` as an interactive 3D plotly scatter plot and show it.

    Node positions come from a 3D spring layout with a random seed. Marker
    size is derived from each node's ``value`` attribute, so that attribute
    must already be set on every node.
    """
    def pickColour(attrs):
        kind = attrs['t']
        if kind == 'tag':
            return 'gray'
        if kind == 'book':
            # books carrying a 'score' are the unread ones
            return 'lightblue' if 'score' in attrs else 'magenta'
        return attrs['color'] if 'color' in attrs else 'black'

    node_sizes, node_labels, node_cols = [], [], []
    for key in G.nodes:
        attrs = G.nodes[key]
        node_cols.append(pickColour(attrs))
        node_labels.append(attrs['label'])
        node_sizes.append((attrs['value'] / 8) ** 1.5)

    spring = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536))
    x_nodes = [spring[p][0] for p in spring]
    y_nodes = [spring[p][1] for p in spring]
    z_nodes = [spring[p][2] for p in spring]

    x_edges, y_edges, z_edges = [], [], []
    for src, dst in G.edges():
        x_edges.extend([spring[src][0], spring[dst][0], None])
        y_edges.extend([spring[src][1], spring[dst][1], None])
        z_edges.extend([spring[src][2], spring[dst][2], None])

    trace_edges = go.Scatter3d(
        x=x_edges,
        y=y_edges,
        z=z_edges,
        mode='lines',
        line=dict(color='black', width=2),
        hoverinfo='none',
    )
    trace_nodes = go.Scatter3d(
        x=x_nodes,
        y=y_nodes,
        z=z_nodes,
        mode='markers',
        marker=dict(
            symbol='circle',
            size=node_sizes,
            color=node_cols,
            line=dict(color='gray', width=0.5),
        ),
        text=node_labels,
        hoverinfo='text',
    )

    axis = dict(
        showbackground=False,
        showline=False,
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='',
    )
    background = ['#FFFFFF', '#181818'][darkMode]
    layout = go.Layout(
        title="",
        width=1920,
        height=1080,
        plot_bgcolor=background,
        paper_bgcolor=background,
        showlegend=False,
        scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
        margin=dict(l=0, r=0, b=0, t=0),
        hovermode='closest',
    )
    fig = go.Figure(data=[trace_edges, trace_nodes], layout=layout)
    fig.show()
2021-10-05 18:25:27 +02:00
def buildFullGraph(darkMode=False):
    """Load the library and build the complete graph.

    Book nodes are created first, then authors, recommenders, top lists,
    series and tags are attached in that order.

    Returns:
        ``(G, books)`` – the graph and the list of loaded book dicts.
    """
    books = loadBooksFromDB()
    G = buildBookGraph(books, darkMode=darkMode)
    attachers = (
        graphAddAuthors,
        graphAddRecommenders,
        graphAddTopLists,
        graphAddSeries,
        graphAddTags,
    )
    for attach in attachers:
        attach(G, books, darkMode=darkMode)
    return G, books
def genScores(G, books, calcPagerank=True):
    """Compute all scores on ``G`` and return the global rating distribution.

    Order: global rating fit, optional pagerank, scores of the non-book
    nodes, then scores of the unrated books.

    Returns:
        ``(globMu, globStd)`` as produced by ``calcRecDist``.
    """
    globMu, globStd = calcRecDist(G, books)
    if calcPagerank:
        runPagerank(G)
    for scorer in (scoreOpinions, scoreUnread):
        scorer(G, globMu, globStd)
    return globMu, globStd
def addImageToNode(node, cache, shape='circularImage'):
    """Attach a Wikipedia image to ``node`` if one can be found.

    The lookup key is the node's label up to the first ``' ('`` with ``*``
    removed. Results (including failures, stored as ``False``) are kept in
    ``cache``; a cached failure is retried with a probability of 5 %. When
    an image is available, the node's ``image`` and ``shape`` are set.
    """
    name = node['label'].split(' (')[0].replace('*', '')
    needsLookup = name not in cache or (
        cache[name] == False and random.random() < 0.05)
    if needsLookup:
        img = getWikiImage(name)
        cache[name] = img if img else False
    else:
        img = cache[name]
    if img:
        node['image'] = img
        node['shape'] = shape
def addImagesToNodes(G):
    """Attach Wikipedia images to all recommender and author nodes.

    Image links are cached in ``.imgLinkCache.json``; a missing or
    unreadable cache file starts an empty cache. Authors use the ``'image'``
    shape, recommenders ``'circularImage'``. The cache is written back at
    the end.
    """
    cachePath = '.imgLinkCache.json'
    try:
        with open(cachePath, 'r') as cf:
            cache = json.load(cf)
    except IOError:
        cache = {}
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] not in ['recommender', 'author']:
            continue
        shape = 'image' if attrs['t'] == 'author' else 'circularImage'
        addImageToNode(attrs, cache, shape)
    with open(cachePath, 'w') as cf:
        cf.write(json.dumps(cache))
2021-06-14 22:20:36 +02:00
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
    """Prune ``G`` in place to the ``n`` best books, favouring recommender links.

    ``mu`` and ``std`` are the global rating mean and standard deviation
    used for the score thresholds. The order of the pruning steps matters
    and must not be changed.
    """
    # Coarse filtering.
    removeRestOfSeries(G)
    removeBad(G, mu - std * 2 - 1)
    removeKeepBest(G, int(n * 2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 6)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 10)

    # Tighter filtering on books and their connections.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 6)
    pruneRecommenderCons(G, int(n / 7) + 1)
    pruneAuthorCons(G, int(n / 15))
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down towards n books.
    removeKeepBest(G, n + math.ceil(n / 20), maxDistForRead=1.5)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    removeUselessReadBooks(G)
    if removeUselessRecommenders:
        removeUnusedRecommenders(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True):
    """Prune ``G`` in place to the ``n`` best books, favouring tag links.

    All recommender nodes are removed along the way. ``mu`` and ``std`` are
    the global rating mean and standard deviation used for the score
    thresholds. The order of the pruning steps matters.
    """
    # Coarse filtering.
    removeRestOfSeries(G)
    removeBad(G, mu - std * 2 - 1)
    removeKeepBest(G, int(n * 2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 12)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 24)

    # Tighter filtering; recommenders are dropped entirely.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 16)
    pruneAuthorCons(G, int(n / 5))
    removeRecommenders(G)
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down towards n books.
    removeKeepBest(G, n + math.ceil(n / 20), maxDistForRead=1.5)
    removeUselessReadBooks(G)
    removeUselessTags(G)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-02-11 18:12:49 +01:00
def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False):
    """Prune ``G`` in place to the ``n`` best books with mixed context nodes.

    ``mu`` and ``std`` are the global rating mean and standard deviation
    used for the score thresholds. ``v3d`` relaxes the final recommender
    thinning for large ``n``. ``removeUselessRecommenders`` is accepted but
    not used by this function. The order of the pruning steps matters.
    """
    # Coarse filtering.
    removeRestOfSeries(G)
    removeBad(G, mu - std - 0.5)
    removeBad(G, mu + std / 2, groups=['recommender'])
    removeThinRecs(G, 3)
    removeKeepBest(G, int(n * 2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 8)
    pruneTags(G, 7)
    removeHighSpanReadBooks(G, 14)
    removeDangling(G, alsoBooks=False)
    pruneRecommenders(G, 12)
    removeThinRecs(G, 3)

    # Tighter filtering on books and their connections.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneAuthorCons(G, int(n / 5) + 3)
    pruneRecommenders(G, 12 - min(4, n / 20))
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    pruneTags(G, 6)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # Narrow down towards n books.
    removeKeepBest(G, n + math.ceil(n / 20) + 3, maxDistForRead=1.5)
    removeEdge(G)
    removeKeepBest(G, n + 1, maxDistForRead=1.25)
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    removeUselessReadBooks(G)
    removeThinRecs(G, 2 + 1 * (n > 20 and not v3d))
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2021-10-13 15:10:12 +02:00
def listScores(G, mu, std, n):
    """Reduce ``G`` to the ``n`` best-scored books and prepare labels/sizes.

    ``mu`` and ``std`` are accepted for signature symmetry with the other
    modes and are not used.
    """
    removeRestOfSeries(G)
    removeKeepBest(G, n, maxDistForRead=10)
    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def fullGraph(G, removeTopListsB=True):
    """Lightly prune ``G`` in place for displaying the whole library."""
    removeEdge(G)
    removeHighSpanTags(G, 7)
    removeDangling(G, alsoBooks=False)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)
    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def recommenderCompetence(G):
    """Reduce ``G`` to rated books and their recommenders, then adjust scores.

    After pruning and labelling, each recommender's ``score`` is lowered by
    its ``se``. Recommenders without an ``se`` get a falsy score replaced by
    0 and the result halved. The labels are written before this adjustment.
    """
    # removeRead(G)
    removeUnread(G)
    removeTags(G)
    removeAuthors(G)
    removeSeries(G)
    removeTopLists(G)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'recommender':
            continue
        if 'se' in attrs:
            attrs['score'] -= attrs['se'] * 1
            continue
        # NOTE(review): the source lost its indentation; the halving is
        # assumed to belong to this no-'se' branch only — confirm.
        if not attrs['score']:
            attrs['score'] = 0
        attrs['score'] /= 2
def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True):
    """Reduce ``G`` in place to the rated books and their context.

    Args:
        minRating: Score threshold passed to ``removeBad``.
        showAllTags: When false, sparsely connected non-book nodes and
            high-span tags are removed.
        removeUnconnected: Also remove books left without any neighbour.
        removeTopListsB: Remove all top-list nodes.
    """
    removeUnread(G)
    removeBad(G, minRating)
    if not showAllTags:
        removeEdge(G)
        # NOTE(review): the source lost its indentation; this call is
        # assumed to belong to the `not showAllTags` branch — confirm.
        removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=removeUnconnected)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 8)
    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-02-15 19:35:03 +01:00
def progress(G, books, mu, minimum=3.5):
    """Print reading-progress statistics for the library.

    New books are first added to ``G`` via ``findNewBooks``. Then ``book``
    and ``newBook`` nodes are counted: library books, all books, rated
    (read) books, and recommended books (unrated, with a score of at least
    ``minimum`` or a ``std`` of exactly 0). Progress is the share of read
    books among read plus recommended books.

    If there are neither read nor recommended books, progress is reported as
    0 % instead of raising ``ZeroDivisionError``.
    """
    findNewBooks(G, books, mu, -1, minRecSco=minimum)
    bookCount = 0
    libCount = 0
    readCount = 0
    toReadCount = 0
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ['book', 'newBook']:
            continue
        if node['t'] == 'book':
            libCount += 1
        bookCount += 1
        if node.get('rating') is not None:
            readCount += 1
        elif 'score' in node and (node['score'] >= minimum or 'std' in node and node['std'] == 0.0):
            toReadCount += 1

    relevant = toReadCount + readCount
    perc = round(readCount / relevant * 100, 2) if relevant else 0.0
    print('Books in library: '+str(libCount))
    print('Books in CaliGraph: '+str(bookCount))
    print('Read Books: '+str(readCount))
    print('Unread Books: '+str(bookCount-readCount))
    print('Recommended Books (score > '+str(round(minimum, 2))+'): '+str(toReadCount))
    print('Progress: '+str(perc)+'%')
2021-06-14 22:20:36 +02:00
2022-02-11 17:50:07 +01:00
def analyze(G, books, mu, type_name, name, dist=2.1):
    """Reduce ``G`` to the neighbourhood of the node best matching ``name``.

    Nodes of type ``type_name`` (or every node for ``"any"``) are compared
    with ``name``: an exact label match wins immediately, otherwise the
    label with the highest fuzzy ratio is used. New books are then added,
    the neighbourhood is collected by ``waveFlow`` with budget ``dist``, and
    everything outside it is removed. The matched node is marked with a star
    shape (if it has none) and ``*...*`` around its label.
    """
    from fuzzywuzzy import fuzz
    full_name = type_name[0] + "/" + name

    bestRatio, match, n = 0, None, 0
    for candidate in list(G.nodes):
        attrs = G.nodes[candidate]
        if not (attrs['t'] == type_name or type_name == "any"):
            continue
        if attrs['label'] in (name, full_name):
            match, n = attrs, candidate
            break
        ratio = fuzz.ratio(attrs['label'], name)
        if ratio > bestRatio:
            bestRatio, match, n = ratio, attrs, candidate
    if bestRatio < 70:
        print("Best Match: "+match['label'])

    findNewBooks(G, books, mu, num=-1, minRecSco=1)
    menge = set()
    waveFlow(G, match, n, dist, menge)
    for key in list(G.nodes):
        if key not in menge:
            G.remove_node(key)

    # NOTE(review): the source lost its indentation; the grouping of the
    # calls under the two `dist` checks below is reconstructed — confirm.
    if dist >= 2:
        removeThinRecs(G, 2)
        removeHighSpanTags(G, 12)
    if dist > 1:
        removeDangling(G, True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    # match['value'] = 100
    if 'shape' not in match:
        match['shape'] = 'star'
    addScoreToLabels(G)
    match['label'] = "*"+match['label']+"*"
def waveFlow(G, node, n, dist, menge, firstEdge=False):
    """Recursively collect the neighbourhood of node ``n`` into ``menge``.

    Args:
        G: The graph.
        node: Attribute dict of ``n``.
        n: Id of the node to expand.
        dist: Remaining budget; each level costs 1 (tags an extra 0.1).
        menge: Set of collected node ids, filled in place. An empty set
            marks the starting call.
        firstEdge: Passed down unchanged; forced to true on the starting
            call.

    Top lists are only collected when ``firstEdge`` is set and are never
    expanded; tags are only expanded when ``firstEdge`` is set. At each
    level the best-scored neighbours are followed, their number depending on
    ``dist``; when that number is not positive, the best-rated neighbours
    are followed instead. Neighbours with neither score nor rating get a
    ``score`` of 0 as a side effect.
    """
    if dist <= 0:
        return
    dist -= 1
    if not menge:
        firstEdge = True

    # NOTE(review): the source lost its indentation; the `return` is assumed
    # to apply to every top list, not only when firstEdge is set — confirm.
    if node['t'] == 'topList':
        if firstEdge:
            menge.add(n)
        return
    menge.add(n)
    if node['t'] == 'tag':
        if not firstEdge:
            return
        dist -= 0.1

    bestlist = []
    keeplist = []
    for m in list(G.adj[n]):
        neighbour = G.nodes[m]
        if neighbour['t'] in ['NOTHING']:
            continue
        if neighbour.get('score') is not None:
            bestlist.append(neighbour)
        elif neighbour.get('rating') is not None:
            keeplist.append(neighbour)
        else:
            neighbour['score'] = 0
            bestlist.append(neighbour)
    bestlist.sort(key=lambda attrs: attrs['score'], reverse=True)

    toKeep = min(int(dist*10), math.ceil(len(bestlist) * dist - len(keeplist)*0.5))
    if toKeep > 0:
        bestlist = bestlist[:toKeep]
    else:
        keeplist.sort(key=lambda attrs: attrs['rating'], reverse=True)
        keeplist = keeplist[:min(int(dist*10), int(len(keeplist) * dist))]
        bestlist = []

    for m in list(G.adj[n]):
        neighbour = G.nodes[m]
        if neighbour in bestlist or neighbour in keeplist:
            waveFlow(G, neighbour, m, dist, menge, firstEdge=firstEdge)
def gensimTokensForLines(lines, tokens_only=False):
    """Yield gensim-preprocessed tokens for every entry of ``lines``.

    Args:
        lines: iterable of strings.
        tokens_only: when True yield the plain token list of each line;
            when False (default) yield a ``TaggedDocument`` tagged with the
            line's index, i.e. the form used as training data.

    Yields:
        A token list or a ``gensim.models.doc2vec.TaggedDocument`` per line.
    """
    # Imported here because the module does not import gensim at file level;
    # the import only runs once the generator is first advanced.
    import gensim

    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def buildDoc2Vec(books):
    """Unfinished stub for building a Doc2Vec model.

    NOTE(review): ``G`` and ``lines`` are neither parameters nor assigned in
    this function, and ``books`` is never used. Unless both exist as
    module-level globals, calling this raises NameError. The loop over the
    book nodes has an empty body (``pass``) and the generator returned by
    ``gensimTokensForLines`` is discarded without being iterated.
    """
    import gensim
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            # Placeholder: nothing is done with book nodes yet.
            pass
    gensimTokensForLines(lines)
def shell(G, books, mu, std):
    """Open an interactive ptpython REPL.

    The REPL is given the module globals and this function's locals, so the
    arguments ``G``, ``books``, ``mu`` and ``std`` are available by name.
    """
    from ptpython.repl import embed
    # locals() is evaluated in this frame, which is what exposes the arguments.
    embed(globals(), locals())
def newBooks(G, books, num, mu, std):
    """Prepare ``G`` for the 'newBooks' view by running the helper pipeline.

    The graph is modified in place by calling, in this order: ``removeBad``
    (threshold ``mu - 2*std``), ``findNewBooks`` (limited to ``num`` with
    ``minRecSco = mu - std``), ``removeThinRecs``, a series of ``remove*``
    helpers, ``removeDangling`` and finally the scaling / labelling helpers.
    """
    lowerBound = mu-std*2
    removeBad(G, lowerBound)
    findNewBooks(G, books, mu, num, minRecSco=mu-std)
    removeThinRecs(G, 2)

    # Helpers that only take the graph, applied strictly in this order.
    pruneSteps = (
        removeUnread,
        removeUselessReadBooks,
        removeTags,
        removeTopLists,
        removeSeries,
        removeEdge,
    )
    for prune in pruneSteps:
        prune(G)

    removeDangling(G, alsoBooks=True)

    for finish in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        finish(G)
2022-02-11 17:50:07 +01:00
def findNewBooks(G, books, mu, num=-1, minRecSco=5):
    """Add not-yet-owned books from the recommendation CSV to ``G`` and score them.

    For every recommender node that has a score, the rows of
    ``rec_dbs/mrb_db.csv`` whose ``recommender`` column contains the
    recommender's label are looked up. Books whose title is not in ``books``
    are added as ``newBook`` nodes, connected to the recommender and to an
    author node. Each new book then gets a ``score``, ``fake_se`` and
    ``value`` derived from its recommenders' ``score`` / ``se`` and ``mu``,
    and its label is extended with the score and the author.

    Args:
        G: graph, modified in place.
        books: iterable of dicts with a ``'title'`` key (the known books).
        mu: mean score, mixed into every new book's score as a prior.
        num: when not -1, ``removeKeepBest(G, num, 10, 'newBook')`` is called
            at the end.
        minRecSco: accepted for compatibility; not used by this function.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    # Built once: membership is tested for every CSV row of every recommender.
    knownTitles = {b['title'] for b in books}

    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'recommender' or 'score' not in node:
            continue
        # Literal substring match: a label is a name, not a regular expression,
        # and empty (NaN) recommender cells must count as "no match".
        mask = mrbdf['recommender'].str.contains(node['label'], regex=False, na=False)
        oldBooks = []
        unseenBooks = []
        for book in mrbdf[mask].to_dict(orient='records'):
            entry = {'title': book['title'], 'author': book['author']}
            if book['title'] in knownTitles:
                oldBooks.append(entry)
            else:
                unseenBooks.append(entry)
        recs.append({'name': node['label'], 'rec': node, 'newBooks': unseenBooks, 'oldBooks': oldBooks})

    for rec in recs:
        recNode = rec['rec']
        recKey = 'r/'+recNode['label']
        for book in rec['newBooks']:
            bookKey = 'n/'+book['title']
            authorKey = 'a/'+book['author']
            G.add_node(bookKey, color='blue', t='newBook', label=book['title'], author=book['author'])
            G.add_node(recKey, color='orange', t='recommender', label=recNode['label'], score=recNode['score'])
            G.add_edge(recKey, bookKey, color='blue')
            G.add_node(authorKey, color='green', t='author', label=book['author'])
            G.add_edge(authorKey, bookKey, color='blue')

    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'newBook':
            continue
        ses = []
        scores = []
        for m in list(G.adj[n]):
            adj = G.nodes[m]
            if adj['t'] == 'recommender' and adj['score'] is not None:
                scores.append(adj['score'])
                ses.append(adj['se'])
        if not scores:
            # No scored recommender vouches for this book.
            G.remove_node(n)
            continue
        # Pad with the smallest SE and the global mean as a prior.
        ses.append(min(ses))
        scores.append(mu)
        node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==2) # This is not how SE works. DILLIGAF?
        node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2 + 0.5 - 0.1/math.sqrt(len(scores))
        if len(scores)==2:
            # Only a single real recommender: discount the score.
            node['score'] *= 0.80
        node['value'] = 20 + 5 * float(node['score'])
        node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
        node['label'] += '\n ' + node['author']

    if num != -1:
        removeKeepBest(G, num, 10, 'newBook')
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
# but might be necessary to enable later for a larger library for better training performance...
# maybe try again for 128 books?
def evaluateFitness(books, batchSize=-1, debugPrint=False):
    """Leave-one-out evaluation of the global ``weights`` on the rated books.

    A fresh graph is built from ``books``. For every rated book in the batch
    its rating is hidden, the scores are regenerated and the squared error
    between the predicted score and the real rating is recorded
    (over-estimates count double). A finite-difference gradient is estimated
    per weight by scaling that weight by 1.001 in the book's
    ``_act`` / ``_wgh`` entries (presumably written by ``genScores``).

    Args:
        books: book records the graph is built from.
        batchSize: number of rated books to evaluate; -1 (default) or a value
            not smaller than the number of rated books evaluates all of them.
            The batch is drawn once per call.
        debugPrint: print the mean squared error and the weighted
            regularisation term.

    Returns:
        ``(fit, gradient)``: ``fit`` is the mean squared error plus
        ``0.001 *`` the regularisation loss; ``gradient`` maps every weight
        name to its averaged gradient estimate.

    Raises:
        ZeroDivisionError: if no rated book was evaluated.
    """
    global weights
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)

    ratedBooks = [n for n in G.nodes if G.nodes[n].get('rating') is not None]
    # Draw the mini-batch once; a set makes the per-node membership test O(1).
    if batchSize != -1 and len(ratedBooks) > batchSize:
        batch = set(random.sample(ratedBooks, batchSize))
    else:
        batch = set(ratedBooks)

    errSq = []
    gradient = {wt: 0 for wt in weights}
    genScores(G, books)
    for b in G.nodes:
        if b not in batch:
            continue
        rating = G.nodes[b]['rating']
        # Hide the rating so the score is predicted without knowing it.
        G.nodes[b]['rating'] = None
        genScores(G, books, calcPagerank=False)
        score = G.nodes[b]['score']
        sqErr = (rating - score)**2
        if score > rating: # over estimated
            errSq.append(sqErr*2)
        else:
            errSq.append(sqErr)
        G.nodes[b]['rating'] = rating
        acts = G.nodes[b]['_act']
        wghs = G.nodes[b]['_wgh']
        for wt in weights:
            # Score the book would get if weight `wt` were 0.1% larger.
            numer = sum(
                a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1)
                for a, w in zip(acts, wghs)
            )
            denom = sum(
                (1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1)
                for w in wghs
            )
            scoreB = numer/denom
            gradient[wt] += (sqErr - (rating - scoreB)**2)*1000

    regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
    for wt in weights:
        # Pull weights towards 0, ten times harder once they leave [-1, 1].
        if abs(weights[wt]) > 1.0:
            gradient[wt] -= weights[wt]*10
        else:
            gradient[wt] -= weights[wt]*1
    for g in gradient:
        gradient[g] /= len(errSq)

    mse = sum(errSq)/len(errSq)
    if debugPrint:
        print(mse, 0.001*regressionLoss)
    fit = mse + 0.001*regressionLoss
    return fit, gradient
2021-09-24 16:13:55 +02:00
2021-11-23 20:51:24 +01:00
def train(initGamma, full=True):
    """Optimise the global ``weights`` with normalised gradient steps.

    Args:
        initGamma: initial step size; also the value ``gamma`` is reset to
            after a restart.
        full: when True the weights are re-initialised randomly before
            training, a stricter initial ``goal`` is used and stagnation
            always triggers a restart instead of ending the training.

    Side effects:
        Rebinds / mutates the module-level ``weights``, persists weights via
        ``saveWeights`` and prints the progress of every iteration.
    """
    global weights
    if full:
        for wt in weights:
            weights[wt] = random.random()
    saveWeights(weights)
    gamma = initGamma
    books = loadBooksFromDB()
    bestWeights = copy.copy(weights)
    mse, gradient = evaluateFitness(books)
    # Squared norm of the gradient.
    delta = sum(gradient[g]**2 for g in gradient)
    best_mse = mse
    stagLen = 0
    goal = 1.0e-4
    if full:
        goal = 1.0e-5

    # `and` binds tighter than `or`: keep going while both gamma and delta are
    # above the goal, or unconditionally while the best mse is still above 15.
    while gamma > goal and delta > goal or best_mse > 15:
        # The goal grows by 10% per iteration, loosening the stop criterion.
        goal *= 1.1
        last_mse = mse
        print({'mse': mse, 'gamma': gamma, 'delta': delta})
        delta = sum(gradient[g]**2 for g in gradient)
        for wt in weights:
            # Step of length gamma*0.1 along the normalised gradient.
            # NOTE(review): divides by zero if every gradient entry is 0.
            weights[wt] += gamma*0.1*gradient[wt]/math.sqrt(delta)
        mse, gradient = evaluateFitness(books)
        # Adaptive step size: grow on improvement, shrink sharply otherwise.
        if mse < last_mse:
            gamma = gamma*1.25
        else:
            gamma *= 0.25
        if mse < best_mse:
            # New best result: persist it and remember it for restarts.
            saveWeights(weights)
            bestWeights = copy.copy(weights)
            best_mse = mse
        if mse > last_mse:
            stagLen += 1
        else:
            stagLen = 0
        if stagLen == 4 or mse > 50:
            # Four worsening steps in a row, or a very large error.
            if full or mse > 10:
                stagLen = 0
                gamma = initGamma
                if random.random() < 0.50:
                    # Restart from fresh random weights in [-0.5, 1.5).
                    for wt in weights:
                        weights[wt] = random.random()*2-0.5
                else:
                    # Restart from the best weights, jittered by up to +/-2.5%.
                    weights = copy.copy(bestWeights)
                    for wt in weights:
                        weights[wt] *= 0.975+0.05*random.random()
            else:
                break
    print('Done.')
2021-09-24 16:13:55 +02:00
def saveWeights(weights):
    """Serialise ``weights`` as JSON into ``neuralWeights.json`` (overwriting it)."""
    with open('neuralWeights.json', 'w') as target:
        json.dump(weights, target)
def loadWeights():
    """Return the weights stored in ``neuralWeights.json``.

    If the file cannot be opened or read (``OSError``) a built-in set of
    default weights is returned instead. Invalid JSON is not handled here.
    """
    try:
        with open('neuralWeights.json', 'r') as source:
            return json.load(source)
    except OSError:
        # Defaults used when no trained weights are available.
        # (a "tgb_rank": 0.10 entry is currently disabled)
        return {
            "topList": 0.15,
            "recommender": 0.30,
            "author": 0.70,
            "series": 0.05,
            "tag": 0.05,
            "pagerank": 0.05,
            "mu": 0.50,
            "sigma": 0.30,
            "bias": 0.25,
            "median": 0.10,
        }
def cliInterface():
    """Parse the command line and dispatch to ``perfTestCLI`` or ``mainCLI``.

    Builds the argparse parser (global switches plus one sub-parser per
    command), parses ``sys.argv`` and runs the selected entry point:
    ``perfTestCLI`` when ``--perf-test`` was given, ``mainCLI`` otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description='TODO: Write Description.')

    # Global on/off switches; the order is kept as it defines the help output.
    switches = (
        '--keep-priv',
        '--keep-whitepapers',
        '--remove-read',
        '--remove-unread',
        '--no-web',
        '--no-list',
        '--remove-edge',
        '--keep-top-lists',
        '--keep-useless-recommenders',
        '--dark',
        '--v3d',
        '--imgs',
        '--perf-test',
        '--train',
    )
    for switch in switches:
        parser.add_argument(switch, action="store_true")

    cmds = parser.add_subparsers(required=True, dest='cmd')

    def command(name, *aliases):
        # Every sub-command shares the same placeholder description.
        return cmds.add_parser(name, description="TODO", aliases=list(aliases))

    recommend = command('recommend', 'rec')
    recommend.add_argument('-n', type=int, default=20, help='number of books to recommend')
    recommend.add_argument('--tag-based', action="store_true")
    recommend.add_argument('--recommender-based', action="store_true")
    recommend.add_argument('--new', type=int, default=-1, help='number of new books to recommend')

    listing = command('listScores', 'ls')
    listing.add_argument('-n', type=int, default=50, help='number of books to recommend')

    reading = command('read')
    reading.add_argument('--min-rating', type=int, default=0)
    reading.add_argument('--all-tags', action="store_true")
    reading.add_argument('--only-connected', action="store_true")

    analysis = command('analyze')
    analysis.add_argument('type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag'])
    analysis.add_argument('name', type=str)
    analysis.add_argument('-d', type=float, default=2.1, help='depth of expansion')

    training = command('train')
    training.add_argument('-g', type=float, default=0.2, help='learning rate gamma')
    training.add_argument('--full', action="store_true")

    progressCmd = command('progress')
    progressCmd.add_argument('-m', type=float, default=7, help='Mimimum Score to read')

    command('competence')
    command('shell')

    fresh = command('newBooks')
    fresh.add_argument('-n', type=int, default=10, help='number of books to recommend')

    command('full')

    args = parser.parse_args()

    runner = perfTestCLI if args.perf_test else mainCLI
    runner(args)
def perfTestCLI(args):
    """Run ``mainCLI(args)`` under pycallgraph.

    The call graph is written as a PNG to ``perfTests/<unix timestamp>.png``;
    calls matching ``pycallgraph.*`` and ``numpy.*`` are excluded from the trace.
    """
    import time
    from pycallgraph import Config, GlobbingFilter, PyCallGraph
    from pycallgraph.output import GraphvizOutput

    excluded = [
        "pycallgraph.*",
        "numpy.*",
    ]
    config = Config()
    config.trace_filter = GlobbingFilter(exclude=excluded)

    imagePath = 'perfTests/' + str(int(time.time())) + '.png'
    tracer = PyCallGraph(output=GraphvizOutput(output_file=imagePath), config=config)
    with tracer:
        mainCLI(args)
def mainCLI(args):
    """Execute the sub-command selected on the command line.

    ``train`` runs the training and exits. Every other command builds the
    full graph, generates the scores, runs the command-specific graph
    transformation and then the shared post-processing (privacy / read
    filters, dangling-node removal, optional list output and web view).
    ``progress`` returns right after its own output.

    Args:
        args: the ``argparse.Namespace`` produced by ``cliInterface``.

    Raises:
        Exception: for an unknown command, or when ``--tag-based`` and
            ``--recommender-based`` are combined.
        SystemExit: after the ``train`` command finished.
    """
    # argparse stores the sub-command exactly as typed, so the aliases
    # declared in cliInterface have to be mapped back to their command.
    aliases = {'rec': 'recommend', 'ls': 'listScores'}
    cmd = aliases.get(args.cmd, args.cmd)

    if cmd == "train":
        train(args.g, args.full)
        # `exit()` only exists when the site module is loaded.
        raise SystemExit
    if args.train:
        # Quick, non-full training run before the actual command.
        train(0.2, False)

    bestListT = 'book'
    G, books = buildFullGraph(darkMode=args.dark)
    mu, std = genScores(G, books)
    if not args.keep_whitepapers:
        removeWhitepapers(G)

    if cmd == "recommend":
        if args.new == -1:
            # Default: one new book for every five recommendations.
            args.new = int(args.n / 5)
        if args.new != 0:
            findNewBooks(G, books, mu, args.new, minRecSco=mu-std)
        if args.tag_based:
            if args.recommender_based:
                raise Exception('tag-based and recommender-based can not be combined')
            recommendNBooksTagBased(G, mu, std, args.n, not args.keep_top_lists)
        elif args.recommender_based:
            recommendNBooksRecommenderBased(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders)
        else:
            recommendNBooks(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders, args.v3d)
    elif cmd == "listScores":
        listScores(G, mu, std, args.n)
    elif cmd == "read":
        readBooksAnalysis(G, args.min_rating, args.all_tags, args.only_connected, not args.keep_top_lists)
    elif cmd == "analyze":
        analyze(G, books, mu, args.type, args.name, args.d)
    elif cmd == "full":
        fullGraph(G, not args.keep_top_lists)
    elif cmd == "competence":
        bestListT = 'recommender'
        recommenderCompetence(G)
    elif cmd == "shell":
        shell(G, books, mu, std)
    elif cmd == "progress":
        progress(G, books, mu, args.m)
        return
    elif cmd == "newBooks":
        bestListT = 'newBook'
        newBooks(G, books, args.n, mu, std)
    else:
        raise Exception("Bad")

    if not args.keep_priv:
        removePriv(G)
    if args.remove_read:
        removeRead(G)
    elif args.remove_unread:
        removeUnread(G)

    removeDangling(G, alsoBooks=True)
    if args.remove_edge:
        removeEdge(G)

    if not args.no_list:
        printBestList(G, t=bestListT)
    # listScores is a text-only command and never opens the web view.
    if not args.no_web and cmd != 'listScores':
        if args.v3d:
            genAndShow3D(G, darkMode=args.dark)
        else:
            if args.imgs:
                addImagesToNodes(G)
            genAndShowHTML(G, darkMode=args.dark)
2021-06-14 22:20:36 +02:00
2021-09-24 16:13:55 +02:00
# Module-level weights, loaded once at import time; evaluateFitness() and
# train() access them through ``global weights``.
weights = loadWeights()

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    cliInterface()