CaliGraph/caliGraph.py

1769 lines
58 KiB
Python
Raw Permalink Normal View History

#!./.venv/bin/python3.10
2021-06-14 22:20:36 +02:00
import os
import json
import math
2021-09-24 16:13:55 +02:00
import copy
2021-06-14 22:20:36 +02:00
import random
2023-01-17 23:18:26 +01:00
import argcomplete, pyzshcomplete
# Enable autocomplete in global completion mode for bash & zsh:
# PYTHON_ARGCOMPLETE_OK
# PYZSHCOMPLETE_OK
2021-06-14 22:20:36 +02:00
2023-01-17 23:18:26 +01:00
# performance hack; only load these after validating cmd args
def defered_imports():
    """Import the heavy third-party modules and bind them as module globals.

    Kept out of the module top level as a start-up optimisation; the caller
    is expected to invoke this once before any function that uses the names
    declared global below.
    """
    global re, requests, np, pd, norm, plt, nx, Network, go, wikipedia

    import re
    import requests

    import numpy as np
    import pandas as pd
    from scipy.stats import norm

    import matplotlib.pyplot as plt
    import networkx as nx
    from pyvis.network import Network
    import plotly.graph_objects as go

    import wikipedia
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
2022-03-19 11:35:30 +01:00
class Error(Exception):
    """Application-level exception type of this script."""
2022-09-11 18:56:47 +02:00
2021-06-14 22:20:36 +02:00
def getAllAuthors(books):
    """Return the distinct author names over all books (order unspecified)."""
    return list({name for entry in books for name in getAuthors(entry)})
def getAuthors(book):
    """Split the book's ' & '-joined ``authors`` string into a list of names."""
    joinedAuthors = book['authors']
    return joinedAuthors.split(' & ')
def getRecommenders(book):
    """Return the distinct recommender names encoded in the book's tags.

    A tag counts as a recommendation when it contains one of the markers
    below; the recommender name is the tag with that marker removed. Only the
    first matching marker (in the listed order) is applied to a tag.
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    found = set()
    for tag in book['tags']:
        for marker in markers:
            if marker in tag:
                found.add(tag.replace(marker, ""))
                break
    return list(found)
2021-06-14 22:20:36 +02:00
def getTags(book):
    """Yield the book's plain tags, skipping recommender and top-list tags."""
    special = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for tag in book['tags']:
        if not any(marker in tag for marker in special):
            yield tag
def getAllRecommenders(books):
    """Return the distinct recommender names over all books (order unspecified)."""
    return list({rec for entry in books for rec in getRecommenders(entry)})
def getTopLists(book):
    """Return the distinct top-list names of a book.

    A top-list tag has the form ``"<name> Top <scope>"``; the name is the
    part before the first ``" Top "``.
    """
    return list({tag.split(" Top ")[0] for tag in book['tags'] if " Top " in tag})
def getAllTopLists(books):
    """Return the distinct top-list names over all books (order unspecified)."""
    return list({top for entry in books for top in getTopLists(entry)})
def getAllSeries(books):
    """Return the distinct series names of all books that have a 'series' key."""
    return list({entry['series'] for entry in books if 'series' in entry})
def getAllTags(books):
    """Return the distinct plain tags over all books (order unspecified)."""
    return list({tag for entry in books for tag in getTags(entry)})
2021-09-24 16:13:55 +02:00
def getTopListWeight(book, topList):
    """Return the edge weight of a book's membership in the given top list.

    The book's tags are searched for ``"<topList> Top <scope>"`` entries and
    the smallest scope found decides the weight: 10 -> 1, 25 -> 0.85,
    100 -> 0.5, anything else -> 50 / scope.

    The list name must match exactly (the part before the first " Top "), so
    a tag of another list whose name merely ends with ``topList`` is ignored.

    Raises:
        ValueError: if the book carries no tag for ``topList``.
    """
    scopes = []
    for tag in book['tags']:
        parts = tag.split(" Top ")
        if len(parts) > 1 and parts[0] == topList:
            scopes.append(int(parts[1]))
    if not scopes:
        raise ValueError(
            "Book has no '" + str(topList) + " Top <N>' tag; cannot derive a weight")
    minScope = min(scopes)
    fixedWeights = {10: 1, 25: 0.85, 100: 0.5}
    if minScope in fixedWeights:
        return fixedWeights[minScope]
    return 50 / minScope
2021-06-14 22:20:36 +02:00
def removeRead(G):
    """Remove every book node whose 'rating' is set (i.e. not None)."""
    rated = [key for key in list(G.nodes)
             if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is not None]
    for key in rated:
        G.remove_node(key)
def removeUnread(G):
    """Remove every book node whose 'rating' is None."""
    unrated = [key for key in list(G.nodes)
               if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is None]
    for key in unrated:
        G.remove_node(key)
def removePriv(G):
    """Remove every book node that carries the 'priv' tag."""
    private = [key for key in list(G.nodes)
               if G.nodes[key]['t'] == 'book' and 'priv' in G.nodes[key]['tags']]
    for key in private:
        G.remove_node(key)
2022-09-11 18:56:47 +02:00
def removeWhitepapers(G):
    """Remove every book node tagged 'whitepaper' or 'Lernzettel'."""
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] != 'book':
            continue
        tags = data['tags']
        if 'whitepaper' in tags or 'Lernzettel' in tags:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
def removeDangling(G, alsoBooks=False):
    """Remove nodes without any neighbour.

    Book nodes are only considered when ``alsoBooks`` is true.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book' and not alsoBooks:
            continue
        if len(G.adj[key]) == 0:
            G.remove_node(key)
2022-09-11 18:56:47 +02:00
def removeThinRecs(G, minCons=3):
    """Remove recommender nodes that have fewer than ``minCons`` neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        if len(G.adj[key]) < minCons:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
2021-06-14 22:20:36 +02:00
def removeEdge(G):
    """Remove non-book nodes that have fewer than two neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book':
            continue
        if len(G.adj[key]) < 2:
            G.remove_node(key)
def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author', 'series', 'tag']):
    """Remove scored nodes of the given types whose score is None or below ``threshold``.

    Nodes without a 'score' key are left untouched.
    """
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] not in groups or 'score' not in data:
            continue
        if data['score'] is None or data['score'] < threshold:
            G.remove_node(key)
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
    """Keep only the ``num`` best-scored nodes of type ``forType``.

    Nodes of that type outside the top ``num`` (and any node whose score is
    None) are removed, unless they carry a rating that is at least the score
    of the worst kept node minus ``maxDistForRead``.

    When no scored node of ``forType`` exists there is no cutoff to compare
    against; rated nodes are then kept (the original code raised IndexError).
    """
    scored = [n for n in G.nodes
              if G.nodes[n]['t'] == forType and G.nodes[n].get('score') is not None]
    # sort() is stable, so ties keep graph order just like the original
    scored.sort(key=lambda n: G.nodes[n]['score'], reverse=True)
    kept = scored[:num]
    # membership by node key instead of comparing whole attribute dicts
    keptKeys = set(kept)
    if kept:
        cutoff = G.nodes[kept[-1]]['score'] - maxDistForRead
    else:
        cutoff = -math.inf
    for n in list(G.nodes):
        node = G.nodes[n]
        surplus = node['t'] == forType and n not in keptKeys
        unscored = 'score' in node and node['score'] is None
        if not (surplus or unscored):
            continue
        rating = node.get('rating')
        if rating is None or rating < cutoff:
            G.remove_node(n)
def removeTags(G):
    """Remove all tag nodes from the graph."""
    for key in [k for k in list(G.nodes) if G.nodes[k]['t'] == 'tag']:
        G.remove_node(key)
def pruneTags(G, minCons=2, forceKeepLabels=[]):
    """Remove tag nodes whose books are already well connected otherwise.

    Nodes are visited in ascending order of ``score + len(feedbacks)/5``
    (0 when either key is missing). For each tag, every neighbour of every
    adjacent book contributes to a connection count: top lists 0,
    recommenders 0.5, tags and series 0.25, everything else 1. The tag is
    removed when that count exceeds ``minCons`` and its label is not listed
    in ``forceKeepLabels``.
    """
    def visitOrder(key):
        data = G.nodes[key]
        if 'score' in data and 'feedbacks' in data:
            return data['score'] + len(data['feedbacks'])/5
        return 0

    for key in sorted(list(G.nodes), key=visitOrder):
        data = G.nodes[key]
        if data['t'] != 'tag':
            continue
        foundCon = 0
        for book in G.adj[key]:
            for con in G.adj[book]:
                kind = G.nodes[con]['t']
                if kind == 'topList':
                    continue
                if kind == 'recommender':
                    foundCon += 0.5
                elif kind in ('tag', 'series'):
                    foundCon += 0.25
                else:
                    foundCon += 1
        if foundCon > minCons and data['label'] not in forceKeepLabels:
            G.remove_node(key)
def pruneRecommenders(G, minCons=2):
    """Remove recommender nodes whose books are already well connected.

    Nodes are visited in ascending 'score' order (0 when missing). For each
    recommender, every neighbour of every adjacent book contributes to a
    connection count: top lists 0, recommenders 0.5, tags and series 0.25,
    everything else 1. The recommender is removed when the count exceeds
    ``minCons``.
    """
    def visitOrder(key):
        data = G.nodes[key]
        return data['score'] if 'score' in data else 0

    for key in sorted(list(G.nodes), key=visitOrder):
        if G.nodes[key]['t'] != 'recommender':
            continue
        foundCon = 0
        for book in G.adj[key]:
            for con in G.adj[book]:
                kind = G.nodes[con]['t']
                if kind == 'topList':
                    continue
                if kind == 'recommender':
                    foundCon += 0.5
                elif kind in ('tag', 'series'):
                    foundCon += 0.25
                else:
                    foundCon += 1
        if foundCon > minCons:
            G.remove_node(key)
def pruneRecommenderCons(G, maxCons=5):
    """Thin out the books of recommenders with more than ``maxCons`` neighbours.

    For such a recommender the ``maxCons`` best-scored adjacent books are
    kept. Any other adjacent book (or neighbour whose score is None) is
    removed from the graph when it has no rating and fewer than two
    non-top-list neighbours of its own.
    """
    for n in list(G.nodes):
        # Books are removed while we walk a snapshot of the node list; skip
        # entries that are already gone instead of failing on the lookup.
        if n not in G.nodes:
            continue
        node = G.nodes[n]
        if node['t'] != 'recommender' or len(G.adj[n]) <= maxCons:
            continue
        scored = [m for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None]
        scored.sort(key=lambda m: G.nodes[m]['score'], reverse=True)
        # membership by node key instead of comparing whole attribute dicts
        keep = set(scored[:maxCons])
        for m in list(G.adj[n]):
            book = G.nodes[m]
            surplus = book['t'] == 'book' and m not in keep
            unscored = 'score' in book and book['score'] is None
            if not (surplus or unscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
2022-09-11 18:56:47 +02:00
def pruneAuthorCons(G, maxCons=3):
    """Thin out the books of authors with more than ``maxCons`` neighbours.

    For such an author the ``maxCons`` best-scored adjacent books are kept.
    Any other adjacent book (or neighbour whose score is None) is removed
    from the graph when it has no rating and fewer than two non-top-list
    neighbours of its own.
    """
    for n in list(G.nodes):
        # Books are removed while we walk a snapshot of the node list; skip
        # entries that are already gone instead of failing on the lookup.
        if n not in G.nodes:
            continue
        node = G.nodes[n]
        if node['t'] != 'author' or len(G.adj[n]) <= maxCons:
            continue
        scored = [m for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None]
        scored.sort(key=lambda m: G.nodes[m]['score'], reverse=True)
        # membership by node key instead of comparing whole attribute dicts
        keep = set(scored[:maxCons])
        for m in list(G.adj[n]):
            book = G.nodes[m]
            surplus = book['t'] == 'book' and m not in keep
            unscored = 'score' in book and book['score'] is None
            if not (surplus or unscored):
                continue
            if book.get('rating') is not None:
                continue
            foundCon = sum(1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
            if foundCon < 2:
                G.remove_node(m)
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
def removeHighSpanTags(G, maxCons=5, forceKeepLabels=[]):
    """Remove tag nodes with more than ``maxCons`` neighbours.

    Tags whose label is listed in ``forceKeepLabels`` are always kept.
    """
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] != 'tag':
            continue
        tooBroad = len(G.adj[key]) > maxCons
        if tooBroad and data['label'] not in forceKeepLabels:
            G.remove_node(key)
2021-09-24 18:25:37 +02:00
def removeHighSpanReadBooks(G, maxCons=8):
    """Remove rated books that are adjacent to more than ``maxCons`` recommenders."""
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] != 'book' or data['rating'] is None:
            continue
        recommenderCount = sum(
            1 for adj in G.adj[key] if G.nodes[adj]['t'] == 'recommender')
        if recommenderCount > maxCons:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
def removeTopLists(G):
    """Remove all top-list nodes from the graph."""
    for key in [k for k in list(G.nodes) if G.nodes[k]['t'] == 'topList']:
        G.remove_node(key)
2022-09-11 18:56:47 +02:00
def removeRecommenders(G):
    """Remove all recommender nodes from the graph."""
    for key in [k for k in list(G.nodes) if G.nodes[k]['t'] == 'recommender']:
        G.remove_node(key)
2022-09-11 18:56:47 +02:00
def removeAuthors(G):
    """Remove all author nodes from the graph."""
    for key in [k for k in list(G.nodes) if G.nodes[k]['t'] == 'author']:
        G.remove_node(key)
2022-09-11 18:56:47 +02:00
def removeSeries(G):
    """Remove all series nodes from the graph."""
    for key in [k for k in list(G.nodes) if G.nodes[k]['t'] == 'series']:
        G.remove_node(key)
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
2021-06-14 22:20:36 +02:00
def removeRestOfSeries(G):
for n in list(G.nodes):
node = G.nodes[n]
if node['t'] == 'series':
seriesState = 0
for adj in G.adj[n]:
adjNode = G.nodes[adj]
if adjNode['rating'] != None:
seriesState = max(seriesState, int(
adjNode['series_index']))
for adj in list(G.adj[n]):
adjNode = G.nodes[adj]
if adjNode['series_index'] > seriesState + 1.0001:
G.remove_node(adj)
2022-09-11 18:56:47 +02:00
def removeUnusedRecommenders(G):
    """Remove recommenders that are not adjacent to any book carrying a 'score' key."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        hasScoredBook = any(
            G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj]
            for adj in G.adj[key])
        if not hasScoredBook:
            G.remove_node(key)
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
def removeUselessReadBooks(G):
    """Remove rated books that barely influence any scored book.

    For every rated book each neighbour counts as one contact. For every
    neighbour, each of its adjacent books that carries a 'score' key (or
    node of type 'newBook') adds force: 0.5 via a recommender, 1 otherwise.
    The rated book is removed when force < 1.5 or contacts < 2.
    """
    minForce = 1.5
    minContact = 2
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] != 'book' or data['rating'] is None:
            continue
        force = 0
        contacts = 0
        for adj in G.adj[key]:
            contacts += 1
            step = 0.5 if G.nodes[adj]['t'] == 'recommender' else 1
            for cousin in G.adj[adj]:
                cousinNode = G.nodes[cousin]
                isScoredBook = cousinNode['t'] == 'book' and 'score' in cousinNode
                if isScoredBook or cousinNode['t'] == 'newBook':
                    force += step
        if force < minForce or contacts < minContact:
            G.remove_node(key)
2022-09-11 18:56:47 +02:00
2021-09-24 18:25:37 +02:00
def removeUselessTags(G, minUnread=1):
    """Remove tags adjacent to fewer than ``minUnread`` books carrying a 'score' key."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        foundUnread = sum(
            1 for adj in G.adj[key]
            if G.nodes[adj]['t'] == 'book' and 'score' in G.nodes[adj])
        if foundUnread < minUnread:
            G.remove_node(key)
2022-09-11 18:56:47 +02:00
2022-11-30 17:44:12 +01:00
def curiosityReward(G, coeff=1, dTan=False):
    """Raise the score of nodes proportionally to their standard error.

    Applies to every node that has both 'score' and 'se'. The bonus is
    ``se * coeff`` scaled by ``1 - tanh((score/10 - 0.5) * 7)**2`` when
    ``dTan`` is set, otherwise by ``1 - tanh(score / 5)``. The resulting
    score is clamped to the range [0, 10].
    """
    for key in list(G.nodes):
        data = G.nodes[key]
        if 'score' not in data or 'se' not in data:
            continue
        score = data['score']
        bonus = data['se'] * coeff
        if dTan:
            bonus *= (1 - math.tanh((score/10-0.5)*7)**2)
        else:
            bonus *= (1 - math.tanh(score/5))
        data['score'] = max(0.0, min(10.0, score + bonus))
def removeUselessSeries(G, minSco=0):
    """Remove series nodes with fewer than two neighbours or a score below ``minSco``.

    A series whose score is missing or None is treated as below the
    threshold and removed (the original comparison raised on such nodes).
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'series':
            continue
        score = node.get('score')
        if len(G.adj[n]) < 2 or score is None or score < minSco:
            G.remove_node(n)
2021-09-24 16:13:55 +02:00
2022-09-11 18:56:47 +02:00
2021-09-24 18:25:37 +02:00
def scoreOpinions(G, globMu, globStd):
    """Score every non-book node from the ratings of its adjacent books.

    For a node with at least one rated neighbour this sets 'mean' and 'std'
    (normal fit of the ratings), 'se' (``globStd / sqrt(#ratings)``),
    'feedbacks' (the ratings) and 'score' (``mean + globStd/3 - se``).
    Nodes without rated neighbours get ``score = None``.

    ``globMu`` is accepted for signature symmetry with scoreUnread but is
    not used here.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] in ['book']:
            continue
        feedbacks = [G.nodes[adj]['rating'] for adj in list(G.adj[n].keys())
                     if G.nodes[adj]['rating'] is not None]
        if not feedbacks:
            node['score'] = None
            continue
        node['mean'], node['std'] = norm.fit(feedbacks)
        node['se'] = globStd / math.sqrt(len(feedbacks))
        node['score'] = node['mean'] + globStd/3 - node['se']
        node['feedbacks'] = feedbacks
2022-09-11 18:56:47 +02:00
2021-09-24 18:25:37 +02:00
def scoreUnread(G, globMu, globStd):
    """Compute a predicted score for every unrated book.

    The ratings ('feedbacks') of all scored neighbours are pooled, together
    with the global mean, to derive 'mean', 'std', 'median' and 'se'. The
    final 'score' is a weighted average over the pooled feedbacks plus the
    book's pagerank score, the fitted std, the median and the global mean
    again as a bias term; the weights come from getWeightForType. The raw
    inputs are stored as '_act' (values) and '_wgh' (weight descriptors).

    Books with ``series_index == 1.0`` get a tiny bonus so they sort ahead
    of otherwise equal scores.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is not None:
            continue
        # The pool always contains the global mean, so it is never empty
        # (the original carried an unreachable fallback for the empty case).
        feedbacks = [globMu]
        ws = [['mu']]
        for adj in list(G.adj[n].keys()):
            adjNode = G.nodes[adj]
            if 'score' in adjNode and adjNode['score'] is not None:
                edge = G[n][adj]
                w = [adjNode['t'], edge['weight'] if 'weight' in edge else 1]
                for fb in adjNode['feedbacks']:
                    feedbacks.append(fb)
                    ws.append(w)
        node['mean'], node['std'] = norm.fit(feedbacks)
        node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
        node['se'] = globStd / math.sqrt(len(feedbacks))
        # Extra pseudo-feedbacks; each gets its own weight type.
        # (tgb_rank and se inputs are currently disabled.)
        feedbacks.append(node['pagerank_score'])
        ws.append(['pagerank'])
        feedbacks.append(node['std'])
        ws.append(['sigma'])
        feedbacks.append(node['median'])
        ws.append(['median'])
        feedbacks.append(globMu)
        ws.append(['bias'])
        factors = [getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws]
        node['score'] = sum([fb*fac for fb, fac in zip(feedbacks, factors)])/sum(factors)
        node['_act'] = feedbacks
        node['_wgh'] = ws
        if 'series' in node:
            if node['series_index'] == 1.0:
                node['score'] += 0.000000001
2022-09-11 18:56:47 +02:00
def getWeightForType(nodeType, edgeWeight=1):
    """Look up the weight for ``nodeType`` in the module-level ``weights`` mapping.

    Only for 'topList' the result is additionally multiplied by ``edgeWeight``.
    """
    base = weights[nodeType]
    return edgeWeight*base if nodeType == 'topList' else base
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
def printBestList(G, t='book', num=-1):
    """Print the scored nodes of type ``t`` as a numbered list, best first.

    Ties in score are broken by the larger 'se'. At most ``num`` entries are
    printed (all when ``num`` is -1). Books are shown as
    ``title (authors): score``, other nodes by their label.
    """
    ranked = []
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] == t and 'score' in data and data['score'] is not None:
            ranked.append(data)

    def rank(entry):
        return entry['score'] + 0.00001 * (entry['se'] if 'se' in entry else 0)

    ranked.sort(key=rank, reverse=True)
    for pos, entry in enumerate(ranked):
        if t == 'book':
            line = entry['title'] + " ("+" & ".join(entry['authors'])+")" + \
                ": {:.5f}".format(entry['score'])
        else:
            line = entry['label']
        # pad the position to the number of digits of num (4 when unlimited)
        digits = int((math.log10(num) if num != -1 else 3)+1)
        print("["+str(pos+1).zfill(digits)+"] "+line)
        if num != -1 and pos == num-1:
            break
def readColor(book):
    """Return the edge colour for a book: 'green' if it has a 'rating' key, else 'gray'."""
    return 'green' if 'rating' in book else 'gray'
2022-09-11 18:56:47 +02:00
2021-06-14 22:20:36 +02:00
def loadBooksFromDB():
    """Load all books via calibreDB and enrich their tags with MRB recommenders."""
    library = calibreDB.getBooks()
    infuseDataFromMRB(library)
    # infuseDataFromTGB(library)
    return library
2022-09-11 18:56:47 +02:00
def mrbGetBook(mrbdf, title, authors):
    """Find the row of ``mrbdf`` that matches a book by title and author.

    Args:
        mrbdf: DataFrame with at least 'title' and 'author' columns.
        title: Book title; anything from the first '(' on and all '*' are
            dropped before matching.
        authors: List of author names, or a single ' & '-joined string.

    Returns:
        The first matching row as a dict, or False when nothing matches.

    A row matches when its title contains the cleaned title (literal
    substring, not a regular expression) and its author field contains both
    the first and the last word of at least one of the given authors.
    """
    if isinstance(authors, str):
        # callers may hand over the raw "A & B" string; iterating it directly
        # would compare single characters instead of names
        authors = authors.split(' & ')
    title = title.split('(')[0]
    title = title.replace('*', '')
    # regex=False: titles such as "C++" are not valid patterns;
    # na=False: rows without a title must not poison the boolean mask
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        rowAuthor = d['author']
        if not isinstance(rowAuthor, str):
            continue
        for author in authors:
            parts = author.split(" ")
            if all(rowAuthor.find(part) != -1 for part in (parts[0], parts[-1])):
                return d
    return False
2022-09-11 18:56:47 +02:00
def tgbGetBook(df, title, authors):
    """Find the row of ``df`` that matches a book by title and author.

    Args:
        df: DataFrame with at least 'title' and 'author' columns.
        title: Book title; anything from the first '(' on and all '*' are
            dropped before matching.
        authors: List of author names, or a single ' & '-joined string.

    Returns:
        The first matching row as a dict, or False when nothing matches.

    A row matches when its title contains the cleaned title (literal
    substring, not a regular expression) and its author field contains both
    the first and the last word of at least one of the given authors.
    """
    if isinstance(authors, str):
        # callers may hand over the raw "A & B" string; iterating it directly
        # would compare single characters instead of names
        authors = authors.split(' & ')
    title = title.split('(')[0]
    title = title.replace('*', '')
    # regex=False: titles such as "C++" are not valid patterns;
    # na=False: rows without a title must not poison the boolean mask
    pot = df[df['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        rowAuthor = d['author']
        if not isinstance(rowAuthor, str):
            continue
        for author in authors:
            parts = author.split(" ")
            if all(rowAuthor.find(part) != -1 for part in (parts[0], parts[-1])):
                return d
    return False
2022-09-11 18:56:47 +02:00
def infuseDataFromMRB(books):
    """Add '<recommender>:MRB' tags to books found in rec_dbs/mrb_db.csv.

    Each book is looked up by title and author; for a match, every entry of
    the '|'-separated 'recommender' field is appended to the book's tags.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    for book in books:
        # Pass the split author names: book['authors'] is one ' & '-joined
        # string, and iterating it directly would match single characters.
        mrb = mrbGetBook(mrbdf, book['title'], getAuthors(book))
        if mrb:
            for rec in str(mrb['recommender']).split('|'):
                book['tags'] += [rec + ':MRB']
2022-09-11 18:56:47 +02:00
def infuseDataFromTGB(books):
    """Set 'tgb_rank' on books found in rec_dbs/tgb_1.csv or rec_dbs/tgb_2.csv.

    Each book is looked up by title and author; for a match the row's 'id'
    is stored as integer rank. A match in the second file overwrites one
    from the first.
    """
    for i in range(1, 3):
        df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv')
        for book in books:
            # Pass the split author names: book['authors'] is one ' & '-joined
            # string, and iterating it directly would match single characters.
            tgb = tgbGetBook(df, book['title'], getAuthors(book))
            if tgb:
                book['tgb_rank'] = int(tgb['id'])
2022-09-11 18:56:47 +02:00
2022-03-19 11:35:30 +01:00
class calibreDB():
    """Helpers that talk to the Calibre library through the ``calibredb`` CLI."""

    @classmethod
    def _getTxt(cls, request):
        """Run ``calibredb <request>`` and return its stdout.

        Raises Error when the command produced no output.
        """
        output = os.popen("calibredb "+request).read()
        if output:
            return output
        raise Error(
            'Unable to connect to CalibreDB. Please close all open instances of Calibre.')

    @classmethod
    def _getJson(cls, request):
        """Run ``calibredb <request>`` and parse its output as JSON."""
        raw = cls._getTxt(request)
        return json.loads(raw)

    @classmethod
    def getBooks(cls):
        """Return the parsed output of ``calibredb list --for-machine -f all``."""
        return cls._getJson('list --for-machine -f all')

    @classmethod
    def getCustomColumns(cls):
        """Return the first space-separated word of every ``custom_columns`` output line."""
        listing = cls._getTxt('custom_columns')
        return [entry.split(' ')[0] for entry in listing.split('\n')]

    @classmethod
    def _requireCaliceColumn(cls):
        """Return [hasScoreColumn, hasRatingColumn]; raise Error if both are missing."""
        existing = cls.getCustomColumns()
        avai = [name in existing for name in ('calice_score', 'calice_rating')]
        if True not in avai:
            raise Error(
                'Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.')
        return avai

    @classmethod
    def createCaliceRatingColumn(cls):
        """Create the 'calice_rating' custom column; raise Error if it exists."""
        if 'calice_rating' in cls.getCustomColumns():
            raise Error('Custom Column already exists.')
        cls._getTxt("add_custom_column calice_rating 'Calice Rating' rating")

    @classmethod
    def createCaliceScoreColumn(cls):
        """Create the 'calice_score' custom column; raise Error if it exists."""
        if 'calice_score' in cls.getCustomColumns():
            raise Error('Custom Column already exists.')
        cls._getTxt("add_custom_column calice_score 'Calice Score' float")

    @classmethod
    def writeCaliceColumn(cls, bookId, score):
        """Write the score of a single book (see writeCaliceColumnMultiple)."""
        cls.writeCaliceColumnMultiple({bookId: score})

    @classmethod
    def writeCaliceColumnMultiple(cls, scores):
        """Write scores into the Calice custom columns.

        ``scores`` maps book id -> score. The special value '<clear>' empties
        the book's calice_score; any other value is written to calice_score
        (rounded to 5 digits) and/or calice_rating (rounded to an integer),
        depending on which of the columns exist.
        """
        from tqdm.auto import tqdm
        hasScore, hasRating = cls._requireCaliceColumn()
        for bookId in tqdm(scores):
            value = scores[bookId]
            if value == '<clear>':
                cls._getTxt(f'set_custom calice_score {bookId} ""')
                continue
            if hasScore:
                cls._getTxt(f'set_custom calice_score {bookId} {round(value, 5)}')
            if hasRating:
                cls._getTxt(f'set_custom calice_rating {bookId} {int(round(value))}')
2022-03-19 11:35:30 +01:00
def calice(G):
    """Write the scores of all book nodes back to Calibre.

    Books with a non-None score are written with that score; other books
    that have a 'rating' key get their stored score cleared.
    """
    scores = {}
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] != 'book':
            continue
        if 'score' in data and data['score'] is not None:
            scores[data['calibreID']] = data['score']
        elif 'rating' in data:
            scores[data['calibreID']] = '<clear>'
    calibreDB.writeCaliceColumnMultiple(scores)
    print('Done.')
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span (non-greedy) removed."""
    return re.sub('<.*?>', '', text)
2022-09-11 18:56:47 +02:00
def getKeywords(txt, rake):
    """Extract the strongest keyword phrases from a text.

    Args:
        txt: Text to analyse; HTML tags are stripped first.
        rake: Object offering ``extract_keywords_from_text`` and
            ``get_ranked_phrases_with_scores``.

    Returns:
        List of ``(score, phrase)`` tuples, best first, containing only
        phrases whose adjusted score reaches two thirds of the best one.
        Phrases are always title-cased (the original returned raw phrases
        when no phrase fell below the threshold).
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)
    blocked = ['p', 'die', 'best', 'known', 'fk', 'p pp', 'one']
    k = []
    for score, kw in rake.get_ranked_phrases_with_scores():
        lowered = kw.lower()
        # skip HTML residue and very short / meaningless phrases
        if lowered in blocked or len(kw) <= 3 or 'div' in kw or 'p p' in lowered:
            continue
        l = len(kw.split(' '))
        # dampen the advantage of long phrases
        k.append((score**(1/(l*0.4)), kw))
    k.sort(key=lambda x: x[0], reverse=True)
    if not k:
        return []
    minSco = k[0][0]/3*2
    cut = len(k)
    for i, (sco, _) in enumerate(k):
        if sco < minSco:
            cut = i
            break
    return [(sco, word.title()) for sco, word in k[:cut]]
2022-09-11 18:56:47 +02:00
2022-01-31 13:45:26 +01:00
def runPagerank(G):
    """Store the pagerank of every node as 'pagerank_score'.

    When the power iteration does not converge a warning is printed and all
    nodes get a pagerank score of 0.
    """
    try:
        ranks = nx.pagerank(G=G)
    except nx.exception.PowerIterationFailedConvergence:
        print('[!] Could not calculate pagerank-scores: Power iteration of the eigenvector calculation did not converge')
        print('[ ] Recommendations will be of slighly lower quality')
        ranks = {}
    for key in list(G.nodes):
        G.nodes[key]['pagerank_score'] = ranks.get(key, 0)
2022-09-11 18:56:47 +02:00
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    """Create a graph with one node per book.

    Args:
        books: Book dicts; 'id', 'title', 'tags', 'cover', 'isbn', 'formats'
            and 'authors' are required, 'rating', 'comments' and
            'series'/'series_index' are optional.
        darkMode: Not used by this function.
        extractKeywords: Extract keywords from the book's comments via RAKE.
        mergeTags: Append the extracted keywords to the node's tags.

    Returns:
        The new graph.
    """
    G = nx.Graph()
    if extractKeywords:
        from rake_nltk.rake import Rake
        rake = Rake()
    for book in books:
        hasComments = 'comments' in book
        if hasComments and extractKeywords:
            sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+',
                               '', book['comments']).replace('\n', ' ')
            keywords = getKeywords(sanitized, rake)
        else:
            keywords = []
        tags = book['tags']
        if mergeTags:
            tags = tags + [word for (score, word) in keywords]
        hasSeries = 'series' in book
        G.add_node(
            book['id'],
            t='book',
            label=book['title'],
            title=book['title'],
            shape='image',
            image=book['cover'],
            rating=book['rating'] if 'rating' in book else None,
            tags=tags,
            keywords=keywords,
            desc=book['comments'] if hasComments else '',
            isbn=book['isbn'],
            files=book['formats'],
            authors=getAuthors(book),
            series=book['series'] if hasSeries else None,
            series_index=book['series_index'] if hasSeries else None,
            calibreID=book['id'])
    return G
2022-09-11 18:56:47 +02:00
def getWikiImage(search_term):
    """Return the URL of the lead image of the best Wikipedia match, or None.

    The first search hit is accepted only when its title has a fuzzy ratio
    of at least 50 with ``search_term``. Any failure (no hit, poor match,
    network error, page without image) is reported on stdout and yields None.
    """
    from urllib.parse import quote
    from fuzzywuzzy import fuzz
    WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        result = wikipedia.search(search_term, results=1)
        # compare against the hit's title, not against the result list
        if not result or fuzz.ratio(search_term, result[0]) < 50:
            raise LookupError('no sufficiently similar Wikipedia page')
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=result[0])
        title = wkpage.title
        # quote the title so characters like '&' cannot break the query
        # string; the timeout keeps a stalled request from hanging the run
        response = requests.get(WIKI_REQUEST+quote(title), timeout=10)
        json_data = json.loads(response.text)
        img_link = list(json_data['query']['pages'].values())[
            0]['original']['source']
        return img_link
    except Exception:
        # best effort: a missing image is not an error for the caller, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
2021-10-05 18:25:27 +02:00
def graphAddAuthors(G, books, darkMode=False):
    """Add an 'a/<name>' node per author and connect it to the author's books.

    ``darkMode`` is not used by this function. Returns ``G``.
    """
    for name in getAllAuthors(books):
        G.add_node(f'a/{name}', color='green', t='author', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getAuthors(book):
            G.add_edge(f'a/{name}', book['id'], color=edgeColor)
    return G
2022-09-11 18:56:47 +02:00
2021-10-05 18:25:27 +02:00
def graphAddRecommenders(G, books, darkMode=False):
    """Add an 'r/<name>' node per recommender and connect it to its books.

    ``darkMode`` is not used by this function. Returns ``G``.
    """
    for name in getAllRecommenders(books):
        G.add_node(f'r/{name}', color='orange', t='recommender', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getRecommenders(book):
            G.add_edge(f'r/{name}', book['id'], color=edgeColor)
    return G
2022-09-11 18:56:47 +02:00
2021-10-05 18:25:27 +02:00
def graphAddTopLists(G, books, darkMode=False):
    """Add a 't/<name>' node per top list and connect it to its books.

    Each edge carries the weight returned by getTopListWeight.
    ``darkMode`` is not used by this function. Returns ``G``.
    """
    for name in getAllTopLists(books):
        G.add_node(f't/{name}', color='yellow', t='topList', label=name)
    for book in books:
        for name in getTopLists(book):
            weight = getTopListWeight(book, name)
            G.add_edge(f't/{name}', book['id'], weight=weight,
                       color=readColor(book))
    return G
2021-10-05 18:25:27 +02:00
def graphAddSeries(G, books, darkMode=False):
    """Add an 's/<name>' node per series and connect it to the series' books.

    ``darkMode`` is not used by this function. Returns ``G``.
    """
    for name in getAllSeries(books):
        G.add_node(f's/{name}', color='red', t='series',
                   label=name, shape='triangle')
    for book in books:
        if 'series' not in book:
            continue
        G.add_edge(f"s/{book['series']}", book['id'], color=readColor(book))
    return G
2021-10-05 18:25:27 +02:00
def graphAddTags(G, books, darkMode=False):
    """Add a 't/<tag>' node per plain tag and connect it to the tagged books.

    Tag nodes are coloured 'darkgray' in dark mode, 'lightGray' otherwise.
    Returns ``G``.
    """
    tagColor = ['lightGray', 'darkgray'][darkMode]
    for tag in getAllTags(books):
        G.add_node(f't/{tag}', color=tagColor, t='tag', label=tag, shape='box')
    for book in books:
        edgeColor = readColor(book)
        for tag in getTags(book):
            G.add_edge(f't/{tag}', book['id'], color=edgeColor)
    return G
def calcRecDist(G, books):
    """Fit a normal distribution to all ratings and return ``(mean, std)``.

    The ratings are read from the graph nodes of the given books; books
    with a rating of None are ignored.
    """
    ratings = (G.nodes[book['id']]['rating'] for book in books)
    globRatings = [rating for rating in ratings if rating is not None]
    return norm.fit(globRatings)
def scaleBooksByRating(G):
    """Set the display size ('value') of every node from its rating or score.

    Despite the name this applies to all node types (the original type
    filter was an always-true ``not in []``): rated nodes get
    ``20 + 5 * int(rating)``, scored nodes ``20 + int(5 * score)`` and all
    others 15.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node.get('rating') is not None:
            node['value'] = 20 + 5 * int(node['rating'])
        elif node.get('score') is not None:
            node['value'] = 20 + int(5 * float(node['score']))
        else:
            node['value'] = 15
def scaleOpinionsByRating(G):
    """Set the display size ('value') of top-list, recommender, author and series nodes.

    Scored nodes get ``20 + 5 * int(score)``, unscored ones 20.
    """
    opinionTypes = ['topList', 'recommender', 'author', 'series']
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] not in opinionTypes:
            continue
        hasScore = 'score' in data and data['score'] is not None
        data['value'] = 20 + 5 * int(data['score']) if hasScore else 20
def addScoreToLabels(G):
    """Append the rating or the predicted score to the label of each node.

    Tag and 'newBook' nodes are skipped. Rated nodes get `` (rating)``,
    nodes with a score and standard error `` (score±se)`` and all others
    `` (0±∞)``.
    """
    # Labelling tags with their score is switched off; the code is kept so
    # it can be re-enabled by flipping this flag.
    labelTags = False
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] not in ['tag', 'newBook']:
            if 'rating' in data and data['rating'] is not None:
                suffix = " ("+str(data['rating'])+")"
            elif 'score' in data and data['score'] is not None and 'se' in data:
                suffix = " ({:.2f}±{:.1f})".format(data['score'], data['se'])
            else:
                suffix = " (0±∞)"
            data['label'] += suffix
        if labelTags and data['t'] in ['tag']:
            if 'score' in data and data['score'] is not None and 'se' in data:
                data['label'] += " ({:.1f})".format(data['score'])
            else:
                data['label'] += " (0)"
2021-06-14 22:20:36 +02:00
2021-10-05 18:25:27 +02:00
def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
    """Render the graph with pyvis into 'nx.html' and show it.

    Args:
        G: Graph to render.
        showButtons: Also show the pyvis configuration panels.
        darkMode: Use a dark background.
        arrows: Draw the edges as directed.
    """
    background = ['#FFFFFF', '#181818'][darkMode]
    net = Network('1050px', '1900px', directed=arrows, bgcolor=background)
    if showButtons:
        panels = ['configure', 'layout', 'interaction', 'physics', 'edges']
        net.show_buttons(filter_=panels)
    net.from_nx(G)
    net.show('nx.html')
def genAndShow3D(G, darkMode=False):
    """Render the graph as an interactive 3D scatter plot with plotly.

    Node positions come from a 3D spring layout with a random seed. Tags
    are gray, books carrying a 'score' key lightblue, other books magenta;
    remaining nodes use their 'color' attribute (black if absent). The
    marker size is derived from the node's 'value'.
    """
    node_sizes, node_labels, node_cols = [], [], []
    for key in G.nodes:
        data = G.nodes[key]
        kind = data['t']
        if kind == 'tag':
            colour = 'gray'
        elif kind == 'book':
            # books with a score are unread
            colour = 'lightblue' if 'score' in data else 'magenta'
        elif 'color' in data:
            colour = data['color']
        else:
            colour = 'black'
        node_cols.append(colour)
        node_labels.append(data['label'])
        node_sizes.append((data['value']/8)**1.5)

    spring = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536))
    x_nodes = [spring[p][0] for p in spring]
    y_nodes = [spring[p][1] for p in spring]
    z_nodes = [spring[p][2] for p in spring]

    # Each edge contributes start, end and a None separator per axis so that
    # plotly draws the edges as separate line segments.
    x_edges, y_edges, z_edges = [], [], []
    for edge in G.edges():
        start, end = spring[edge[0]], spring[edge[1]]
        x_edges.extend([start[0], end[0], None])
        y_edges.extend([start[1], end[1], None])
        z_edges.extend([start[2], end[2], None])

    trace_edges = go.Scatter3d(
        x=x_edges,
        y=y_edges,
        z=z_edges,
        mode='lines',
        line=dict(color='black', width=2),
        hoverinfo='none')

    marker = dict(
        symbol='circle',
        size=node_sizes,
        color=node_cols,
        line=dict(color='gray', width=0.5))
    trace_nodes = go.Scatter3d(
        x=x_nodes,
        y=y_nodes,
        z=z_nodes,
        mode='markers',
        marker=marker,
        text=node_labels,
        hoverinfo='text')

    axis = dict(
        showbackground=False,
        showline=False,
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='')
    background = ['#FFFFFF', '#181818'][darkMode]
    layout = go.Layout(
        title="",
        width=1920,
        height=1080,
        plot_bgcolor=background,
        paper_bgcolor=background,
        showlegend=False,
        scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
        margin=dict(l=0, r=0, b=0, t=0),
        hovermode='closest')

    fig = go.Figure(data=[trace_edges, trace_nodes], layout=layout)
    fig.show()
2022-09-11 18:56:47 +02:00
2021-10-05 18:25:27 +02:00
def buildFullGraph(darkMode=False):
    """Load all books and build the complete graph around them.

    Adds, in this order, author, recommender, top-list, series and tag
    nodes to the book graph. Returns ``(G, books)``.
    """
    books = loadBooksFromDB()
    G = buildBookGraph(books, darkMode=darkMode)
    layers = (graphAddAuthors, graphAddRecommenders, graphAddTopLists,
              graphAddSeries, graphAddTags)
    for addLayer in layers:
        addLayer(G, books, darkMode=darkMode)
    return G, books
def genScores(G, books, calcPagerank=True):
    """Run the scoring pipeline and return the global rating ``(mean, std)``.

    Optionally computes pagerank first, then scores the non-book nodes and
    finally the unread books.
    """
    mu, std = calcRecDist(G, books)
    if calcPagerank:
        runPagerank(G)
    for scoringStep in (scoreOpinions, scoreUnread):
        scoringStep(G, mu, std)
    return mu, std
2022-09-11 18:56:47 +02:00
def addImageToNode(node, cache, shape='circularImage'):
    """Attach a Wikipedia image to the node, using ``cache`` for lookups.

    The lookup name is the node label up to the first ' (' with '*'
    removed. Unknown names are looked up via getWikiImage; names cached as
    False (no image found) are retried with a probability of 5%. When an
    image is available, the node's 'image' and 'shape' are set.
    """
    name = node['label'].split(' (')[0].replace('*', '')
    retryMiss = name in cache and cache[name] == False and random.random() < 0.05
    if name not in cache or retryMiss:
        img = getWikiImage(name)
        cache[name] = img if img else False
    else:
        img = cache[name]
    if not img:
        return
    node['image'] = img
    node['shape'] = shape
def addImagesToNodes(G):
    """Add Wikipedia images to all recommender and author nodes.

    Image links are cached in '.imgLinkCache.json'; the cache is loaded if
    the file can be opened and written back afterwards. Recommenders are
    shown as 'circularImage', authors as 'image'.
    """
    cachePath = '.imgLinkCache.json'
    try:
        with open(cachePath, 'r') as cf:
            cache = json.loads(cf.read())
    except IOError:
        cache = {}
    for key in list(G.nodes):
        data = G.nodes[key]
        if data['t'] not in ['recommender', 'author']:
            continue
        shape = 'image' if data['t'] == 'author' else 'circularImage'
        addImageToNode(data, cache, shape)
    with open(cachePath, 'w') as cf:
        cf.write(json.dumps(cache))
2021-06-14 22:20:36 +02:00
2022-09-11 18:56:47 +02:00
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
    """Reduce the graph to about ``n`` recommended books, favouring recommenders.

    Args:
        G: Fully scored graph; modified in place.
        mu, std: Global rating mean and standard deviation.
        n: Number of books to keep.
        removeTopListsB: Drop top-list nodes.
        removeUselessRecommenders: Drop recommenders without scored books.

    The filter steps depend on each other; their order must be preserved.
    """
    # coarse pre-selection
    removeRestOfSeries(G)
    removeBad(G, mu-std*2-1)
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 6)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 10)

    # keep only above-average books and thin out their context
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 6)
    pruneRecommenderCons(G, int(n/7)+1)
    pruneAuthorCons(G, int(n/15))
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)

    # narrow down to the final selection
    removeKeepBest(G, n+math.ceil(n/20), maxDistForRead=1.5)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    removeUselessReadBooks(G)
    if removeUselessRecommenders:
        removeUnusedRecommenders(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # presentation
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True):
    """Prune G in place for the tag-based recommendation view.

    Same overall shape as the other recommendation pipelines, but with looser
    tag limits and with all recommenders removed via removeRecommenders(G).
    The order of the calls is significant.

    Args:
        G: the book graph; modified in place.
        mu: base score threshold (mainCLI passes the first value returned by
            genScores).
        std: spread used to loosen the first threshold (second value returned
            by genScores).
        n: number of books passed to the final removeKeepBest call.
        removeTopListsB: when true, removeTopLists(G) is applied.
    """
    removeRestOfSeries(G)
    removeBad(G, mu-std*2-1)
    # Keep a generous candidate set first; it is narrowed to n further down.
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 12)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 24)
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 16)
    pruneAuthorCons(G, int(n/5))
    removeRecommenders(G)
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n+math.ceil(n/20), maxDistForRead=1.5)
    removeUselessReadBooks(G)
    removeUselessTags(G)
    removeKeepBest(G, n, maxDistForRead=1.25)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-09-11 18:56:47 +02:00
2022-02-11 18:12:49 +01:00
def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False):
    """Prune G in place for the default (mixed) recommendation view.

    Applies a fixed sequence of this file's pruning helpers and finally the
    scaling/labelling helpers. The order of the calls is significant.

    Args:
        G: the book graph; modified in place.
        mu: base score threshold (mainCLI passes the first value returned by
            genScores).
        std: spread used to shift the thresholds (second value returned by
            genScores).
        n: number of books passed to the final removeKeepBest call.
        removeTopListsB: when true, removeTopLists(G) is applied.
        removeUselessRecommenders: accepted for signature parity with
            recommendNBooksRecommenderBased; not read in this function.
        v3d: mainCLI passes the ``--v3d`` flag; it only influences the last
            removeThinRecs threshold.
    """
    removeRestOfSeries(G)
    removeBad(G, mu-std-0.5)
    removeBad(G, mu+std/2, groups=['recommender'])
    removeThinRecs(G, 3)
    # Keep a generous candidate set first; it is narrowed to n further down.
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 8)
    pruneTags(G, 7)
    removeHighSpanReadBooks(G, 14)
    removeDangling(G, alsoBooks=False)
    pruneRecommenders(G, 12)
    removeThinRecs(G, 3)
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneAuthorCons(G, int(n/5)+3)
    # The limit shrinks with n but never drops below 12 - 4 = 8 (may be a float).
    pruneRecommenders(G, 12 - min(4, n/20))
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    pruneTags(G, 6)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n+math.ceil(n/20)+3, maxDistForRead=1.5)
    removeEdge(G)
    removeKeepBest(G, n+1, maxDistForRead=1.25)
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    removeUselessReadBooks(G)
    # Bool arithmetic: the threshold is 3 when n > 20 and not v3d, otherwise 2.
    removeThinRecs(G, 2 + 1 * (n > 20 and not v3d))
    removeKeepBest(G, n, maxDistForRead=1.25)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2021-10-13 15:10:12 +02:00
def listScores(G, mu, std, n):
    """Reduce G in place to the n best books for the plain score listing.

    Args:
        G: the book graph; modified in place.
        mu: unused here; kept so the signature matches the other views.
        std: unused here; kept so the signature matches the other views.
        n: number of books passed to removeKeepBest.
    """
    removeRestOfSeries(G)
    removeKeepBest(G, n, maxDistForRead=10)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def fullGraph(G, removeTopListsB=True):
    """Prepare G in place for the 'full' view of the whole library.

    Only structural clean-up is applied (no score-based filtering); the order
    of the calls is significant.

    Args:
        G: the book graph; modified in place.
        removeTopListsB: when true, removeTopLists(G) is applied.
    """
    removeEdge(G)
    removeHighSpanTags(G, 7)
    removeDangling(G, alsoBooks=False)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-09-11 18:56:47 +02:00
def recommenderCompetence(G):
    """Reduce G in place to read books and their recommenders.

    After pruning and labelling, every recommender's score is made more
    pessimistic: nodes that carry an ``'se'`` entry have it subtracted from
    their score, all other recommenders get their score (0 if missing/falsy)
    halved.

    Args:
        G: the book graph; modified in place.
    """
    # removeRead(G)
    for prune in (removeUnread, removeTags, removeAuthors,
                  removeSeries, removeTopLists, removeEdge):
        prune(G)
    removeDangling(G, alsoBooks=True)
    for decorate in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        decorate(G)

    for key in list(G.nodes):
        rec = G.nodes[key]
        if rec['t'] != 'recommender':
            continue
        if 'se' in rec:
            rec['score'] -= rec['se'] * 1
            continue
        # No standard error available: treat a missing score as 0 and halve.
        if not rec['score']:
            rec['score'] = 0
        rec['score'] /= 2
2022-09-11 18:56:47 +02:00
def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True):
    """Prune G in place for the 'read' view (analysis of already read books).

    Args:
        G: the book graph; modified in place.
        minRating: threshold passed to removeBad.
        showAllTags: when false, removeEdge and removeHighSpanTags(G, 15) are
            applied to thin out the graph.
        removeUnconnected: forwarded to removeDangling as ``alsoBooks``.
        removeTopListsB: when true, removeTopLists(G) is applied.
    """
    removeUnread(G)
    removeBad(G, minRating)
    if not showAllTags:
        removeEdge(G)
        removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=removeUnconnected)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 8)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-09-11 18:56:47 +02:00
2022-02-15 19:35:03 +01:00
def progress(G, books, mu, minimum=3.5):
    """Print reading-progress statistics for the library.

    New books are first added to G via findNewBooks. Then all ``'book'`` and
    ``'newBook'`` nodes are counted: a node with a non-None ``'rating'``
    counts as read; otherwise it counts as "to read" when its ``'score'`` is
    at least ``minimum`` or its ``'std'`` entry equals 0.0.

    Args:
        G: the book graph; modified in place by findNewBooks.
        books: library books, forwarded to findNewBooks.
        mu: forwarded to findNewBooks.
        minimum: minimum score for an unread book to count as recommended.
    """
    findNewBooks(G, books, mu, -1, minRecSco=minimum)
    bookCount = 0
    libCount = 0
    readCount = 0
    toReadCount = 0
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ('book', 'newBook'):
            continue
        bookCount += 1
        if node['t'] == 'book':
            libCount += 1
        if node.get('rating') is not None:
            readCount += 1
        elif 'score' in node and (node['score'] >= minimum or node.get('std') == 0.0):
            toReadCount += 1
    relevant = toReadCount + readCount
    # Guard against an empty selection; the original divided by zero here.
    perc = round(readCount / relevant * 100, 2) if relevant else 0.0
    print('Books in library: '+str(libCount))
    print('Books in CaliGraph: '+str(bookCount))
    print('Read Books: '+str(readCount))
    print('Unread Books: '+str(bookCount-readCount))
    print('Recommended Books (score > '+str(round(minimum, 2))+'): '+str(toReadCount))
    print('Progress: '+str(perc)+'%')
2021-06-14 22:20:36 +02:00
2022-02-11 17:50:07 +01:00
def analyze(G, books, mu, type_name, name, dist=2.1):
    """Reduce G in place to the neighbourhood of one node selected by name.

    The node is looked up among nodes of type ``type_name`` (or all nodes for
    ``"any"``): an exact label match (with or without the ``"<t>/"`` prefix
    built from the first letter of ``type_name``) wins immediately, otherwise
    the label with the highest fuzzywuzzy ratio is taken. Starting from that
    node, waveFlow collects the nodes to keep; everything else is removed.

    Args:
        G: the book graph; modified in place.
        books: library books, forwarded to findNewBooks.
        mu: forwarded to findNewBooks.
        type_name: node type to search, or ``"any"``.
        name: label to search for.
        dist: expansion depth handed to waveFlow.

    Raises:
        Error: if no node could be matched at all.
    """
    from fuzzywuzzy import fuzz
    type_ident = type_name[0]
    full_name = type_ident + "/" + name
    bestRatio, match, n = 0, None, 0
    for ni in list(G.nodes):
        node = G.nodes[ni]
        if node['t'] == type_name or type_name == "any":
            if name == node['label'] or full_name == node['label']:
                match, n = node, ni
                break
            ratio = fuzz.ratio(node['label'], name)
            if ratio > bestRatio:
                bestRatio, match, n = ratio, node, ni

    if match is None:
        # Previously this fell through to match['label'] and died with a
        # TypeError; report it through the file's own Error type instead.
        raise Error("No node matching '{}' found.".format(name))
    if bestRatio < 70:
        print("Best Match: "+match['label'])

    findNewBooks(G, books, mu, num=-1, minRecSco=1)

    menge = set()
    waveFlow(G, match, n, dist, menge)
    for other in list(G.nodes):
        if other not in menge:
            G.remove_node(other)
    if dist >= 2:
        removeThinRecs(G, 2)
        removeHighSpanTags(G, 12, forceKeepLabels=[match['label']])
    if dist > 1:
        removeDangling(G, True)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)

    #match['value'] = 100
    if 'shape' not in match:
        match['shape'] = 'star'

    addScoreToLabels(G)
    # Mark the analysed node so it stands out in the rendered graph.
    match['label'] = "*"+match['label']+"*"
2022-09-11 18:56:47 +02:00
def waveFlow(G, node, n, dist, menge, firstEdge=False):
    """Recursively collect the ids of nodes reachable from ``n`` into ``menge``.

    Each step costs 1 of ``dist`` (passing through a tag costs an extra 0.1).
    At every node only a limited number of neighbours is followed: the
    best-scored ones, or, if that quota is not positive, the best-rated ones.
    Neighbours without a usable score or rating get ``'score'`` set to 0 as a
    side effect.

    Args:
        G: graph offering ``nodes`` (id -> attribute dict) and ``adj``
            (id -> neighbour ids).
        node: attribute dict of the current node.
        n: id of the current node.
        dist: remaining expansion budget; the recursion stops at <= 0.
        menge: set of collected node ids; mutated in place. An empty set
            marks the call as the start of the expansion.
        firstEdge: forced to True on the start call and then handed down
            unchanged to the recursive calls.
    """
    if dist <= 0:
        return
    dist -= 1
    if not menge:
        firstEdge = True
    if node['t'] == 'topList':
        # Top lists are never expanded; they are only recorded.
        if firstEdge:
            menge.add(n)
        return
    menge.add(n)
    if node['t'] == 'tag':
        if not firstEdge:
            return
        dist -= 0.1

    bestlist = []
    keeplist = []
    for m in list(G.adj[n]):
        book = G.nodes[m]
        if book['t'] == 'NOTHING':
            continue
        if book.get('score') is not None:
            bestlist.append(book)
        elif book.get('rating') is not None:
            keeplist.append(book)
        else:
            book['score'] = 0
            bestlist.append(book)
    bestlist.sort(key=lambda cand: cand['score'], reverse=True)

    toKeep = min(int(dist*10),
                 math.ceil(len(bestlist) * dist - len(keeplist)*0.5))
    if toKeep <= 0:
        keeplist.sort(key=lambda cand: cand['rating'], reverse=True)
        keeplist = keeplist[:min(int(dist*10), int(len(keeplist) * dist))]
        bestlist = []
    else:
        bestlist = bestlist[:toKeep]

    for m in list(G.adj[n]):
        neighbour = G.nodes[m]
        # Membership compares attribute dicts by equality, as before.
        if neighbour in bestlist or neighbour in keeplist:
            waveFlow(G, neighbour, m, dist, menge, firstEdge=firstEdge)
2022-09-11 18:56:47 +02:00
def gensimTokensForLines(lines, tokens_only=False):
    """Yield gensim-preprocessed tokens for every line.

    Args:
        lines: iterable of strings.
        tokens_only: when true, yield the bare token lists; otherwise wrap
            them in a ``TaggedDocument`` tagged with the line index (the form
            used for training data).

    Yields:
        A token list or a ``gensim.models.doc2vec.TaggedDocument`` per line.
    """
    # gensim is not imported at module level; the original body referenced
    # it (and an undefined ``tokens_only``) as globals and raised NameError.
    import gensim
    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
2022-09-11 18:56:47 +02:00
def buildDoc2Vec(books):
    """Unfinished stub for building a Doc2Vec model.

    NOTE(review): neither ``G`` nor ``lines`` is defined in this function and
    ``books`` is never read, so calling it depends on module-level names that
    are not visible here. The loop body is a no-op and the generator returned
    by gensimTokensForLines is discarded. Confirm the intent before use.
    """
    import gensim
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            pass
    gensimTokensForLines(lines)
2022-09-11 18:56:47 +02:00
def shell(G, books, mu, std):
    """Open an interactive ptpython REPL.

    The REPL receives this module's globals and this function's locals, so
    ``G``, ``books``, ``mu`` and ``std`` are available by name inside it.
    """
    from ptpython.repl import embed
    embed(globals(), locals())
2022-09-11 18:56:47 +02:00
def newBooks(G, books, num, mu, std):
    """Prune G in place for the 'newBooks' view.

    Adds books that are not yet in the library via findNewBooks and then
    strips the graph with the helpers below; the order of the calls is
    significant.

    Args:
        G: the book graph; modified in place.
        books: library books, forwarded to findNewBooks.
        num: number of new books to keep, forwarded to findNewBooks.
        mu: base score threshold (first value returned by genScores).
        std: spread used to loosen the thresholds (second value returned by
            genScores).
    """
    removeBad(G, mu-std*2)
    findNewBooks(G, books, mu, num, minRecSco=mu-std)
    removeThinRecs(G, 2)
    removeUnread(G)
    removeUselessReadBooks(G)
    removeTags(G)
    removeTopLists(G)
    removeSeries(G)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
2022-02-11 17:50:07 +01:00
def findNewBooks(G, books, mu, num=-1, minRecSco=5):
    """Add books that are recommended but not yet in the library to G.

    Reads ``rec_dbs/mrb_db.csv`` (columns ``recommender``, ``title`` and
    ``author`` are used). For every scored recommender node, the rows whose
    ``recommender`` field contains the node's label are split into books
    already in ``books`` and new ones. New books are inserted as ``'newBook'``
    nodes linked to their recommender and author, and receive a heuristic
    ``'score'`` / ``'fake_se'`` derived from the scores and ``'se'`` values of
    the adjacent recommenders and from ``mu``.

    Args:
        G: the book graph; modified in place.
        books: library books (dicts with a ``'title'`` entry).
        mu: prior score that is mixed into every new book's score.
        num: if not -1, only the ``num`` best new books are kept
            (via removeKeepBest).
        minRecSco: currently unused; kept for interface compatibility.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    # Built once; the original rebuilt a list of all titles for every row.
    knownTitles = {b['title'] for b in books}
    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            # Literal substring match: names are not regular expressions, and
            # rows without a recommender must simply not match.
            mask = mrbdf['recommender'].str.contains(
                node['label'], regex=False, na=False)
            oldBooks = []
            newBooks = []
            for book in mrbdf[mask].to_dict(orient='records'):
                entry = {'title': book['title'], 'author': book['author']}
                if book['title'] in knownTitles:
                    oldBooks.append(entry)
                else:
                    newBooks.append(entry)
            recs.append({'name': node['label'], 'rec': node,
                         'newBooks': newBooks, 'oldBooks': oldBooks})

    for rec in recs:
        recLabel = rec['rec']['label']
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook',
                       label=book['title'], author=book['author'])
            G.add_node('r/'+recLabel, color='orange', t='recommender',
                       label=recLabel, score=rec['rec']['score'])
            G.add_edge('r/'+recLabel, 'n/'+book['title'], color='blue')
            G.add_node('a/'+book['author'], color='green',
                       t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')

    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'newBook':
            continue
        ses = []
        scores = []
        for m in list(G.adj[n]):
            adj = G.nodes[m]
            if adj['t'] == 'recommender' and adj['score'] is not None:
                scores.append(adj['score'])
                # NOTE(review): assumes every scored recommender has 'se'.
                ses.append(adj['se'])
        if not scores:
            G.remove_node(n)
            continue
        # Mix in the prior: mu as an extra score, the smallest se once more.
        ses.append(min(ses))
        scores.append(mu)
        # This is not how SE works. DILLIGAF?
        node['fake_se'] = sum(ses)/(len(ses)**1.2) + \
            0.5 + 0.5 * (len(scores) == 2)
        node['score'] = sum(scores)/len(scores)*1.2 - \
            node['fake_se']*1.4 + 0.5 - 0.1/math.sqrt(len(scores))
        if len(scores) == 2:
            # Only a single recommender backs this book: damp its score.
            node['score'] *= 0.85
        node['value'] = 20 + 5 * float(node['score'])
        node['label'] += " ({:.2f}±{:.1f})".format(node['score'],
                                                   node['fake_se'])
        node['label'] += '\n ' + node['author']
    if num != -1:
        removeKeepBest(G, num, 10, 'newBook')
# While batchSize is implemented, we only get good convergence when we disable it (batchSize=-1),
# but it might be necessary to enable it later for a larger library for better training performance...
# Maybe try again for 128 books?
2022-09-11 18:56:47 +02:00
def evaluateFitness(books, batchSize=-1, debugPrint=False):
    """Leave-one-out evaluation of the current global ``weights``.

    Builds a fresh graph from ``books``; for every evaluated rated book the
    rating is hidden, the scores are regenerated and the squared error
    between prediction and rating is recorded (over-estimations count
    double). A numerical gradient per weight is estimated by scaling that
    weight by 1.001 in the book's ``'_act'`` / ``'_wgh'`` score terms.

    Args:
        books: library books used to build the graph.
        batchSize: number of rated books to evaluate; -1 (or a value not
            smaller than the number of rated books) evaluates all of them.
            The batch is drawn once per call.
        debugPrint: when true, print the mean error and the weighted
            regularisation loss.

    Returns:
        Tuple ``(fit, gradient)``: the mean squared error plus 0.001 times the
        regularisation loss, and a dict mapping weight name to its gradient.

    Raises:
        Error: if no rated book could be evaluated.
    """
    global weights
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)

    ratedBooks = [n for n in G.nodes
                  if G.nodes[n].get('rating') is not None]
    # Sample once and use a set: the original re-sampled for every node and
    # tested membership against a list.
    if batchSize != -1 and len(ratedBooks) > batchSize:
        batch = set(random.sample(ratedBooks, batchSize))
    else:
        batch = set(ratedBooks)

    errSq = []
    gradient = {wt: 0 for wt in weights}
    genScores(G, books)
    for b in G.nodes:
        if b not in batch:
            continue
        rating = G.nodes[b]['rating']
        # Hide the rating so the score is a genuine prediction.
        G.nodes[b]['rating'] = None
        genScores(G, books, calcPagerank=False)
        score = G.nodes[b]['score']
        if score > rating:  # over estimated
            errSq.append(((rating - score)**2)*2)
        else:
            errSq.append((rating - score)**2)
        G.nodes[b]['rating'] = rating
        acts = G.nodes[b]['_act']
        wghs = G.nodes[b]['_wgh']
        for wt in weights:
            # Score the book would get if weight wt were 0.1% larger.
            scoreB = sum(
                a*(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1)
                for a, w in zip(acts, wghs)
            )/sum(
                (1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1)
                for w in wghs
            )
            gradient[wt] += ((rating - score)**2 - (rating - scoreB)**2)*1000

    if not errSq:
        raise Error('Cannot evaluate fitness: no rated books available.')

    # no punishment if w within -1 and 1
    regressionLoss = sum(max(0, abs(w)-1)**2 for w in weights.values())
    for wt in weights:
        if abs(weights[wt]) > 1.0:
            gradient[wt] -= weights[wt]*10
        else:
            gradient[wt] -= weights[wt]*1
    for g in gradient:
        gradient[g] /= len(errSq)
    meanErr = sum(errSq)/len(errSq)
    if debugPrint:
        print(meanErr, 0.001*regressionLoss)
    fit = meanErr + 0.001*regressionLoss
    return fit, gradient
2021-09-24 16:13:55 +02:00
2022-09-11 18:56:47 +02:00
def calcDissonance(books):
    """Build the book graph and annotate rated books with their dissonance.

    For every rated node the rating is hidden, the scores are regenerated and
    the prediction is compared with the real rating. The node then carries
    ``'_test_score'`` (the prediction), ``'dissonance_off'`` (rating minus
    prediction) and ``'dissonance_abs'`` (its absolute value).

    Args:
        books: library books used to build the graph.

    Returns:
        The annotated graph.
    """
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)
    # A set keeps the membership test below O(1) per node.
    ratedBooks = {n for n in G.nodes
                  if G.nodes[n].get('rating') is not None}
    genScores(G, books)
    for b in G.nodes:
        if b not in ratedBooks:
            continue
        rating = G.nodes[b]['rating']
        # Hide the rating so the score is a genuine prediction.
        G.nodes[b]['rating'] = None
        genScores(G, books, calcPagerank=False)
        G.nodes[b]['_test_score'] = G.nodes[b]['score']
        G.nodes[b]['rating'] = rating
        G.nodes[b]['dissonance_off'] = rating - G.nodes[b]['score']
        G.nodes[b]['dissonance_abs'] = abs(rating - G.nodes[b]['score'])
    return G
def describeDissonance(books, num=-1, sortKey='dissonance_abs', sortDir=True):
    """Print rated books ordered by how far the prediction is off.

    Args:
        books: library books, forwarded to calcDissonance.
        num: maximum number of lines to print; any negative value prints all
            books, 0 prints nothing.
        sortKey: node attribute to sort by.
        sortDir: passed as ``reverse`` to the sort (True = largest first).
    """
    G = calcDissonance(books)
    bestlist = [G.nodes[n] for n in list(G.nodes)
                if 'dissonance_abs' in G.nodes[n]]
    bestlist.sort(key=lambda node: node[sortKey], reverse=sortDir)
    if num >= 0:
        bestlist = bestlist[:num]
        # Same width as int(log10(num) + 1), but also defined for num == 0.
        width = len(str(num))
    else:
        width = 4
    for i, book in enumerate(bestlist):
        # NOTE(review): assumes book['authors'] is a list of names here.
        line = book['title'] + " ("+" & ".join(book['authors'])+")" + \
            ": You: {:.5f}, AI: {:.5f}, Delta: {:.5f}".format(
                book['rating'], book['_test_score'], book['dissonance_off'])
        print("["+str(i+1).zfill(width)+"] "+line)
2021-11-23 20:51:24 +01:00
def train(initGamma, full=True):
    """Tune the global ``weights`` by normalised gradient steps on evaluateFitness.

    The step size ``gamma`` grows by 25% after an improving step and is
    quartered otherwise. Whenever a new best fitness is reached the weights
    are saved via saveWeights. After four consecutive worsening steps (or an
    mse above 50) the search either restarts from random or slightly jittered
    best weights, or stops.

    Args:
        initGamma: initial step size, also used after every restart.
        full: when true, start from random weights, use a stricter initial
            goal and always restart on stagnation instead of stopping.
    """
    global weights
    if full:
        for wt in weights:
            weights[wt] = random.random()
    saveWeights(weights)
    gamma = initGamma
    books = loadBooksFromDB()
    bestWeights = copy.copy(weights)
    mse, gradient = evaluateFitness(books)
    delta = sum(gradient[g]**2 for g in gradient)
    best_mse = mse
    stagLen = 0
    goal = 1.0e-4
    if full:
        goal = 1.0e-5

    # `and` binds tighter than `or`: keep going while both gamma and delta are
    # above the goal, or unconditionally while the best mse is still above 15.
    while gamma > goal and delta > goal or best_mse > 15:
        # The goal grows by 10% per iteration, so the first condition
        # eventually fails.
        goal *= 1.1
        last_mse = mse
        print({'mse': mse, 'gamma': gamma, 'delta': delta})
        delta = sum(gradient[g]**2 for g in gradient)
        for wt in weights:
            # NOTE(review): divides by zero if the gradient is exactly zero.
            weights[wt] += gamma*0.1*gradient[wt]/math.sqrt(delta)
        mse, gradient = evaluateFitness(books)
        if mse < last_mse:
            gamma = gamma*1.25
        else:
            gamma *= 0.25
        if mse < best_mse:
            saveWeights(weights)
            bestWeights = copy.copy(weights)
            best_mse = mse
        if mse > last_mse:
            stagLen += 1
        else:
            stagLen = 0
        if stagLen == 4 or mse > 50:
            if full or mse > 10:
                stagLen = 0
                gamma = initGamma
                if random.random() < 0.50:
                    # Restart from fresh random weights in [-0.5, 1.5).
                    for wt in weights:
                        weights[wt] = random.random()*2-0.5
                else:
                    # Restart from the best weights with up to ±2.5% jitter.
                    weights = copy.copy(bestWeights)
                    for wt in weights:
                        weights[wt] *= 0.975+0.05*random.random()
            else:
                break
    print('Done.')
2021-09-24 16:13:55 +02:00
2022-09-11 18:56:47 +02:00
2021-09-24 16:13:55 +02:00
def saveWeights(weights):
    """Write ``weights`` as JSON to ``neuralWeights.json`` in the working directory."""
    with open('neuralWeights.json', 'w') as out:
        json.dump(weights, out)
2022-09-11 18:56:47 +02:00
2021-09-24 16:13:55 +02:00
def loadWeights():
    """Return the weights stored in ``neuralWeights.json``.

    Falls back to the built-in defaults when the file cannot be opened.
    """
    defaults = {
        "topList": 0.15,
        "recommender": 0.30,
        "author": 0.70,
        "series": 0.05,
        "tag": 0.05,
        "pagerank": 0.05,
        "mu": 0.50,
        "sigma": 0.30,
        "bias": 0.25,
        "median": 0.10,
        # "tgb_rank": 0.10,
    }
    try:
        with open('neuralWeights.json', 'r') as src:
            return json.loads(src.read())
    except IOError:
        return defaults
2022-09-11 18:56:47 +02:00
def cliInterface(imgDef=False):
    """Parse the command line and dispatch to mainCLI or perfTestCLI.

    Args:
        imgDef: default for node images. When true, images are enabled unless
            ``--no-imgs`` is given; otherwise they are only enabled by
            ``--imgs``. Either way the result is available as ``args.imgs``.
    """
    import argparse
    import argcomplete
    parser = argparse.ArgumentParser(description='TODO: Write Description.')
    parser.add_argument('--keep-priv', action="store_true")
    parser.add_argument('--keep-whitepapers', action="store_true")
    parser.add_argument('--remove-read', action="store_true")
    parser.add_argument('--remove-unread', action="store_true")
    parser.add_argument('--no-web', action="store_true")
    parser.add_argument('--no-list', action="store_true")
    parser.add_argument('--remove-edge', action="store_true")
    parser.add_argument('--keep-top-lists', action="store_true")
    parser.add_argument('--keep-useless-recommenders', action="store_true")
    parser.add_argument('--dark', action="store_true")
    parser.add_argument('--curiosity', type=float, default=0.0,
                        help='curiosity coefficient (higher = more speculative)')
    parser.add_argument('--v3d', action="store_true")
    if imgDef:
        parser.add_argument('--no-imgs', action="store_true")
    else:
        parser.add_argument('--imgs', action="store_true")
    parser.add_argument('--perf-test', action="store_true")
    parser.add_argument('--train', action="store_true")
    cmds = parser.add_subparsers(required=True, dest='cmd')

    p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
    p_rec.add_argument('-n', type=int, default=20,
                       help='number of books to recommend')
    p_rec.add_argument('--tag-based', action="store_true")
    p_rec.add_argument('--recommender-based', action="store_true")
    p_rec.add_argument('--new', type=int, default=-1,
                       help='number of new books to recommend')

    p_ls = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
    p_ls.add_argument('-n', type=int, default=50,
                      help='number of books to recommend')

    p_read = cmds.add_parser('read', description="TODO", aliases=[])
    p_read.add_argument('--min-rating', type=int, default=0)
    p_read.add_argument('--all-tags', action="store_true")
    p_read.add_argument('--only-connected', action="store_true")

    p_show = cmds.add_parser('analyze', description="TODO", aliases=[])
    p_show.add_argument(
        'type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag'])
    p_show.add_argument('name', type=str)
    p_show.add_argument('-d', type=float, default=2.1,
                        help='depth of expansion')

    p_train = cmds.add_parser('train', description="TODO", aliases=[])
    p_train.add_argument('-g', type=float, default=0.2,
                         help='learning rate gamma')
    p_train.add_argument('--full', action="store_true")

    p_prog = cmds.add_parser('progress', description="TODO", aliases=[])
    p_prog.add_argument('-m', type=float, default=7,
                        help='Minimum Score to read')

    # Sub-commands without options of their own.
    cmds.add_parser('competence', description="TODO", aliases=[])
    cmds.add_parser('shell', description="TODO", aliases=[])

    p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
    p_new.add_argument('-n', type=int, default=10,
                       help='number of books to recommend')

    cmds.add_parser('calice', description="TODO", aliases=[])

    p_dis = cmds.add_parser('dissonance', description="TODO", aliases=['dis'])
    p_dis.add_argument('-n', type=int, default=-1,
                       help='Maximum number of books to list')
    p_dis.add_argument(
        '--sort', choices=['dissonance_abs', 'dissonance_off', 'score'], default='dissonance_abs', const='dissonance_abs', nargs='?')
    p_dis.add_argument('--reversed', action="store_true")

    p_createCol = cmds.add_parser(
        'createCaliceColumn', description="TODO", aliases=[])
    p_createCol.add_argument('type', choices=['score', 'rating', 'both'])

    cmds.add_parser('full', description="TODO", aliases=[])

    # Shell completion hooks for zsh and bash.
    pyzshcomplete.autocomplete(parser)
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    if imgDef:
        # Normalise to a single attribute regardless of which flag exists.
        args.imgs = not args.no_imgs

    if args.perf_test:
        perfTestCLI(args)
    else:
        mainCLI(args)
2022-09-11 18:56:47 +02:00
2022-02-22 15:02:48 +01:00
def perfTestCLI(args):
    """Run mainCLI(args) under pycallgraph.

    The call graph is written as a PNG named after the current Unix time into
    ``perfTests/``; calls inside pycallgraph and numpy are not traced.
    """
    import time
    from pycallgraph import Config, GlobbingFilter, PyCallGraph
    from pycallgraph.output import GraphvizOutput

    excluded = [
        "pycallgraph.*",
        "numpy.*"
    ]
    config = Config()
    config.trace_filter = GlobbingFilter(exclude=excluded)
    target = 'perfTests/' + str(int(time.time())) + '.png'
    graphOut = GraphvizOutput(output_file=target)
    with PyCallGraph(output=graphOut, config=config):
        mainCLI(args)
2022-09-11 18:56:47 +02:00
2022-02-22 15:02:48 +01:00
def mainCLI(args):
    """Execute the sub-command selected on the command line.

    Loads the deferred imports and the weights, optionally trains, builds and
    scores the full graph, runs the command-specific pruning and finally
    prints the best list and/or renders the graph.

    Args:
        args: namespace produced by cliInterface's argument parser.

    Raises:
        Error: on conflicting options or an unknown sub-command.
    """
    global weights
    defered_imports()
    weights = loadWeights()

    # argparse stores a sub-command alias exactly as typed, so map the
    # aliases declared in cliInterface back to their canonical names.
    aliases = {'rec': 'recommend', 'ls': 'listScores', 'dis': 'dissonance'}
    cmd = aliases.get(args.cmd, args.cmd)

    if cmd == "train":
        train(args.g, args.full)
        exit()
    if args.train:
        train(0.2, False)

    bestListT = 'book'

    G, books = buildFullGraph(darkMode=args.dark)
    mu, std = genScores(G, books)
    curiosityReward(G, args.curiosity)

    if not args.keep_whitepapers:
        removeWhitepapers(G)

    if cmd == "recommend":
        if args.new == -1:
            # Default: one new book for every five recommended ones.
            args.new = int(args.n / 5)
        if args.new != 0:
            findNewBooks(G, books, mu, args.new, minRecSco=mu-std)
        if args.tag_based:
            if args.recommender_based:
                raise Error(
                    'tag-based and recommender-based cannot be combined')
            recommendNBooksTagBased(
                G, mu, std, args.n, not args.keep_top_lists)
        elif args.recommender_based:
            recommendNBooksRecommenderBased(
                G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders)
        else:
            recommendNBooks(G, mu, std, args.n, not args.keep_top_lists,
                            not args.keep_useless_recommenders, args.v3d)
    elif cmd == "listScores":
        listScores(G, mu, std, args.n)
    elif cmd == "read":
        readBooksAnalysis(G, args.min_rating, args.all_tags,
                          args.only_connected, not args.keep_top_lists)
    elif cmd == "analyze":
        analyze(G, books, mu, args.type, args.name, args.d)
    elif cmd == "full":
        fullGraph(G, not args.keep_top_lists)
    elif cmd == "competence":
        bestListT = 'recommender'
        recommenderCompetence(G)
    elif cmd == "shell":
        shell(G, books, mu, std)
    elif cmd == "progress":
        progress(G, books, mu, args.m)
        return
    elif cmd == "newBooks":
        bestListT = 'newBook'
        newBooks(G, books, args.n, mu, std)
    elif cmd == "calice":
        calice(G)
        exit()
    elif cmd == "dissonance":
        describeDissonance(books, args.n, args.sort, not args.reversed)
        exit()
    elif cmd == "createCaliceColumn":
        if args.type in ['score', 'both']:
            calibreDB.createCaliceScoreColumn()
            print('[*] Column "Calice Score" was created.')
        if args.type in ['rating', 'both']:
            calibreDB.createCaliceRatingColumn()
            print('[*] Column "Calice Rating" was created.')
        print(
            '[i] To allow displaying half-stars, please activate them manually in the calibre-settings.')
        exit()
    else:
        raise Error("Unknown command: {}".format(args.cmd))

    if not args.keep_priv:
        removePriv(G)
    if args.remove_read:
        removeRead(G)
    elif args.remove_unread:
        removeUnread(G)

    removeDangling(G, alsoBooks=True)
    if args.remove_edge:
        removeEdge(G)

    if not args.no_list:
        printBestList(G, t=bestListT)
    if not args.no_web and cmd not in ['listScores']:
        if args.v3d:
            genAndShow3D(G, darkMode=args.dark)
        else:
            if args.imgs:
                addImagesToNodes(G)
            genAndShowHTML(G, darkMode=args.dark)
2021-06-14 22:20:36 +02:00
2021-09-24 16:13:55 +02:00
2023-01-17 23:18:26 +01:00
# The weights are not loaded eagerly: CLI runs load them in mainCLI(), the
# import branch below loads them when this file is used as a module.
weights = None

if __name__ != "__main__":
    weights = loadWeights()
    defered_imports()
else:
    try:
        cliInterface(imgDef=True)
    except Error as e:
        print(f"[!] {e}")