# Provenance: contents of main.py as added by commit
# 6dabc68f3f46c507c0afbdc833c172d872d24053 ("initial commit", Dominik Roth,
# Mon Jun 14 22:20:36 2021 +0200). The same commit also adds a .gitignore
# listing __pycache__, *.html and .venv.
"""Book recommendation graph built from a calibre library.

Books are read via ``calibredb`` and turned into a networkx graph in which
books are linked to "opinion" nodes (authors, recommenders, top lists,
series, tags).  Ratings of read books are propagated through the opinion
nodes to score unread books; the pruned graph is rendered with pyvis.

Tag conventions read by this module:
    "<name> Recommendation"  -> the book was recommended by <name>
    "<list> Top <N>"         -> the book is in the top N of <list>
    "priv"                   -> the book is excluded by removePriv()
"""
import os
import json
import math
import random

import numpy as np
from scipy.stats import norm

import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network


# ---------------------------------------------------------------------------
# Extracting information from book records (dicts as emitted by calibredb)
# ---------------------------------------------------------------------------

def getAllAuthors(books):
    """Return a list of the distinct authors over all books (unordered)."""
    return list({author for book in books for author in getAuthors(book)})


def getAuthors(book):
    """Return the authors of a book; book['authors'] is split on ' & '."""
    return book['authors'].split(' & ')


def getRecommenders(book):
    """Yield the recommender name for every '<name> Recommendation' tag."""
    for tag in book['tags']:
        if " Recommendation" in tag:
            yield tag.replace(" Recommendation", "")


def getTags(book):
    """Yield the plain tags of a book (no recommendation / top-list tags)."""
    for tag in book['tags']:
        if " Recommendation" not in tag and " Top " not in tag:
            yield tag


def getAllRecommenders(books):
    """Return a list of the distinct recommenders over all books."""
    return list({rec for book in books for rec in getRecommenders(book)})


def getTopLists(book):
    """Return the distinct top-list names a book is tagged with."""
    return list({tag.split(" Top ")[0]
                 for tag in book['tags'] if " Top " in tag})


def getAllTopLists(books):
    """Return a list of the distinct top-list names over all books."""
    return list({top for book in books for top in getTopLists(book)})


def getAllSeries(books):
    """Return a list of the distinct series names over all books."""
    return list({book['series'] for book in books if 'series' in book})


def getAllTags(books):
    """Return a list of the distinct plain tags over all books."""
    return list({tag for book in books for tag in getTags(book)})


def getTopListWheight(book, topList):
    """Return the weight of a book's membership in ``topList``.

    The weight is ``100 / N`` where N is the smallest scope among the book's
    "<topList> Top <N>" tags, so a tighter list yields a larger weight.

    Raises:
        ValueError: if the book carries no "<topList> Top <N>" tag, or the
            part after " Top " is not an integer.
    """
    prefix = topList + " Top "
    # Match the prefix rather than searching anywhere in the tag; otherwise
    # the list "Guardian" would also pick up the scope of "The Guardian Top 10".
    scopes = [int(tag[len(prefix):])
              for tag in book['tags'] if tag.startswith(prefix)]
    if not scopes:
        raise ValueError(
            "book has no '{}Top <N>' tag".format(prefix[:-4]))
    return 100/min(scopes)


# ---------------------------------------------------------------------------
# Graph pruning
# ---------------------------------------------------------------------------

def _removeWhere(G, shouldRemove):
    """Remove every node for which ``shouldRemove(n, attrs)`` is true.

    Iterates over a snapshot of the node ids so removal during iteration is
    safe; the predicate is evaluated lazily, i.e. it sees earlier removals.
    """
    for n in list(G.nodes):
        if shouldRemove(n, G.nodes[n]):
            G.remove_node(n)


def removeRead(G):
    """Remove all book nodes that have a rating."""
    _removeWhere(G, lambda n, node:
                 node['t'] == 'book' and node['rating'] is not None)


def removeUnread(G):
    """Remove all book nodes that have no rating."""
    _removeWhere(G, lambda n, node:
                 node['t'] == 'book' and node['rating'] is None)


def removePriv(G):
    """Remove all book nodes tagged 'priv'."""
    _removeWhere(G, lambda n, node:
                 node['t'] == 'book' and 'priv' in node['tags'])


def removeDangling(G, alsoBooks=False):
    """Remove nodes without neighbours; books only if ``alsoBooks`` is set."""
    _removeWhere(G, lambda n, node:
                 (node['t'] != 'book' or alsoBooks) and not len(G.adj[n]))


def removeEdge(G):
    """Remove non-book nodes that have fewer than two neighbours."""
    _removeWhere(G, lambda n, node:
                 node['t'] != 'book' and len(G.adj[n]) < 2)


def removeBad(G, threshold, groups=('book', 'topList', 'recommender',
                                    'author', 'series', 'tag')):
    """Remove nodes of the given types whose score is None or < threshold.

    Nodes that carry no 'score' attribute at all are left untouched.
    """
    def isBad(n, node):
        if node['t'] not in groups or 'score' not in node:
            return False
        return node['score'] is None or node['score'] < threshold

    _removeWhere(G, isBad)


def removeKeepBest(G, num, maxDistForRead=1):
    """Keep the ``num`` best-scored books plus comparable read books.

    A node is a removal candidate if it is a book outside the top ``num`` by
    score, or if it (whatever its type) has a 'score' of None.  A candidate
    is kept only if it has a rating of at least
    ``<lowest kept score> - maxDistForRead``.  If no book is kept (nothing is
    scored, or ``num`` is 0) there is no reference score and every candidate
    is removed.
    """
    scored = [n for n in G.nodes
              if G.nodes[n]['t'] == 'book'
              and G.nodes[n].get('score') is not None]
    scored.sort(key=lambda n: G.nodes[n]['score'], reverse=True)
    best = scored[:num]
    # Compare by node id, not by attribute-dict equality.
    bestIds = set(best)
    if best:
        cutoff = G.nodes[best[-1]]['score'] - maxDistForRead
    else:
        cutoff = math.inf

    def shouldRemove(n, node):
        unselectedBook = node['t'] == 'book' and n not in bestIds
        unscored = 'score' in node and node['score'] is None
        if not (unselectedBook or unscored):
            return False
        rating = node.get('rating')
        return rating is None or rating < cutoff

    _removeWhere(G, shouldRemove)


def removeTags(G):
    """Remove all tag nodes."""
    _removeWhere(G, lambda n, node: node['t'] == 'tag')


def pruneTags(G, minCons=2):
    """Remove tag nodes whose books are already well connected.

    For each tag, counts the links from its adjacent books to nodes that are
    neither tags nor top lists, and removes the tag if that count exceeds
    ``minCons``.
    """
    # NOTE(review): the parameter name suggests a minimum, but tags are
    # removed when the count is *above* it -- confirm this is intended.
    def isRedundant(n, node):
        if node['t'] != 'tag':
            return False
        foundCon = sum(1
                       for book in G.adj[n]
                       for con in G.adj[book]
                       if G.nodes[con]['t'] not in ['tag', 'topList'])
        return foundCon > minCons

    _removeWhere(G, isRedundant)


def removeHighSpanTags(G, maxCons=5):
    """Remove tag nodes with more than ``maxCons`` neighbours."""
    _removeWhere(G, lambda n, node:
                 node['t'] == 'tag' and len(G.adj[n]) > maxCons)


def removeTopLists(G):
    """Remove all top-list nodes."""
    _removeWhere(G, lambda n, node: node['t'] == 'topList')


def removeRestOfSeries(G):
    """Per series, drop books beyond the next one after the last rated.

    ``seriesState`` is the highest (integer-truncated) series index among
    rated books of the series, or 0 if none is rated; books whose index
    exceeds ``seriesState + 1`` (with a small tolerance) are removed.
    """
    for n in list(G.nodes):
        if G.nodes[n]['t'] != 'series':
            continue
        seriesState = 0
        for adj in G.adj[n]:
            adjNode = G.nodes[adj]
            if adjNode['rating'] is not None:
                seriesState = max(seriesState, int(adjNode['series_index']))
        # Snapshot the neighbours: nodes are removed while iterating.
        for adj in list(G.adj[n]):
            if G.nodes[adj]['series_index'] > seriesState + 1.0001:
                G.remove_node(adj)


# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------

def scoreOpinions(G, globMu, globStd, errorFac=1.2):
    """Score every opinion node from the ratings of its adjacent books.

    Sets 'mean', 'std', 'se', 'score' and 'feedbacks' on nodes with at least
    one rated neighbour; otherwise sets 'score' to None.  The score is the
    mean rating minus a penalty that grows with the standard error (derived
    from ``globStd``) and with the share of unrated neighbours; recommenders
    get a 0.001 bonus.  ``globMu`` is accepted but not used.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ['topList', 'recommender', 'author', 'series',
                             'tag']:
            continue
        adjacens = list(G.adj[n].keys())
        feedbacks = [G.nodes[adj]['rating'] for adj in adjacens
                     if G.nodes[adj]['rating'] is not None]
        if not feedbacks:
            node['score'] = None
            continue
        node['mean'], node['std'] = norm.fit(feedbacks)
        node['se'] = globStd / math.sqrt(len(feedbacks))
        ratio = len(feedbacks) / len(adjacens)
        node['score'] = node['mean'] - errorFac * \
            node['se']*(9/10 + (1-ratio)/10) + 0.001 * \
            (node['t'] == 'recommender')
        node['feedbacks'] = feedbacks


def scoreUnread(G, globMu, globStd, errorFac=1):
    """Score every unrated book from its scored, non-tag neighbours.

    'mean'/'std' are fitted to the pooled ratings behind those neighbours
    ("deep" feedback), 'mean2'/'std2' to the neighbours' scores.  The score
    is a 4:2:1 weighted average of (mean - errorFac*se), mean2 and
    (mean2 - 0.25*std2).  Books with series index 1.0 get a tiny tie-break
    bonus.  Books without any scored non-tag neighbour get a score of None.
    Tag neighbours are ignored; ``globMu`` is accepted but not used.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is not None:
            continue
        feedbacks = []
        deepFeedbacks = []
        for adj in G.adj[n]:
            adjNode = G.nodes[adj]
            if adjNode.get('score') is None or adjNode['t'] == 'tag':
                continue
            feedbacks.append(adjNode['score'])
            deepFeedbacks.extend(adjNode['feedbacks'])
        if not feedbacks:
            node['score'] = None
            continue
        node['mean'], node['std'] = norm.fit(deepFeedbacks)
        node['mean2'], node['std2'] = norm.fit(feedbacks)
        node['se'] = globStd / math.sqrt(len(deepFeedbacks))
        node['score'] = (
            (node['mean'] - errorFac*node['se'])*4 + node['mean2']*2
            + (node['mean2'] - node['std2']*0.25)*1)/7
        if 'series' in node:
            if node['series_index'] == 1.0:
                node['score'] += 0.000000001


def printBestList(G, num=25):
    """Print the ``num`` best-scored books, best first.

    Each line is "[rank] title (authors): score", the rank zero-padded to
    the number of digits of ``num``.  Nothing is printed for ``num < 1``.
    """
    if num < 1:
        return
    bestlist = [G.nodes[n] for n in G.nodes
                if G.nodes[n]['t'] == 'book'
                and G.nodes[n].get('score') is not None]
    bestlist.sort(key=lambda node: node['score'], reverse=True)
    width = int(math.log10(num)+1)
    for i, book in enumerate(bestlist[:num]):
        print("["+str(i+1).zfill(width)+"] "+book['title'] +
              " ("+" & ".join(book['authors'])+"): {:.5f}".format(book['score']))


# ---------------------------------------------------------------------------
# Graph construction
# ---------------------------------------------------------------------------

def readColor(book):
    """Return the edge colour for a book: 'green' if it has a 'rating' key."""
    return 'green' if 'rating' in book else 'gray'


def loadBooksFromDB():
    """Return the parsed output of ``calibredb list --for-machine -f all``."""
    # Use the pipe as a context manager so it is closed after reading.
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.loads(pipe.read())


def buildBookGraph(books):
    """Return a new graph with one 'book' node per record, keyed by book id.

    'rating', 'series' and 'series_index' are None when the record has no
    'rating' / 'series' key.  'desc' is always the empty string.
    """
    G = nx.Graph()

    for book in books:
        rating = book.get('rating')
        if 'series' in book:
            series = book['series']
            series_index = book['series_index']
        else:
            series = None
            series_index = None
        G.add_node(book['id'], t='book', label=book['title'],
                   title=book['title'], shape='image', image=book['cover'],
                   rating=rating, tags=book['tags'], desc='',
                   isbn=book['isbn'], files=book['formats'],
                   authors=getAuthors(book), series=series,
                   series_index=series_index)

    return G


def graphAddAuthors(G, books):
    """Add 'a/<author>' nodes and link each book to its authors."""
    for author in getAllAuthors(books):
        G.add_node('a/'+author, color='green', t='author', label=author)
    for book in books:
        for author in getAuthors(book):
            G.add_edge('a/'+author, book['id'], color=readColor(book))
    return G


def graphAddRecommenders(G, books):
    """Add 'r/<name>' nodes and link each book to its recommenders."""
    for rec in getAllRecommenders(books):
        G.add_node('r/'+rec, color='orange', t='recommender', label=rec)
    for book in books:
        for rec in getRecommenders(book):
            G.add_edge('r/'+rec, book['id'], color=readColor(book))
    return G


def graphAddTopLists(G, books):
    """Add 'tl/<list>' nodes and link each book to its top lists.

    Edges carry a 'wheight' attribute from getTopListWheight().  The 'tl/'
    prefix keeps top-list ids distinct from the 't/' ids used for tags.
    """
    for tl in getAllTopLists(books):
        G.add_node('tl/'+tl, color='yellow', t='topList', label=tl)
    for book in books:
        for top in getTopLists(book):
            G.add_edge('tl/'+top, book['id'], wheight=getTopListWheight(
                book, top), color=readColor(book))
    return G


def graphAddSeries(G, books):
    """Add 's/<series>' nodes and link each book to its series."""
    for series in getAllSeries(books):
        G.add_node('s/'+series, color='red', t='series', label=series)
    for book in books:
        if 'series' in book:
            G.add_edge('s/'+book['series'], book['id'], color=readColor(book))
    return G


def graphAddTags(G, books):
    """Add 't/<tag>' nodes and link each book to its plain tags."""
    for tag in getAllTags(books):
        G.add_node('t/'+tag, color='gray', t='tag', label=tag)
    for book in books:
        for tag in getTags(book):
            G.add_edge('t/'+tag, book['id'], color=readColor(book))
    return G


def calcRecDist(G, books):
    """Return ``norm.fit`` (mean, std) of all ratings of the given books."""
    globRatings = [G.nodes[book['id']]['rating'] for book in books
                   if G.nodes[book['id']]['rating'] is not None]
    return norm.fit(globRatings)


# ---------------------------------------------------------------------------
# Presentation
# ---------------------------------------------------------------------------

def scaleBooksByRating(G):
    """Set the 'value' attribute from the rating, else the score, else 15.

    Despite its name this is applied to every node type.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node.get('rating') is not None:
            node['value'] = 20 + 5 * int(node['rating'])
        elif node.get('score') is not None:
            node['value'] = 20 + 5 * int(node['score'])
        else:
            node['value'] = 15


def scaleOpinionsByRating(G):
    """Set 'value' of top lists, recommenders, authors and series by score.

    Nodes of these types without a score get a value of 20; tags and books
    are not touched.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ['topList', 'recommender', 'author', 'series']:
            continue
        if node.get('score') is not None:
            node['value'] = 20 + 5 * int(node['score'])
        else:
            node['value'] = 20


def addScoreToLabels(G):
    """Append " (rating)", " (~score)" or " (~0)" to non-tag node labels."""
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'tag':
            continue
        if node.get('rating') is not None:
            node['label'] += " ("+str(node['rating'])+")"
        elif node.get('score') is not None:
            node['label'] += " (~{:.2f}".format(node['score'])+")"
        else:
            node['label'] += " (~0)"


def genAndShowHTML(G, showButtons=False):
    """Render the graph with pyvis into 'nx.html' and show it."""
    net = Network('1080px', '1920px')
    if showButtons:
        net.show_buttons(filter_=['configure', 'layout',
                                  'interaction', 'physics', 'edges'])
    net.from_nx(G)
    net.show('nx.html')


# ---------------------------------------------------------------------------
# Pipelines
# ---------------------------------------------------------------------------

def buildFullGraph():
    """Load the library and return ``(G, books)`` with all node kinds added."""
    books = loadBooksFromDB()
    G = buildBookGraph(books)

    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    return G, books


def genScores(G, books):
    """Score opinion nodes, then unread books; return (globMu, globStd)."""
    globMu, globStd = calcRecDist(G, books)
    scoreOpinions(G, globMu, globStd)
    scoreUnread(G, globMu, globStd)
    return globMu, globStd


def recommendNBooks(n):
    """Print and render the ``n`` best recommendations.

    The pruning steps below are order-dependent: each one works on the graph
    left behind by the previous one.
    """
    G, books = buildFullGraph()
    mu, std = genScores(G, books)

    removeRestOfSeries(G)
    removePriv(G)
    removeBad(G, mu-std-1.5)
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=1.5)
    removeEdge(G)
    removeHighSpanTags(G, 9)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 4)
    removeBad(G, mu, groups=['book'])
    pruneTags(G, 3)
    removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=0.75)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    printBestList(G, num=n)
    genAndShowHTML(G)


def fullGraph():
    """Print the best 100 books and render the lightly pruned full graph."""
    G, books = buildFullGraph()
    mu, std = genScores(G, books)

    removePriv(G)
    removeEdge(G)
    removeHighSpanTags(G, 7)
    removeDangling(G, alsoBooks=False)
    removeTopLists(G)
    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    printBestList(G, num=100)
    genAndShowHTML(G)


def readBooksAnalysis():
    """Render the graph restricted to books that have a rating."""
    G, books = buildFullGraph()
    mu, std = genScores(G, books)

    removePriv(G)
    removeUnread(G)
    removeEdge(G)
    removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=False)
    removeTopLists(G)
    pruneTags(G, 8)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    printBestList(G, num=100)
    genAndShowHTML(G)


if __name__ == "__main__":
    recommendNBooks(30)