CaliGraph/main.py

499 lines
14 KiB
Python

import os
import json
import math
import random
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network
def getAllAuthors(books):
    """Return every distinct author name found in ``books`` as a list.

    Author names are obtained per book via ``getAuthors``. The result is
    built from a set, so it holds no duplicates and its order is unspecified.
    """
    uniqueAuthors = {name for book in books for name in getAuthors(book)}
    return list(uniqueAuthors)
def getAuthors(book):
    """Split the book's ``'authors'`` string on ``' & '`` and return the names."""
    authorField = book['authors']
    return authorField.split(' & ')
def getRecommenders(book):
    """Yield the recommender named by each ``'... Recommendation'`` tag.

    A tag counts when it contains the text ``' Recommendation'``; the yielded
    name is the tag with that text removed.
    """
    marker = " Recommendation"
    for tag in book['tags']:
        if marker not in tag:
            continue
        yield tag.replace(marker, "")
def getTags(book):
    """Yield the book's plain tags.

    Tags containing ``' Recommendation'`` or ``' Top '`` are skipped; every
    other tag is yielded unchanged, in its original order.
    """
    for tag in book['tags']:
        isRecommendation = " Recommendation" in tag
        isTopList = " Top " in tag
        if not (isRecommendation or isTopList):
            yield tag
def getAllRecommenders(books):
    """Return the distinct recommender names across ``books`` as a list.

    Names are gathered with ``getRecommenders`` for each book. The list is
    built from a set, so its order is unspecified.
    """
    uniqueRecs = {name for book in books for name in getRecommenders(book)}
    return list(uniqueRecs)
def getTopLists(book):
    """Return the distinct top-list names the book is tagged with.

    A top-list tag contains ``' Top '``; the list name is the text before the
    first occurrence. The result is built from a set (order unspecified).
    """
    separator = " Top "
    listNames = {
        tag.split(separator)[0] for tag in book['tags'] if separator in tag
    }
    return list(listNames)
def getAllTopLists(books):
    """Return the distinct top-list names across ``books`` as a list.

    Names are gathered with ``getTopLists`` for each book. The list is built
    from a set, so its order is unspecified.
    """
    uniqueLists = {name for book in books for name in getTopLists(book)}
    return list(uniqueLists)
def getAllSeries(books):
    """Return the distinct ``'series'`` values of the books that have one.

    Books without a ``'series'`` key are ignored. The list is built from a
    set, so its order is unspecified.
    """
    return list({book['series'] for book in books if 'series' in book})
def getAllTags(books):
    """Return the distinct plain tags across ``books`` as a list.

    Tags are gathered with ``getTags`` for each book. The list is built from
    a set, so its order is unspecified.
    """
    uniqueTags = {tag for book in books for tag in getTags(book)}
    return list(uniqueTags)
def getTopListWheight(book, topList):
    """Return the weight of ``book`` within the top list named ``topList``.

    Top-list membership is encoded in tags of the form ``'<list> Top <N>'``.
    The smallest ``N`` the book carries for ``topList`` is used and the weight
    is ``100 / N``, so a tighter list gives a larger weight.

    Only tags whose list name (the text before the first ``' Top '``) equals
    ``topList`` exactly are considered, which is the same rule ``getTopLists``
    uses to derive list names. A tag of another list whose name merely ends
    with ``topList`` is ignored.

    Raises:
        ValueError: if the book has no tag for ``topList``, or if the text
            following ``' Top '`` is not an integer.
    """
    scopes = []
    for tag in book['tags']:
        parts = tag.split(" Top ")
        # len(parts) > 1 means the tag really contains the ' Top ' separator.
        if len(parts) > 1 and parts[0] == topList:
            scopes.append(int(parts[1]))
    if not scopes:
        raise ValueError(
            "book has no '{} Top <N>' tag".format(topList))
    return 100 / min(scopes)
def removeRead(G):
    """Remove every book node whose ``'rating'`` is set (not ``None``)."""
    ratedBooks = [
        key for key in G.nodes
        if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is not None
    ]
    for key in ratedBooks:
        G.remove_node(key)
def removeUnread(G):
    """Remove every book node whose ``'rating'`` is ``None``."""
    unratedBooks = [
        key for key in G.nodes
        if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is None
    ]
    for key in unratedBooks:
        G.remove_node(key)
def removePriv(G):
    """Remove every book node whose ``'tags'`` contain ``'priv'``."""
    privateBooks = [
        key for key in G.nodes
        if G.nodes[key]['t'] == 'book' and 'priv' in G.nodes[key]['tags']
    ]
    for key in privateBooks:
        G.remove_node(key)
def removeDangling(G, alsoBooks=False):
    """Remove nodes that have no neighbours.

    Book nodes are left alone unless ``alsoBooks`` is true.
    """
    for key in list(G.nodes):
        isProtectedBook = G.nodes[key]['t'] == 'book' and not alsoBooks
        if isProtectedBook:
            continue
        if len(G.adj[key]) == 0:
            G.remove_node(key)
def removeEdge(G):
    """Remove non-book nodes that have fewer than two neighbours.

    Despite the name, this removes nodes, not edges.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book':
            continue
        neighbourCount = len(G.adj[key])
        if neighbourCount < 2:
            G.remove_node(key)
def removeBad(G, threshold, groups=('book', 'topList', 'recommender', 'author', 'series', 'tag')):
    """Remove scored nodes whose score is missing or below ``threshold``.

    Args:
        G: graph whose nodes carry a ``'t'`` type attribute.
        threshold: nodes with ``score < threshold`` are removed.
        groups: node types to consider; any container supporting ``in``
            works. The default is an immutable tuple so that it cannot be
            altered between calls.

    Only nodes that have a ``'score'`` attribute are affected: they are
    removed when that score is ``None`` or lower than ``threshold``. Nodes
    without a ``'score'`` key are kept.
    """
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] not in groups:
            continue
        if 'score' in node and (node['score'] is None or node['score'] < threshold):
            G.remove_node(key)
def removeKeepBest(G, num, maxDistForRead=1):
    """Keep only the ``num`` best-scored books, plus read books close to them.

    Args:
        G: graph whose nodes carry ``'t'`` and optionally ``'score'`` /
            ``'rating'`` attributes.
        num: how many of the highest-scored books to keep.
        maxDistForRead: a rated book outside the top ``num`` survives when its
            rating is not lower than the worst kept score minus this value.

    Candidates for removal are (a) book nodes that are not among the kept
    top-scored books and (b) nodes of any type whose ``'score'`` attribute is
    present but ``None``. A candidate is removed when it has no rating, or
    when its rating falls below the cutoff described above.

    When no scored book is kept (no book has a score, or ``num`` selects
    none) there is no cutoff to compare against; rated candidates are then
    kept instead of raising ``IndexError``.
    """
    scoredBooks = [
        key for key in G.nodes
        if G.nodes[key]['t'] == 'book' and G.nodes[key].get('score') is not None
    ]
    scoredBooks.sort(key=lambda key: G.nodes[key]['score'], reverse=True)
    topBooks = scoredBooks[:num]
    # Membership is tracked by node key rather than by comparing attribute
    # dicts, so lookups are O(1) and equal-looking nodes are not confused.
    keptKeys = set(topBooks)
    cutoff = None
    if topBooks:
        cutoff = G.nodes[topBooks[-1]]['score'] - maxDistForRead

    for key in list(G.nodes):
        node = G.nodes[key]
        isLosingBook = node['t'] == 'book' and key not in keptKeys
        isUnscored = 'score' in node and node['score'] is None
        if not (isLosingBook or isUnscored):
            continue
        rating = node.get('rating')
        if rating is None or (cutoff is not None and rating < cutoff):
            G.remove_node(key)
def removeTags(G):
    """Remove every node of type ``'tag'`` from the graph."""
    tagKeys = [key for key in G.nodes if G.nodes[key]['t'] == 'tag']
    for key in tagKeys:
        G.remove_node(key)
def pruneTags(G, minCons=2):
    """Remove tag nodes whose books are already linked well by other nodes.

    For each tag, every neighbour of each tagged book is inspected and those
    that are neither a tag nor a top list are counted (a node reachable via
    several books is counted once per book). The tag is removed when that
    count is greater than ``minCons``.

    NOTE(review): the parameter is called ``minCons`` but acts as an upper
    bound here - confirm that removing the *well*-connected tags is intended.
    """
    ignoredTypes = ['tag', 'topList']
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        otherLinks = sum(
            1
            for book in G.adj[key]
            for neighbour in G.adj[book]
            if G.nodes[neighbour]['t'] not in ignoredTypes
        )
        if otherLinks > minCons:
            G.remove_node(key)
def removeHighSpanTags(G, maxCons=5):
    """Remove tag nodes that have more than ``maxCons`` neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        span = len(G.adj[key])
        if span > maxCons:
            G.remove_node(key)
def removeTopLists(G):
    """Remove every node of type ``'topList'`` from the graph."""
    topListKeys = [key for key in G.nodes if G.nodes[key]['t'] == 'topList']
    for key in topListKeys:
        G.remove_node(key)
def removeRestOfSeries(G):
    """Drop series volumes that lie beyond the next unread one.

    For every series node, the highest ``series_index`` (truncated to int)
    among its rated neighbours is determined, starting from 0 when none is
    rated. Any neighbour whose ``series_index`` exceeds that value by more
    than ``1.0001`` is removed from the graph.

    The series nodes are collected before anything is removed, so the loop
    never looks up a book node that an earlier series already deleted.
    """
    seriesKeys = [key for key in G.nodes if G.nodes[key]['t'] == 'series']
    for key in seriesKeys:
        volumes = list(G.adj[key])
        readIndices = [
            int(G.nodes[vol]['series_index'])
            for vol in volumes
            if G.nodes[vol]['rating'] is not None
        ]
        # 0 is the floor, matching a series of which nothing has been read.
        furthestRead = max([0] + readIndices)
        for vol in volumes:
            if G.nodes[vol]['series_index'] > furthestRead + 1.0001:
                G.remove_node(vol)
def scoreOpinions(G, globMu, globStd, errorFac=1.2):
    """Score every non-book node from the ratings of its neighbours.

    Applies to nodes of type topList, recommender, author, series and tag.
    The ratings of all rated neighbours are collected; when there are none
    the node's ``'score'`` is set to ``None``. Otherwise the node receives:

    * ``'mean'`` / ``'std'``: normal fit of those ratings,
    * ``'se'``: ``globStd / sqrt(number of ratings)``,
    * ``'score'``: the mean minus ``errorFac * se`` scaled by a factor
      between 0.9 and 1.0 that grows with the share of unrated neighbours,
      plus 0.001 for recommender nodes,
    * ``'feedbacks'``: the list of collected ratings.

    ``globMu`` is accepted but not used by this function.
    """
    opinionTypes = ['topList', 'recommender', 'author', 'series', 'tag']
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] not in opinionTypes:
            continue
        neighbours = list(G.adj[key].keys())
        ratings = [
            G.nodes[nb]['rating']
            for nb in neighbours
            if G.nodes[nb]['rating'] is not None
        ]
        if not ratings:
            node['score'] = None
            continue
        node['mean'], node['std'] = norm.fit(ratings)
        node['se'] = globStd / math.sqrt(len(ratings))
        ratedShare = len(ratings) / len(neighbours)
        penalty = errorFac * node['se'] * (9/10 + (1 - ratedShare)/10)
        recommenderBonus = 0.001 * (node['t'] == 'recommender')
        node['score'] = node['mean'] - penalty + recommenderBonus
        node['feedbacks'] = ratings
def scoreUnread(G, globMu, globStd, errorFac=1):
    """Predict a score for every unrated book from its scored neighbours.

    For each book whose ``'rating'`` is ``None``, the scores of all
    neighbouring non-tag nodes that have a score are collected, together with
    the pooled ``'feedbacks'`` of those neighbours. Tag neighbours do not
    contribute. Without any such neighbour the book's ``'score'`` becomes
    ``None``. Otherwise the book receives:

    * ``'mean'`` / ``'std'``: normal fit of the pooled feedback ratings,
    * ``'mean2'`` / ``'std2'``: normal fit of the neighbour scores,
    * ``'se'``: ``globStd / sqrt(number of pooled ratings)``,
    * ``'score'``: ``((mean - errorFac*se)*4 + mean2*2 + mean2*1) / 7``,
      plus a 1e-9 bump when the node has a ``'series'`` key and its
      ``series_index`` equals 1.0.

    ``globMu`` is accepted but not used by this function.
    """
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] != 'book' or node['rating'] is not None:
            continue
        opinionScores = []
        pooledRatings = []
        for nb in list(G.adj[key].keys()):
            opinion = G.nodes[nb]
            if opinion.get('score') is None or opinion['t'] == 'tag':
                continue
            opinionScores.append(opinion['score'])
            pooledRatings.extend(opinion['feedbacks'])
        if not opinionScores:
            node['score'] = None
            continue
        node['mean'], node['std'] = norm.fit(pooledRatings)
        node['mean2'], node['std2'] = norm.fit(opinionScores)
        node['se'] = globStd / math.sqrt(len(pooledRatings))
        cautiousMean = node['mean'] - errorFac * node['se']
        # The mean of the neighbour scores enters twice (weights 2 and 1).
        node['score'] = (
            cautiousMean * 4 + node['mean2'] * 2 + node['mean2'] * 1) / 7
        if 'series' in node and node['series_index'] == 1.0:
            node['score'] += 0.000000001
def printBestList(G, num=25):
    """Print the ``num`` best-scored books, one line per book.

    Books with a non-``None`` ``'score'`` are sorted by score, highest first.
    Each line has the form ``[rank] title (author & author): score`` with the
    rank zero-padded to the number of digits of ``num`` and the score shown
    with five decimals.
    """
    ranked = sorted(
        (
            G.nodes[key] for key in list(G.nodes)
            if G.nodes[key]['t'] == 'book'
            and G.nodes[key].get('score') is not None
        ),
        key=lambda book: book['score'],
        reverse=True,
    )
    for position, book in enumerate(ranked):
        rank = str(position + 1).zfill(int(math.log10(num) + 1))
        authors = " & ".join(book['authors'])
        print("[" + rank + "] " + book['title'] +
              " (" + authors + "): {:.5f}".format(book['score']))
        if position == num - 1:
            break
def readColor(book):
    """Return ``'green'`` when the book has a ``'rating'`` key, else ``'gray'``."""
    return 'green' if 'rating' in book else 'gray'
def loadBooksFromDB():
    """Run ``calibredb list --for-machine -f all`` and return its parsed JSON.

    The command's standard output is read in full and decoded with
    ``json.loads``. The pipe is closed as soon as the output has been read.

    Raises:
        json.JSONDecodeError: if the command's output is not valid JSON
            (for example when it printed nothing).
    """
    with os.popen("calibredb list --for-machine -f all") as pipe:
        rawListing = pipe.read()
    return json.loads(rawListing)
def buildBookGraph(books):
    """Create a graph holding one ``'book'`` node per entry of ``books``.

    Each node is keyed by the book's ``'id'`` and stores title, cover image,
    tags, isbn, formats and the author list from ``getAuthors``. ``rating``,
    ``series`` and ``series_index`` are ``None`` when the book lacks them.
    ``desc`` is always the empty string; the book's comments are not copied.
    """
    G = nx.Graph()
    for book in books:
        hasSeries = 'series' in book
        G.add_node(
            book['id'],
            t='book',
            label=book['title'],
            title=book['title'],
            shape='image',
            image=book['cover'],
            rating=book.get('rating'),
            tags=book['tags'],
            desc='',
            isbn=book['isbn'],
            files=book['formats'],
            authors=getAuthors(book),
            series=book['series'] if hasSeries else None,
            series_index=book['series_index'] if hasSeries else None,
        )
    return G
def graphAddAuthors(G, books):
    """Add a green ``'author'`` node per author and link it to its books.

    Author nodes are keyed ``'a/<name>'``. Each edge is coloured by
    ``readColor`` of the book it touches. Returns ``G``.
    """
    for name in getAllAuthors(books):
        G.add_node('a/' + name, color='green', t='author', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getAuthors(book):
            G.add_edge('a/' + name, book['id'], color=edgeColor)
    return G
def graphAddRecommenders(G, books):
    """Add an orange ``'recommender'`` node per recommender and link its books.

    Recommender nodes are keyed ``'r/<name>'``. Each edge is coloured by
    ``readColor`` of the book it touches. Returns ``G``.
    """
    for name in getAllRecommenders(books):
        G.add_node('r/' + name, color='orange', t='recommender', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getRecommenders(book):
            G.add_edge('r/' + name, book['id'], color=edgeColor)
    return G
def graphAddTopLists(G, books):
    """Add a yellow ``'topList'`` node per top list and link it to its books.

    Top-list nodes are keyed ``'tl/<name>'``. The prefix is deliberately
    different from the ``'t/'`` prefix used for tag nodes, so that a plain
    tag and a top list with the same name stay two separate nodes instead of
    overwriting each other's attributes.

    Each edge carries ``wheight`` (from ``getTopListWheight``) and is
    coloured by ``readColor`` of the book it touches. Returns ``G``.
    """
    for name in getAllTopLists(books):
        G.add_node('tl/' + name, color='yellow', t='topList', label=name)
    for book in books:
        for name in getTopLists(book):
            G.add_edge(
                'tl/' + name,
                book['id'],
                wheight=getTopListWheight(book, name),
                color=readColor(book),
            )
    return G
def graphAddSeries(G, books):
    """Add a red ``'series'`` node per series and link it to its books.

    Series nodes are keyed ``'s/<name>'``. Books without a ``'series'`` key
    get no edge. Each edge is coloured by ``readColor`` of the book it
    touches. Returns ``G``.
    """
    for name in getAllSeries(books):
        G.add_node('s/' + name, color='red', t='series', label=name)
    for book in books:
        if 'series' not in book:
            continue
        G.add_edge('s/' + book['series'], book['id'], color=readColor(book))
    return G
def graphAddTags(G, books):
    """Add a gray ``'tag'`` node per plain tag and link it to its books.

    Tag nodes are keyed ``'t/<tag>'``. Each edge is coloured by
    ``readColor`` of the book it touches. Returns ``G``.
    """
    for name in getAllTags(books):
        G.add_node('t/' + name, color='gray', t='tag', label=name)
    for book in books:
        edgeColor = readColor(book)
        for name in getTags(book):
            G.add_edge('t/' + name, book['id'], color=edgeColor)
    return G
def calcRecDist(G, books):
    """Fit a normal distribution to all known book ratings.

    The rating of each book is read from its graph node (keyed by the book's
    ``'id'``); ``None`` ratings are skipped. Returns the ``(mean, std)`` pair
    produced by ``norm.fit``.
    """
    allRatings = (G.nodes[book['id']]['rating'] for book in books)
    knownRatings = [rating for rating in allRatings if rating is not None]
    return norm.fit(knownRatings)
def scaleBooksByRating(G):
    """Set each node's ``'value'`` from its rating, or else from its score.

    ``value`` is ``20 + 5 * int(rating)`` when a rating is present,
    ``20 + 5 * int(score)`` when only a score is present, and ``15``
    otherwise. The list of exempt node types is empty, so despite the name
    this is applied to nodes of every type.
    """
    exemptTypes = []
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] in exemptTypes:
            continue
        rating = node.get('rating')
        score = node.get('score')
        if rating is not None:
            node['value'] = 20 + 5 * int(rating)
        elif score is not None:
            node['value'] = 20 + 5 * int(score)
        else:
            node['value'] = 15
def scaleOpinionsByRating(G):
    """Set ``'value'`` on topList, recommender, author and series nodes.

    ``value`` is ``20 + 5 * int(score)`` when the node has a non-``None``
    score and ``20`` otherwise. Book and tag nodes are not touched.
    """
    opinionTypes = ['topList', 'recommender', 'author', 'series']
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] not in opinionTypes:
            continue
        score = node.get('score')
        node['value'] = 20 if score is None else 20 + 5 * int(score)
def addScoreToLabels(G):
    """Append the rating or estimated score to every non-tag node's label.

    The suffix is ``' (<rating>)'`` when a rating is present,
    ``' (~<score>)'`` with two decimals when only a score is present, and
    ``' (~0)'`` when the node has neither.
    """
    for key in list(G.nodes):
        node = G.nodes[key]
        if node['t'] in ['tag']:
            continue
        rating = node.get('rating')
        score = node.get('score')
        if rating is not None:
            suffix = " (" + str(rating) + ")"
        elif score is not None:
            suffix = " (~{:.2f}".format(score) + ")"
        else:
            suffix = " (~0)"
        node['label'] += suffix
def genAndShowHTML(G, showButtons=False):
    """Load ``G`` into a pyvis ``Network`` and show it as ``nx.html``.

    When ``showButtons`` is true, the configure, layout, interaction,
    physics and edges control panels are enabled before the graph is loaded.
    """
    net = Network('1080px', '1920px')
    if showButtons:
        panels = ['configure', 'layout', 'interaction', 'physics', 'edges']
        net.show_buttons(filter_=panels)
    net.from_nx(G)
    net.show('nx.html')
def buildFullGraph():
    """Load all books and build the complete graph around them.

    Book nodes are created first; author, recommender, top-list, series and
    tag nodes are then added in that order. Returns ``(G, books)``.
    """
    books = loadBooksFromDB()
    G = buildBookGraph(books)
    layerBuilders = (
        graphAddAuthors,
        graphAddRecommenders,
        graphAddTopLists,
        graphAddSeries,
        graphAddTags,
    )
    for addLayer in layerBuilders:
        addLayer(G, books)
    return G, books
def genScores(G, books):
    """Score all opinion nodes and unread books of ``G``.

    The global rating distribution from ``calcRecDist`` is passed to
    ``scoreOpinions`` and then ``scoreUnread``. Returns that distribution as
    ``(mean, std)``.
    """
    ratingMu, ratingStd = calcRecDist(G, books)
    scoreOpinions(G, ratingMu, ratingStd)
    scoreUnread(G, ratingMu, ratingStd)
    return ratingMu, ratingStd
def recommendNBooks(n):
    """Build, score and prune the graph down to ``n`` recommended books.

    The filter steps run in a fixed order, because each one works on what the
    previous steps left over. Afterwards the best list is printed and the
    graph is rendered via ``genAndShowHTML``.
    """
    G, books = buildFullGraph()
    ratingMu, ratingStd = genScores(G, books)

    # Coarse pass: drop clearly unsuitable nodes and keep a generous shortlist.
    lenientCutoff = ratingMu - ratingStd - 1.5
    shortlistSize = int(n*2) + 5
    removeRestOfSeries(G)
    removePriv(G)
    removeBad(G, lenientCutoff)
    removeKeepBest(G, shortlistSize, maxDistForRead=1.5)
    removeEdge(G)
    removeHighSpanTags(G, 9)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 4)

    # Fine pass: only books scoring at least the global mean, then the top n.
    removeBad(G, ratingMu, groups=['book'])
    pruneTags(G, 3)
    removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=0.75)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
    printBestList(G, num=n)
    genAndShowHTML(G)
def fullGraph():
    """Build and score the whole library graph, lightly pruned, and show it.

    Read and unread books are both kept. The 100 best-scored books are
    printed and the graph is rendered via ``genAndShowHTML``.
    """
    G, books = buildFullGraph()
    genScores(G, books)

    # Pruning.
    removePriv(G)
    removeEdge(G)
    removeHighSpanTags(G, 7)
    removeDangling(G, alsoBooks=False)
    removeTopLists(G)
    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
    printBestList(G, num=100)
    genAndShowHTML(G)
def readBooksAnalysis():
    """Build and score the graph, then show only the books already rated.

    Unrated books are removed after scoring. ``printBestList`` is called
    with ``num=100`` and the graph is rendered via ``genAndShowHTML``.
    """
    G, books = buildFullGraph()
    genScores(G, books)

    # Pruning.
    removePriv(G)
    removeUnread(G)
    removeEdge(G)
    removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=False)
    removeTopLists(G)
    pruneTags(G, 8)

    # Presentation.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
    printBestList(G, num=100)
    genAndShowHTML(G)
# Script entry point: running this file directly recommends 30 books.
if __name__ == "__main__":
    recommendNBooks(30)