Integrated MRB-DB and added new-book-finder (from MRB)

This commit is contained in:
Dominik Moritz Roth 2022-02-06 17:59:21 +01:00
parent 7c168f3532
commit a2d747e41e
2 changed files with 3122 additions and 13 deletions

View File

@ -7,6 +7,7 @@ import copy
import random
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
@ -27,16 +28,20 @@ def getAuthors(book):
def getRecommenders(book):
recs = set()
for tag in book['tags']:
if tag.find(" Recommendation") != -1:
yield tag.replace(" Recommendation", "")
recs.add(tag.replace(" Recommendation", ""))
elif tag.find("s Literature Club") != -1:
yield tag.replace("s Literature Club", "")
recs.add(tag.replace("s Literature Club", ""))
elif tag.find(":MRB") != -1:
recs.add(tag.replace(":MRB", ""))
return list(recs)
def getTags(book):
for tag in book['tags']:
if tag.find(" Recommendation") == -1 and tag.find("s Literature Club") == -1 and tag.find(" Top ") == -1:
if tag.find(" Recommendation") == -1 and tag.find("s Literature Club") == -1 and tag.find(" Top ") == -1 and tag.find(":MRB") == -1:
yield tag
@ -135,6 +140,12 @@ def removeDangling(G, alsoBooks=False):
if not len(G.adj[n]):
G.remove_node(n)
def removeThinRecs(G, minCons=3):
    """Remove recommender nodes that have fewer than minCons connections."""
    for name in list(G.nodes):
        data = G.nodes[name]
        if data['t'] != 'recommender':
            continue
        if len(G.adj[name]) < minCons:
            G.remove_node(name)
def removeEdge(G):
for n in list(G.nodes):
@ -152,11 +163,11 @@ def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author',
G.remove_node(n)
def removeKeepBest(G, num, maxDistForRead=1):
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
bestlist = []
for n in list(G.nodes):
node = G.nodes[n]
if node['t'] == 'book':
if node['t'] == forType:
if 'score' in node and node['score'] != None:
bestlist.append(node)
bestlist.sort(key=lambda node: node['score'], reverse=True)
@ -164,7 +175,7 @@ def removeKeepBest(G, num, maxDistForRead=1):
for n in list(G.nodes):
node = G.nodes[n]
if node['t'] == 'book' and node not in bestlist or 'score' in node and node['score'] == None:
if node['t'] == forType and node not in bestlist or 'score' in node and node['score'] == None:
if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead:
G.remove_node(n)
@ -343,7 +354,7 @@ def removeUselessReadBooks(G):
contacts += 1
for cousin in G.adj[adj]:
cousinNode = G.nodes[cousin]
if cousinNode['t']=='book' and 'score' in cousinNode:
if cousinNode['t']=='book' and 'score' in cousinNode or cousinNode['t'] == 'newBook':
if adjNode['t']=='recommender':
force += 0.5
else:
@ -456,8 +467,28 @@ def readColor(book):
else:
return 'gray'
def loadBooksFromDB():
    """Load the calibre library and enrich each book with MRB recommendation tags."""
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    return library
def mrbGetBook(mrbdf, title, authors):
    """Find a book's row in the MRB dataframe by title and author.

    Filters rows whose title contains *title* (literal match), then
    narrows those candidates by every author-name fragment of length
    >= 3.  Returns the first surviving row as a dict, or False when
    nothing matches.
    """
    # regex=False: titles and names are literal text, not regex patterns
    # (titles with '(', '+', '?' etc. would otherwise break the match);
    # na=False: missing CSV values must not poison the boolean mask.
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for author in authors:
        for part in author.split(" "):
            if len(part) >= 3:
                # Narrow the existing candidate set.  (The original code
                # filtered the full dataframe here, which discarded the
                # title match and could return an unrelated book.)
                pot = pot[pot['author'].str.contains(part, regex=False, na=False)]
    return pot.to_dict(orient='records')[0] if len(pot) else False
def infuseDataFromMRB(books):
    """Tag each library book with its MRB recommenders (as '<name>:MRB' tags)."""
    mrbdf = pd.read_csv('mrb_db.csv')
    for book in books:
        match = mrbGetBook(mrbdf, book['title'], book['authors'])
        if not match:
            continue
        # recommender column is '|'-separated; str() guards non-string cells
        for rec in str(match['recommender']).split('|'):
            book['tags'] += [rec + ':MRB']
def loadBooksFromCalibreDB():
    """Dump the full calibre library as parsed JSON via the calibredb CLI."""
    raw = os.popen("calibredb list --for-machine -f all").read()
    return json.loads(raw)
def remove_html_tags(text):
@ -589,7 +620,7 @@ def scaleBooksByRating(G):
node['value'] = 20 + 5 * int(node['rating'])
else:
if 'score' in node and node['score'] != None:
node['value'] = 20 + 5 * int(node['score'])
node['value'] = 20 + int(5 * float(node['score']))
else:
node['value'] = 15
@ -607,7 +638,7 @@ def scaleOpinionsByRating(G):
def addScoreToLabels(G):
for n in list(G.nodes):
node = G.nodes[n]
if node['t'] not in ['tag']:
if node['t'] not in ['tag', 'newBook']:
if 'rating' in node and node['rating'] != None:
node['label'] += " ("+str(node['rating'])+")"
else:
@ -710,18 +741,20 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
removeRestOfSeries(G)
removeBad(G, mu-std-0.5)
removeBad(G, mu+std/2, groups=['recommender'])
removeThinRecs(G, 3)
removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
removeEdge(G)
removeHighSpanTags(G, 8)
removeHighSpanReadBooks(G, 14)
removeDangling(G, alsoBooks=False)
pruneRecommenders(G, 16)
pruneRecommenders(G, 12)
removeThinRecs(G, 3)
pruneTags(G, 9)
removeBad(G, mu, groups=['book'])
removeUselessReadBooks(G)
pruneTags(G, 8)
pruneAuthorCons(G, int(n/5)+3)
pruneRecommenders(G, 16 - min(4, n/20))
pruneRecommenders(G, 12 - min(4, n/20))
removeUselessSeries(G, mu)
removeUselessTags(G)
if removeTopListsB:
@ -734,6 +767,7 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
removeUselessTags(G)
removeUselessReadBooks(G)
removeKeepBest(G, n, maxDistForRead=1.25)
removeThinRecs(G, 2)
scaleBooksByRating(G)
scaleOpinionsByRating(G)
@ -762,7 +796,6 @@ def fullGraph(G, removeTopListsB=True):
scaleOpinionsByRating(G)
addScoreToLabels(G)
def recommenderCompetence(G):
#removeRead(G)
removeUnread(G)
@ -907,6 +940,67 @@ def shell(G, books, mu, std):
from ptpython.repl import embed
embed(globals(), locals())
def newBooks(G, books, num, mu, std):
    """Reduce G to a styled graph of promising not-yet-owned books.

    Pulls candidate new books from the MRB data (findNewBooks), then strips
    read/owned-library structure so only the new-book subgraph remains.
    Mutates G in place.

    num: number of new books to keep.
    mu, std: presumably the mean/std of the library's book scores
             (used as score thresholds) — TODO confirm against caller.
    """
    removeBad(G, mu+std)
    findNewBooks(G, books, num, minRecSco = mu-std)
    removeUnread(G)
    removeUselessReadBooks(G)
    removeTags(G)
    removeTopLists(G)
    removeSeries(G)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    # visual styling for the rendered graph
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def findNewBooks(G, books, num, minRecSco=5):
    """Add 'newBook' nodes for MRB-recommended books not in the library.

    For every scored recommender node, looks up its recommendations in
    mrb_db.csv, splits them into already-owned vs. new titles, then adds
    graph nodes/edges for the new ones.  New books endorsed by fewer than
    two recommenders are dropped; the rest get an averaged score and a
    pooled uncertainty, and only the best *num* are kept.

    NOTE(review): the minRecSco parameter is never used in this body,
    although callers pass mu-std for it — TODO wire it into the filtering
    or drop it.
    """
    # Only keep credible recommenders before mining their MRB rows.
    removeBad(G, 0.1, groups=['recommender'])
    removeThinRecs(G, 2)
    mrbdf = pd.read_csv('mrb_db.csv')
    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            newBooks = []
            # NOTE(review): label is used as a regex pattern here; labels
            # containing regex metacharacters would misbehave — confirm.
            recBooks = mrbdf[mrbdf['recommender'].str.contains(node['label'])].to_dict(orient='records')
            for book in recBooks:
                if book['title'] in [b['title'] for b in books]:
                    oldBooks.append({'title': book['title'], 'author': book['author']})
                else:
                    newBooks.append({'title': book['title'], 'author': book['author']})
            recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks})
    # Materialize new-book, recommender and author nodes plus their edges.
    for rec in recs:
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'])
            G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score'])
            G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue')
            G.add_node('a/'+book['author'], color='green', t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')
    # Score each new book from its adjacent recommenders.
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'newBook':
            ses = []
            scores = []
            for m in list(G.adj[n]):
                adj = G.nodes[m]
                if adj['t'] == 'recommender':
                    scores.append(adj['score'])
                    ses.append(adj['se'])
            if len(scores) < 2:
                # require at least two independent endorsements
                G.remove_node(n)
            else:
                # mean score with a small penalty shrinking as endorsements grow
                node['score'] = sum(scores)/len(scores) - 0.1/math.sqrt(len(scores))
                node['fake_se'] = sum(ses)/(len(ses)*1.2) # crude pooled-uncertainty heuristic, not a real standard error
                node['value'] = 20 + 5 * float(node['score'])
                node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
    # positional args: num best kept, maxDistForRead=10, forType='newBook'
    removeKeepBest(G, num, 10, 'newBook')
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
# but it might be necessary to enable it later for a larger library for better training performance...
# maybe try again for 128 books?
@ -1039,6 +1133,7 @@ def cliInterface():
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
p_rec.add_argument('--tag-based', action="store_true")
p_rec.add_argument('--recommender-based', action="store_true")
p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend')
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend')
@ -1064,6 +1159,9 @@ def cliInterface():
p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
p_new.add_argument('-n', type=int, default=10, help='number of books to recommend')
p_full = cmds.add_parser('full', description="TODO", aliases=[])
args = parser.parse_args()
@ -1078,7 +1176,12 @@ def cliInterface():
if not args.keep_whitepapers:
removeWhitepapers(G)
if args.cmd=="recommend":
if args.new==-1:
args.new = int(args.n / 5)
if args.new != 0:
findNewBooks(G, books, args.new, minRecSco = mu-std)
if args.tag_based:
if args.recommender_based:
raise Exception('tag-based and recommender-based can not be be combined')
@ -1105,6 +1208,8 @@ def cliInterface():
elif args.cmd=="progress":
progress(G, args.m)
return
elif args.cmd=="newBooks":
newBooks(G, books, args.n, mu, std)
else:
raise Exception("Bad")

3004
mrb_db.csv Normal file

File diff suppressed because it is too large Load Diff