Integrated MRB-DB and added new-book-finder (from MRB)

This commit is contained in:
Dominik Moritz Roth 2022-02-06 17:59:21 +01:00
parent 7c168f3532
commit a2d747e41e
2 changed files with 3122 additions and 13 deletions

View File

@ -7,6 +7,7 @@ import copy
import random import random
import numpy as np import numpy as np
import pandas as pd
from scipy.stats import norm from scipy.stats import norm
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -27,16 +28,20 @@ def getAuthors(book):
def getRecommenders(book):
    """Collect the distinct recommender names encoded in a book's tags.

    Recognizes three tag encodings: '<name> Recommendation',
    '<name>s Literature Club', and '<name>:MRB'. Returns a list of the
    unique names with the marker suffix stripped.
    """
    found = set()
    for tag in book['tags']:
        if " Recommendation" in tag:
            found.add(tag.replace(" Recommendation", ""))
        elif "s Literature Club" in tag:
            found.add(tag.replace("s Literature Club", ""))
        elif ":MRB" in tag:
            found.add(tag.replace(":MRB", ""))
    return list(found)
def getTags(book): def getTags(book):
for tag in book['tags']: for tag in book['tags']:
if tag.find(" Recommendation") == -1 and tag.find("s Literature Club") == -1 and tag.find(" Top ") == -1: if tag.find(" Recommendation") == -1 and tag.find("s Literature Club") == -1 and tag.find(" Top ") == -1 and tag.find(":MRB") == -1:
yield tag yield tag
@ -135,6 +140,12 @@ def removeDangling(G, alsoBooks=False):
if not len(G.adj[n]): if not len(G.adj[n]):
G.remove_node(n) G.remove_node(n)
def removeThinRecs(G, minCons=3):
    """Remove recommender nodes that have fewer than minCons connections.

    Iterates over a snapshot of the node list so removal during
    iteration is safe.
    """
    for n in list(G.nodes):
        if G.nodes[n]['t'] == 'recommender' and len(G.adj[n]) < minCons:
            G.remove_node(n)
def removeEdge(G): def removeEdge(G):
for n in list(G.nodes): for n in list(G.nodes):
@ -152,11 +163,11 @@ def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author',
G.remove_node(n) G.remove_node(n)
def removeKeepBest(G, num, maxDistForRead=1): def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
bestlist = [] bestlist = []
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] == 'book': if node['t'] == forType:
if 'score' in node and node['score'] != None: if 'score' in node and node['score'] != None:
bestlist.append(node) bestlist.append(node)
bestlist.sort(key=lambda node: node['score'], reverse=True) bestlist.sort(key=lambda node: node['score'], reverse=True)
@ -164,7 +175,7 @@ def removeKeepBest(G, num, maxDistForRead=1):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] == 'book' and node not in bestlist or 'score' in node and node['score'] == None: if node['t'] == forType and node not in bestlist or 'score' in node and node['score'] == None:
if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead: if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead:
G.remove_node(n) G.remove_node(n)
@ -343,7 +354,7 @@ def removeUselessReadBooks(G):
contacts += 1 contacts += 1
for cousin in G.adj[adj]: for cousin in G.adj[adj]:
cousinNode = G.nodes[cousin] cousinNode = G.nodes[cousin]
if cousinNode['t']=='book' and 'score' in cousinNode: if cousinNode['t']=='book' and 'score' in cousinNode or cousinNode['t'] == 'newBook':
if adjNode['t']=='recommender': if adjNode['t']=='recommender':
force += 0.5 force += 0.5
else: else:
@ -456,8 +467,28 @@ def readColor(book):
else: else:
return 'gray' return 'gray'
def loadBooksFromDB():
    """Load the calibre library and enrich each book with MRB recommender tags."""
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    return library
def mrbGetBook(mrbdf, title, authors):
    """Find the MRB row matching a book's title and authors.

    Narrows the MRB table first by title substring, then by every author
    name part of at least 3 characters. Returns the first surviving row
    as a dict, or False when nothing matches.
    """
    # regex=False: titles and names may contain regex metacharacters;
    # na=False: treat missing csv cells as non-matching instead of raising.
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for author in authors:
        for part in author.split(" "):
            if len(part) >= 3:
                # Bug fix: filter the already-narrowed candidates ('pot'),
                # not the full table — the original reassigned from 'mrbdf',
                # discarding the title filter and all prior author filters.
                pot = pot[pot['author'].str.contains(part, regex=False, na=False)]
    return pot.to_dict(orient='records')[0] if len(pot) else False
def infuseDataFromMRB(books):
    """Append '<name>:MRB' tags to each book found in the MRB csv.

    Looks up every book (by title + authors) in 'mrb_db.csv' and, for each
    match, adds one tag per '|'-separated recommender name.
    """
    mrbdf = pd.read_csv('mrb_db.csv')
    for book in books:
        mrb = mrbGetBook(mrbdf, book['title'], book['authors'])
        if not mrb:
            continue
        recommenders = mrb['recommender']
        # A missing csv cell comes back as float NaN; str() would turn it
        # into a bogus 'nan:MRB' tag, so skip those rows entirely.
        if pd.isna(recommenders):
            continue
        for rec in str(recommenders).split('|'):
            if rec:
                book['tags'] += [rec + ':MRB']
def loadBooksFromCalibreDB():
    """Return every book in the local calibre library as a list of dicts."""
    # 'calibredb list --for-machine' emits the whole library as one JSON array.
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.load(pipe)
def remove_html_tags(text): def remove_html_tags(text):
@ -589,7 +620,7 @@ def scaleBooksByRating(G):
node['value'] = 20 + 5 * int(node['rating']) node['value'] = 20 + 5 * int(node['rating'])
else: else:
if 'score' in node and node['score'] != None: if 'score' in node and node['score'] != None:
node['value'] = 20 + 5 * int(node['score']) node['value'] = 20 + int(5 * float(node['score']))
else: else:
node['value'] = 15 node['value'] = 15
@ -607,7 +638,7 @@ def scaleOpinionsByRating(G):
def addScoreToLabels(G): def addScoreToLabels(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] not in ['tag']: if node['t'] not in ['tag', 'newBook']:
if 'rating' in node and node['rating'] != None: if 'rating' in node and node['rating'] != None:
node['label'] += " ("+str(node['rating'])+")" node['label'] += " ("+str(node['rating'])+")"
else: else:
@ -710,18 +741,20 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
removeRestOfSeries(G) removeRestOfSeries(G)
removeBad(G, mu-std-0.5) removeBad(G, mu-std-0.5)
removeBad(G, mu+std/2, groups=['recommender']) removeBad(G, mu+std/2, groups=['recommender'])
removeThinRecs(G, 3)
removeKeepBest(G, int(n*2) + 5, maxDistForRead=2) removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
removeEdge(G) removeEdge(G)
removeHighSpanTags(G, 8) removeHighSpanTags(G, 8)
removeHighSpanReadBooks(G, 14) removeHighSpanReadBooks(G, 14)
removeDangling(G, alsoBooks=False) removeDangling(G, alsoBooks=False)
pruneRecommenders(G, 16) pruneRecommenders(G, 12)
removeThinRecs(G, 3)
pruneTags(G, 9) pruneTags(G, 9)
removeBad(G, mu, groups=['book']) removeBad(G, mu, groups=['book'])
removeUselessReadBooks(G) removeUselessReadBooks(G)
pruneTags(G, 8) pruneTags(G, 8)
pruneAuthorCons(G, int(n/5)+3) pruneAuthorCons(G, int(n/5)+3)
pruneRecommenders(G, 16 - min(4, n/20)) pruneRecommenders(G, 12 - min(4, n/20))
removeUselessSeries(G, mu) removeUselessSeries(G, mu)
removeUselessTags(G) removeUselessTags(G)
if removeTopListsB: if removeTopListsB:
@ -734,6 +767,7 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
removeUselessTags(G) removeUselessTags(G)
removeUselessReadBooks(G) removeUselessReadBooks(G)
removeKeepBest(G, n, maxDistForRead=1.25) removeKeepBest(G, n, maxDistForRead=1.25)
removeThinRecs(G, 2)
scaleBooksByRating(G) scaleBooksByRating(G)
scaleOpinionsByRating(G) scaleOpinionsByRating(G)
@ -762,7 +796,6 @@ def fullGraph(G, removeTopListsB=True):
scaleOpinionsByRating(G) scaleOpinionsByRating(G)
addScoreToLabels(G) addScoreToLabels(G)
def recommenderCompetence(G): def recommenderCompetence(G):
#removeRead(G) #removeRead(G)
removeUnread(G) removeUnread(G)
@ -907,6 +940,67 @@ def shell(G, books, mu, std):
from ptpython.repl import embed from ptpython.repl import embed
embed(globals(), locals()) embed(globals(), locals())
def newBooks(G, books, num, mu, std):
    """Reduce G to a view of newly discovered (not-yet-owned) book recommendations.

    mu/std are the library's score mean and standard deviation; they gate
    which recommenders/books survive. 'num' caps how many new books are kept.
    """
    # Drop clearly bad nodes first so weak recommenders don't inject books.
    removeBad(G, mu+std)
    # Pull unowned books from the MRB table into the graph as 'newBook' nodes.
    findNewBooks(G, books, num, minRecSco = mu-std)
    removeUnread(G)
    removeUselessReadBooks(G)
    # Strip structural noise so only the new-book neighborhood remains.
    removeTags(G)
    removeTopLists(G)
    removeSeries(G)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    # Visual post-processing for the rendered graph.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)
def findNewBooks(G, books, num, minRecSco=5):
    """Inject 'newBook' nodes for MRB-recommended books not in the library.

    For every scored recommender node, looks up its recommendations in
    'mrb_db.csv', adds graph nodes/edges for books whose title is not in
    'books', scores each new book from its recommenders, and finally keeps
    only the best 'num' of them.

    NOTE(review): 'minRecSco' is accepted but never read in this body —
    confirm whether the score gate was meant to use it.
    """
    # Only trustworthy, well-connected recommenders may propose new books.
    removeBad(G, 0.1, groups=['recommender'])
    removeThinRecs(G, 2)
    mrbdf = pd.read_csv('mrb_db.csv')
    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            newBooks = []
            # NOTE(review): label is used as a regex pattern here; names with
            # regex metacharacters would misbehave — confirm labels are plain.
            recBooks = mrbdf[mrbdf['recommender'].str.contains(node['label'])].to_dict(orient='records')
            for book in recBooks:
                # Split this recommender's picks into already-owned vs new.
                if book['title'] in [b['title'] for b in books]:
                    oldBooks.append({'title': book['title'], 'author': book['author']})
                else:
                    newBooks.append({'title': book['title'], 'author': book['author']})
            recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks})
    # Materialize the new books plus their recommender/author edges.
    for rec in recs:
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'])
            G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score'])
            G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue')
            G.add_node('a/'+book['author'], color='green', t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')
    # Score each new book from its adjacent recommenders.
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'newBook':
            ses = []
            scores = []
            for m in list(G.adj[n]):
                adj = G.nodes[m]
                if adj['t'] == 'recommender':
                    scores.append(adj['score'])
                    # assumes every scored recommender also carries 'se'
                    # (standard error) — TODO confirm.
                    ses.append(adj['se'])
            if len(scores) < 2:
                # Require at least two independent recommendations.
                G.remove_node(n)
            else:
                # Mean recommender score, lightly penalized for few sources.
                node['score'] = sum(scores)/len(scores) - 0.1/math.sqrt(len(scores))
                node['fake_se'] = sum(ses)/(len(ses)*1.2) # This is not how SE works. DILLIGAF?
                node['value'] = 20 + 5 * float(node['score'])
                node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
    # Keep only the top 'num' new books (wide maxDistForRead=10 keeps all scored).
    removeKeepBest(G, num, 10, 'newBook')
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
# but might be necessary to enable later for a larger library for better training performance...
# maybe try again for 128 books?
@ -1039,6 +1133,7 @@ def cliInterface():
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend') p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
p_rec.add_argument('--tag-based', action="store_true") p_rec.add_argument('--tag-based', action="store_true")
p_rec.add_argument('--recommender-based', action="store_true") p_rec.add_argument('--recommender-based', action="store_true")
p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend')
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls']) p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend') p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend')
@ -1064,6 +1159,9 @@ def cliInterface():
p_shell = cmds.add_parser('shell', description="TODO", aliases=[]) p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
p_new.add_argument('-n', type=int, default=10, help='number of books to recommend')
p_full = cmds.add_parser('full', description="TODO", aliases=[]) p_full = cmds.add_parser('full', description="TODO", aliases=[])
args = parser.parse_args() args = parser.parse_args()
@ -1078,7 +1176,12 @@ def cliInterface():
if not args.keep_whitepapers: if not args.keep_whitepapers:
removeWhitepapers(G) removeWhitepapers(G)
if args.cmd=="recommend": if args.cmd=="recommend":
if args.new==-1:
args.new = int(args.n / 5)
if args.new != 0:
findNewBooks(G, books, args.new, minRecSco = mu-std)
if args.tag_based: if args.tag_based:
if args.recommender_based: if args.recommender_based:
raise Exception('tag-based and recommender-based can not be be combined') raise Exception('tag-based and recommender-based can not be be combined')
@ -1105,6 +1208,8 @@ def cliInterface():
elif args.cmd=="progress": elif args.cmd=="progress":
progress(G, args.m) progress(G, args.m)
return return
elif args.cmd=="newBooks":
newBooks(G, books, args.n, mu, std)
else: else:
raise Exception("Bad") raise Exception("Bad")

3004
mrb_db.csv Normal file

File diff suppressed because it is too large Load Diff