Integrated MRB-DB and added new-book-finder (from MRB)
This commit is contained in:
parent
7c168f3532
commit
a2d747e41e
131
caliGraph.py
131
caliGraph.py
@ -7,6 +7,7 @@ import copy
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.stats import norm
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
@ -27,16 +28,20 @@ def getAuthors(book):
|
||||
|
||||
|
||||
def getRecommenders(book):
    """Return the distinct recommender names encoded in a book's tags.

    A tag names a recommender when it contains one of the markers
    " Recommendation", "s Literature Club" or ":MRB"; the name is the tag
    with that marker removed. Only the first matching marker (in that
    order) is applied to a tag.

    Args:
        book: mapping with a 'tags' entry holding an iterable of strings.

    Returns:
        A list of unique recommender names, in no particular order.
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    recs = set()
    for tag in book['tags']:
        for marker in markers:
            if marker in tag:
                recs.add(tag.replace(marker, ""))
                # Mirror the if/elif chain: stop at the first marker found.
                break
    # A set is used so a recommender tagged through several sources
    # (e.g. both "X Recommendation" and "X:MRB") is reported only once.
    return list(recs)
|
||||
|
||||
|
||||
def getTags(book):
    """Yield the tags of a book that carry none of the special markers.

    Tags containing " Recommendation", "s Literature Club", " Top " or
    ":MRB" are skipped; every other tag is yielded in its original order.

    Args:
        book: mapping with a 'tags' entry holding an iterable of strings.
    """
    markers = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for tag in book['tags']:
        if not any(marker in tag for marker in markers):
            yield tag
|
||||
|
||||
|
||||
@ -135,6 +140,12 @@ def removeDangling(G, alsoBooks=False):
|
||||
if not len(G.adj[n]):
|
||||
G.remove_node(n)
|
||||
|
||||
def removeThinRecs(G, minCons=3):
    """Drop every recommender node that has fewer than `minCons` neighbours.

    Nodes are visited over a snapshot of `G.nodes`, and each one is checked
    and removed in turn, so the graph may be mutated while looping.
    """
    for name in list(G.nodes):
        if G.nodes[name]['t'] != 'recommender':
            continue
        if len(G.adj[name]) < minCons:
            G.remove_node(name)
|
||||
|
||||
def removeEdge(G):
|
||||
for n in list(G.nodes):
|
||||
@ -152,11 +163,11 @@ def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author',
|
||||
G.remove_node(n)
|
||||
|
||||
|
||||
def removeKeepBest(G, num, maxDistForRead=1):
|
||||
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
|
||||
bestlist = []
|
||||
for n in list(G.nodes):
|
||||
node = G.nodes[n]
|
||||
if node['t'] == 'book':
|
||||
if node['t'] == forType:
|
||||
if 'score' in node and node['score'] != None:
|
||||
bestlist.append(node)
|
||||
bestlist.sort(key=lambda node: node['score'], reverse=True)
|
||||
@ -164,7 +175,7 @@ def removeKeepBest(G, num, maxDistForRead=1):
|
||||
|
||||
for n in list(G.nodes):
|
||||
node = G.nodes[n]
|
||||
if node['t'] == 'book' and node not in bestlist or 'score' in node and node['score'] == None:
|
||||
if node['t'] == forType and node not in bestlist or 'score' in node and node['score'] == None:
|
||||
if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead:
|
||||
G.remove_node(n)
|
||||
|
||||
@ -343,7 +354,7 @@ def removeUselessReadBooks(G):
|
||||
contacts += 1
|
||||
for cousin in G.adj[adj]:
|
||||
cousinNode = G.nodes[cousin]
|
||||
if cousinNode['t']=='book' and 'score' in cousinNode:
|
||||
if cousinNode['t']=='book' and 'score' in cousinNode or cousinNode['t'] == 'newBook':
|
||||
if adjNode['t']=='recommender':
|
||||
force += 0.5
|
||||
else:
|
||||
@ -456,8 +467,28 @@ def readColor(book):
|
||||
else:
|
||||
return 'gray'
|
||||
|
||||
|
||||
def loadBooksFromDB():
    """Load the Calibre library and enrich it with MRB recommender tags.

    Returns the book list produced by loadBooksFromCalibreDB() after
    infuseDataFromMRB() has processed it in place.
    """
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    return library
|
||||
|
||||
def mrbGetBook(mrbdf, title, authors):
    """Look up a book in the MRB table by title and author name parts.

    Rows are first narrowed to those whose 'title' contains `title`, then
    narrowed further so that 'author' contains every name part (split on
    spaces) of at least three characters. All matching is literal and
    case-sensitive.

    Args:
        mrbdf: DataFrame with at least 'title' and 'author' columns.
        title: title text to search for.
        authors: iterable of author names; a single string is treated as
            one author.

    Returns:
        The first matching row as a dict, or False when nothing matches.
    """
    if isinstance(authors, str):
        # Iterating a bare string yields single characters, none of which
        # pass the length filter below, silently disabling the author check.
        authors = [authors]
    # regex=False: titles such as "C++ (2nd ed." must not be parsed as regexes.
    # na=False: empty CSV cells (NaN) simply count as "no match".
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for author in authors:
        for part in author.split(" "):
            if len(part) >= 3:
                # Narrow the current candidates; filtering the full table
                # here would throw away the title match.
                pot = pot[pot['author'].str.contains(part, regex=False, na=False)]
    return pot.to_dict(orient='records')[0] if len(pot) else False
|
||||
|
||||
def infuseDataFromMRB(books):
    """Tag each book found in 'mrb_db.csv' with its MRB recommenders.

    For every book that mrbGetBook() can match, one '<name>:MRB' tag is
    appended to book['tags'] per '|'-separated entry of the matched row's
    'recommender' field. The books are modified in place.

    Args:
        books: iterable of dicts with 'title', 'authors' and a list under
            'tags'.
    """
    mrbdf = pd.read_csv('mrb_db.csv')
    for book in books:
        mrb = mrbGetBook(mrbdf, book['title'], book['authors'])
        if not mrb:
            continue
        recommenders = mrb['recommender']
        if pd.isna(recommenders):
            # An empty CSV cell is read as NaN; str(NaN) would otherwise
            # produce a bogus 'nan:MRB' tag.
            continue
        # Skip empty fragments (e.g. from a trailing '|') so no bare ':MRB'
        # tag is created.
        book['tags'] += [rec + ':MRB' for rec in str(recommenders).split('|') if rec]
|
||||
|
||||
def loadBooksFromCalibreDB():
    """Return the parsed JSON output of `calibredb list --for-machine -f all`.

    Raises:
        json.JSONDecodeError: if the command's output is not valid JSON
            (for instance when calibredb is missing or fails).
    """
    # The context manager closes the pipe deterministically instead of
    # leaving that to garbage collection.
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.loads(pipe.read())
|
||||
|
||||
def remove_html_tags(text):
|
||||
@ -589,7 +620,7 @@ def scaleBooksByRating(G):
|
||||
node['value'] = 20 + 5 * int(node['rating'])
|
||||
else:
|
||||
if 'score' in node and node['score'] != None:
|
||||
node['value'] = 20 + 5 * int(node['score'])
|
||||
node['value'] = 20 + int(5 * float(node['score']))
|
||||
else:
|
||||
node['value'] = 15
|
||||
|
||||
@ -607,7 +638,7 @@ def scaleOpinionsByRating(G):
|
||||
def addScoreToLabels(G):
|
||||
for n in list(G.nodes):
|
||||
node = G.nodes[n]
|
||||
if node['t'] not in ['tag']:
|
||||
if node['t'] not in ['tag', 'newBook']:
|
||||
if 'rating' in node and node['rating'] != None:
|
||||
node['label'] += " ("+str(node['rating'])+")"
|
||||
else:
|
||||
@ -710,18 +741,20 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
|
||||
removeRestOfSeries(G)
|
||||
removeBad(G, mu-std-0.5)
|
||||
removeBad(G, mu+std/2, groups=['recommender'])
|
||||
removeThinRecs(G, 3)
|
||||
removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
|
||||
removeEdge(G)
|
||||
removeHighSpanTags(G, 8)
|
||||
removeHighSpanReadBooks(G, 14)
|
||||
removeDangling(G, alsoBooks=False)
|
||||
pruneRecommenders(G, 16)
|
||||
pruneRecommenders(G, 12)
|
||||
removeThinRecs(G, 3)
|
||||
pruneTags(G, 9)
|
||||
removeBad(G, mu, groups=['book'])
|
||||
removeUselessReadBooks(G)
|
||||
pruneTags(G, 8)
|
||||
pruneAuthorCons(G, int(n/5)+3)
|
||||
pruneRecommenders(G, 16 - min(4, n/20))
|
||||
pruneRecommenders(G, 12 - min(4, n/20))
|
||||
removeUselessSeries(G, mu)
|
||||
removeUselessTags(G)
|
||||
if removeTopListsB:
|
||||
@ -734,6 +767,7 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
|
||||
removeUselessTags(G)
|
||||
removeUselessReadBooks(G)
|
||||
removeKeepBest(G, n, maxDistForRead=1.25)
|
||||
removeThinRecs(G, 2)
|
||||
|
||||
scaleBooksByRating(G)
|
||||
scaleOpinionsByRating(G)
|
||||
@ -762,7 +796,6 @@ def fullGraph(G, removeTopListsB=True):
|
||||
scaleOpinionsByRating(G)
|
||||
addScoreToLabels(G)
|
||||
|
||||
|
||||
def recommenderCompetence(G):
|
||||
#removeRead(G)
|
||||
removeUnread(G)
|
||||
@ -907,6 +940,67 @@ def shell(G, books, mu, std):
|
||||
from ptpython.repl import embed
|
||||
embed(globals(), locals())
|
||||
|
||||
def newBooks(G, books, num, mu, std):
    """Reduce G to a graph of suggested new books.

    Runs removeBad with threshold mu+std, lets findNewBooks add up to `num`
    candidates (passing mu-std as minRecSco), applies the pruning passes in
    a fixed order, and finally scales and labels the remaining nodes.
    """
    removeBad(G, mu+std)
    findNewBooks(G, books, num, minRecSco = mu-std)

    # Single-argument pruning passes; the order is significant.
    for prune in (removeUnread, removeUselessReadBooks, removeTags,
                  removeTopLists, removeSeries, removeEdge):
        prune(G)
    removeDangling(G, alsoBooks=True)

    # Presentation passes on whatever survived the pruning.
    for decorate in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        decorate(G)
|
||||
|
||||
|
||||
def findNewBooks(G, books, num, minRecSco=5):
    """Add books from 'mrb_db.csv' that are not in the library to the graph.

    Steps:
      1. Prune recommenders (removeBad with threshold 0.1, removeThinRecs
         with a minimum of 2 connections).
      2. For every remaining scored recommender, collect the MRB rows whose
         'recommender' field contains the recommender's label and split
         them into already-known and new titles.
      3. Add each new title as a 'newBook' node, linked to its recommender
         and (when available) its author.
      4. Score every 'newBook' node from its adjacent recommenders; nodes
         with fewer than two recommenders are removed.
      5. Keep the best `num` new books via removeKeepBest.

    Args:
        G: graph object offering nodes/adj/add_node/add_edge/remove_node.
        books: iterable of dicts with a 'title' entry (the known library).
        num: number of new books to keep.
        minRecSco: currently unused; the recommender cut-off is the
            hard-coded 0.1 below. NOTE(review): confirm whether this
            parameter was meant to replace that constant.
    """
    removeBad(G, 0.1, groups=['recommender'])
    removeThinRecs(G, 2)
    mrbdf = pd.read_csv('mrb_db.csv')

    # Build the title lookup once instead of once per candidate row.
    knownTitles = {b['title'] for b in books}

    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            newBooks = []
            # Literal, NaN-safe substring match: labels are names, not regexes,
            # and empty 'recommender' cells must not break the boolean mask.
            matches = mrbdf['recommender'].str.contains(node['label'], regex=False, na=False)
            for book in mrbdf[matches].to_dict(orient='records'):
                entry = {'title': book['title'], 'author': book['author']}
                if book['title'] in knownTitles:
                    oldBooks.append(entry)
                else:
                    newBooks.append(entry)
            recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks})

    for rec in recs:
        recLabel = rec['rec']['label']
        recKey = 'r/' + recLabel
        for book in rec['newBooks']:
            title = book['title']
            author = book['author']
            if not isinstance(title, str):
                # Empty CSV cell (NaN): there is nothing to name the node after.
                continue
            bookKey = 'n/' + title
            G.add_node(bookKey, color='blue', t='newBook', label=title)

            G.add_node(recKey, color='orange', t='recommender', label=recLabel, score=rec['rec']['score'])
            G.add_edge(recKey, bookKey, color='blue')

            if isinstance(author, str):
                # Rows without an author still get their book node, just
                # no author node.
                authorKey = 'a/' + author
                G.add_node(authorKey, color='green', t='author', label=author)
                G.add_edge(authorKey, bookKey, color='blue')

    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'newBook':
            continue
        scores = []
        ses = []
        for m in G.adj[n]:
            neighbour = G.nodes[m]
            if neighbour['t'] == 'recommender':
                scores.append(neighbour['score'])
                # NOTE(review): assumes every recommender node carries an
                # 'se' attribute - confirm for nodes created above.
                ses.append(neighbour['se'])
        if len(scores) < 2:
            # A single recommender is not considered enough evidence.
            G.remove_node(n)
        else:
            # Mean recommender score with a small penalty that shrinks as
            # more recommenders agree.
            node['score'] = sum(scores)/len(scores) - 0.1/math.sqrt(len(scores))
            # Heuristic uncertainty for display only; not a real standard error.
            node['fake_se'] = sum(ses)/(len(ses)*1.2)
            node['value'] = 20 + 5 * float(node['score'])
            node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
    removeKeepBest(G, num, 10, 'newBook')
|
||||
|
||||
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
|
||||
# but might be necessary to enable later for a larger library for better training performance...
|
||||
# maybe try again for 128 books?
|
||||
@ -1039,6 +1133,7 @@ def cliInterface():
|
||||
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
|
||||
p_rec.add_argument('--tag-based', action="store_true")
|
||||
p_rec.add_argument('--recommender-based', action="store_true")
|
||||
p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend')
|
||||
|
||||
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
|
||||
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend')
|
||||
@ -1064,6 +1159,9 @@ def cliInterface():
|
||||
|
||||
p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
|
||||
|
||||
p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
|
||||
p_new.add_argument('-n', type=int, default=10, help='number of books to recommend')
|
||||
|
||||
p_full = cmds.add_parser('full', description="TODO", aliases=[])
|
||||
|
||||
args = parser.parse_args()
|
||||
@ -1078,7 +1176,12 @@ def cliInterface():
|
||||
if not args.keep_whitepapers:
|
||||
removeWhitepapers(G)
|
||||
|
||||
|
||||
if args.cmd=="recommend":
|
||||
if args.new==-1:
|
||||
args.new = int(args.n / 5)
|
||||
if args.new != 0:
|
||||
findNewBooks(G, books, args.new, minRecSco = mu-std)
|
||||
if args.tag_based:
|
||||
if args.recommender_based:
|
||||
raise Exception('tag-based and recommender-based can not be be combined')
|
||||
@ -1105,6 +1208,8 @@ def cliInterface():
|
||||
elif args.cmd=="progress":
|
||||
progress(G, args.m)
|
||||
return
|
||||
elif args.cmd=="newBooks":
|
||||
newBooks(G, books, args.n, mu, std)
|
||||
else:
|
||||
raise Exception("Bad")
|
||||
|
||||
|
3004
mrb_db.csv
Normal file
3004
mrb_db.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user