Integrated MRB-DB and added new-book-finder (from MRB)
This commit is contained in:
parent
7c168f3532
commit
a2d747e41e
131
caliGraph.py
131
caliGraph.py
@ -7,6 +7,7 @@ import copy
|
|||||||
import random
|
import random
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from scipy.stats import norm
|
from scipy.stats import norm
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@ -27,16 +28,20 @@ def getAuthors(book):
|
|||||||
|
|
||||||
|
|
||||||
def getRecommenders(book):
    """Return the names of everyone who recommended *book*, parsed from its tags.

    Three tag shapes are recognised, checked in this order; the marker text
    is removed from the tag to obtain the name:

    * ``"<name> Recommendation"``
    * ``"<name>s Literature Club"``
    * ``"<name>:MRB"``

    Names are de-duplicated, so a recommender that shows up through several
    tags is reported once. The result is sorted so that its order does not
    depend on set iteration order, which is not stable between interpreter
    runs for strings.

    Args:
        book: mapping with a ``tags`` entry holding an iterable of strings.

    Returns:
        Sorted list of unique recommender names.
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    recs = set()
    for tag in book['tags']:
        for marker in markers:
            if marker in tag:
                recs.add(tag.replace(marker, ""))
                break  # only the first matching marker is applied to a tag
    return sorted(recs)
|
||||||
|
|
||||||
|
|
||||||
def getTags(book):
    """Yield the ordinary tags of *book*.

    Tags that contain one of the bookkeeping markers (" Recommendation",
    "s Literature Club", " Top " or ":MRB") are skipped; every other tag is
    yielded unchanged and in its original order.
    """
    special = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for candidate in book['tags']:
        if any(marker in candidate for marker in special):
            continue
        yield candidate
|
||||||
|
|
||||||
|
|
||||||
@ -135,6 +140,12 @@ def removeDangling(G, alsoBooks=False):
|
|||||||
if not len(G.adj[n]):
|
if not len(G.adj[n]):
|
||||||
G.remove_node(n)
|
G.remove_node(n)
|
||||||
|
|
||||||
|
def removeThinRecs(G, minCons=3):
    """Drop recommender nodes that have fewer than *minCons* neighbours.

    Nodes are visited over a snapshot of ``G.nodes`` and removed one at a
    time, so each degree check sees the effect of earlier removals.
    Nodes whose ``t`` attribute is not ``'recommender'`` are left alone.
    """
    for name in tuple(G.nodes):
        if G.nodes[name]['t'] != 'recommender':
            continue
        if len(G.adj[name]) < minCons:
            G.remove_node(name)
|
||||||
|
|
||||||
def removeEdge(G):
|
def removeEdge(G):
|
||||||
for n in list(G.nodes):
|
for n in list(G.nodes):
|
||||||
@ -152,11 +163,11 @@ def removeBad(G, threshold, groups=['book', 'topList', 'recommender', 'author',
|
|||||||
G.remove_node(n)
|
G.remove_node(n)
|
||||||
|
|
||||||
|
|
||||||
def removeKeepBest(G, num, maxDistForRead=1):
|
def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
|
||||||
bestlist = []
|
bestlist = []
|
||||||
for n in list(G.nodes):
|
for n in list(G.nodes):
|
||||||
node = G.nodes[n]
|
node = G.nodes[n]
|
||||||
if node['t'] == 'book':
|
if node['t'] == forType:
|
||||||
if 'score' in node and node['score'] != None:
|
if 'score' in node and node['score'] != None:
|
||||||
bestlist.append(node)
|
bestlist.append(node)
|
||||||
bestlist.sort(key=lambda node: node['score'], reverse=True)
|
bestlist.sort(key=lambda node: node['score'], reverse=True)
|
||||||
@ -164,7 +175,7 @@ def removeKeepBest(G, num, maxDistForRead=1):
|
|||||||
|
|
||||||
for n in list(G.nodes):
|
for n in list(G.nodes):
|
||||||
node = G.nodes[n]
|
node = G.nodes[n]
|
||||||
if node['t'] == 'book' and node not in bestlist or 'score' in node and node['score'] == None:
|
if node['t'] == forType and node not in bestlist or 'score' in node and node['score'] == None:
|
||||||
if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead:
|
if not 'rating' in node or node['rating'] == None or node['rating'] < bestlist[-1]['score']-maxDistForRead:
|
||||||
G.remove_node(n)
|
G.remove_node(n)
|
||||||
|
|
||||||
@ -343,7 +354,7 @@ def removeUselessReadBooks(G):
|
|||||||
contacts += 1
|
contacts += 1
|
||||||
for cousin in G.adj[adj]:
|
for cousin in G.adj[adj]:
|
||||||
cousinNode = G.nodes[cousin]
|
cousinNode = G.nodes[cousin]
|
||||||
if cousinNode['t']=='book' and 'score' in cousinNode:
|
if cousinNode['t']=='book' and 'score' in cousinNode or cousinNode['t'] == 'newBook':
|
||||||
if adjNode['t']=='recommender':
|
if adjNode['t']=='recommender':
|
||||||
force += 0.5
|
force += 0.5
|
||||||
else:
|
else:
|
||||||
@ -456,8 +467,28 @@ def readColor(book):
|
|||||||
else:
|
else:
|
||||||
return 'gray'
|
return 'gray'
|
||||||
|
|
||||||
|
|
||||||
def loadBooksFromDB():
    """Load the book list from Calibre and enrich it with MRB data.

    Returns whatever ``loadBooksFromCalibreDB`` produced, after it has been
    handed to ``infuseDataFromMRB`` (whose return value is not used).
    """
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    return library
|
||||||
|
|
||||||
|
def mrbGetBook(mrbdf, title, authors):
    """Look up a book in the MRB table by title and author names.

    A row matches when its ``title`` column contains *title* and its
    ``author`` column contains every space-separated part of every name in
    *authors* that is at least three characters long (shorter parts such as
    initials are ignored). All comparisons are literal, case-sensitive
    substring tests; cells with missing values never match.

    Args:
        mrbdf: DataFrame with at least ``title`` and ``author`` columns.
        title: title text to search for.
        authors: iterable of author names. A single string is treated as
            one name instead of being iterated character by character.

    Returns:
        The first matching row as a dict, or ``False`` when nothing matches.
    """
    if isinstance(authors, str):
        authors = [authors]
    # regex=False: titles and names routinely contain characters such as
    # "(", "+" or "?" that must not be interpreted as a pattern.
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for author in authors:
        for part in author.split(" "):
            if len(part) >= 3:
                # Narrow the current candidates; filtering mrbdf again here
                # would throw away the title match and earlier name parts.
                pot = pot[pot['author'].str.contains(part, regex=False, na=False)]
    return pot.to_dict(orient='records')[0] if len(pot) else False
|
||||||
|
|
||||||
|
def infuseDataFromMRB(books):
    """Tag each book with the recommenders listed for it in ``mrb_db.csv``.

    For every book that ``mrbGetBook`` finds in the MRB table, the matched
    row's ``recommender`` field is split on ``|`` and each name is appended
    to the book's ``tags`` as ``"<name>:MRB"``. The book dicts are modified
    in place; nothing is returned.

    Rows whose recommender field is missing or empty add no tags.

    Args:
        books: iterable of book dicts with ``title``, ``authors`` and
            ``tags`` entries.
    """
    mrbdf = pd.read_csv('mrb_db.csv')
    for book in books:
        mrb = mrbGetBook(mrbdf, book['title'], book['authors'])
        if not mrb:
            continue
        recommender = mrb['recommender']
        if pd.isna(recommender):
            # An empty CSV cell is read as NaN; str() would turn it into a
            # bogus "nan:MRB" tag.
            continue
        for rec in str(recommender).split('|'):
            if rec:  # skip empty names from "" or a stray "|"
                book['tags'] += [rec + ':MRB']
|
||||||
|
|
||||||
|
def loadBooksFromCalibreDB():
    """Return the Calibre library listing as parsed JSON.

    Runs ``calibredb list --for-machine -f all`` through a shell pipe and
    parses everything the command prints on stdout with ``json.loads``.

    Raises:
        json.JSONDecodeError: if the command's output is not valid JSON
            (for example when it printed nothing).
    """
    # The context manager closes the pipe once the output has been read,
    # instead of leaving it open until garbage collection.
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.loads(pipe.read())
|
||||||
|
|
||||||
def remove_html_tags(text):
|
def remove_html_tags(text):
|
||||||
@ -589,7 +620,7 @@ def scaleBooksByRating(G):
|
|||||||
node['value'] = 20 + 5 * int(node['rating'])
|
node['value'] = 20 + 5 * int(node['rating'])
|
||||||
else:
|
else:
|
||||||
if 'score' in node and node['score'] != None:
|
if 'score' in node and node['score'] != None:
|
||||||
node['value'] = 20 + 5 * int(node['score'])
|
node['value'] = 20 + int(5 * float(node['score']))
|
||||||
else:
|
else:
|
||||||
node['value'] = 15
|
node['value'] = 15
|
||||||
|
|
||||||
@ -607,7 +638,7 @@ def scaleOpinionsByRating(G):
|
|||||||
def addScoreToLabels(G):
|
def addScoreToLabels(G):
|
||||||
for n in list(G.nodes):
|
for n in list(G.nodes):
|
||||||
node = G.nodes[n]
|
node = G.nodes[n]
|
||||||
if node['t'] not in ['tag']:
|
if node['t'] not in ['tag', 'newBook']:
|
||||||
if 'rating' in node and node['rating'] != None:
|
if 'rating' in node and node['rating'] != None:
|
||||||
node['label'] += " ("+str(node['rating'])+")"
|
node['label'] += " ("+str(node['rating'])+")"
|
||||||
else:
|
else:
|
||||||
@ -710,18 +741,20 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
|
|||||||
removeRestOfSeries(G)
|
removeRestOfSeries(G)
|
||||||
removeBad(G, mu-std-0.5)
|
removeBad(G, mu-std-0.5)
|
||||||
removeBad(G, mu+std/2, groups=['recommender'])
|
removeBad(G, mu+std/2, groups=['recommender'])
|
||||||
|
removeThinRecs(G, 3)
|
||||||
removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
|
removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
|
||||||
removeEdge(G)
|
removeEdge(G)
|
||||||
removeHighSpanTags(G, 8)
|
removeHighSpanTags(G, 8)
|
||||||
removeHighSpanReadBooks(G, 14)
|
removeHighSpanReadBooks(G, 14)
|
||||||
removeDangling(G, alsoBooks=False)
|
removeDangling(G, alsoBooks=False)
|
||||||
pruneRecommenders(G, 16)
|
pruneRecommenders(G, 12)
|
||||||
|
removeThinRecs(G, 3)
|
||||||
pruneTags(G, 9)
|
pruneTags(G, 9)
|
||||||
removeBad(G, mu, groups=['book'])
|
removeBad(G, mu, groups=['book'])
|
||||||
removeUselessReadBooks(G)
|
removeUselessReadBooks(G)
|
||||||
pruneTags(G, 8)
|
pruneTags(G, 8)
|
||||||
pruneAuthorCons(G, int(n/5)+3)
|
pruneAuthorCons(G, int(n/5)+3)
|
||||||
pruneRecommenders(G, 16 - min(4, n/20))
|
pruneRecommenders(G, 12 - min(4, n/20))
|
||||||
removeUselessSeries(G, mu)
|
removeUselessSeries(G, mu)
|
||||||
removeUselessTags(G)
|
removeUselessTags(G)
|
||||||
if removeTopListsB:
|
if removeTopListsB:
|
||||||
@ -734,6 +767,7 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende
|
|||||||
removeUselessTags(G)
|
removeUselessTags(G)
|
||||||
removeUselessReadBooks(G)
|
removeUselessReadBooks(G)
|
||||||
removeKeepBest(G, n, maxDistForRead=1.25)
|
removeKeepBest(G, n, maxDistForRead=1.25)
|
||||||
|
removeThinRecs(G, 2)
|
||||||
|
|
||||||
scaleBooksByRating(G)
|
scaleBooksByRating(G)
|
||||||
scaleOpinionsByRating(G)
|
scaleOpinionsByRating(G)
|
||||||
@ -762,7 +796,6 @@ def fullGraph(G, removeTopListsB=True):
|
|||||||
scaleOpinionsByRating(G)
|
scaleOpinionsByRating(G)
|
||||||
addScoreToLabels(G)
|
addScoreToLabels(G)
|
||||||
|
|
||||||
|
|
||||||
def recommenderCompetence(G):
|
def recommenderCompetence(G):
|
||||||
#removeRead(G)
|
#removeRead(G)
|
||||||
removeUnread(G)
|
removeUnread(G)
|
||||||
@ -907,6 +940,67 @@ def shell(G, books, mu, std):
|
|||||||
from ptpython.repl import embed
|
from ptpython.repl import embed
|
||||||
embed(globals(), locals())
|
embed(globals(), locals())
|
||||||
|
|
||||||
|
def newBooks(G, books, num, mu, std):
    """Reduce *G* in place to a view centred on newly found books.

    The graph is first filtered with ``removeBad(G, mu+std)``, then
    ``findNewBooks`` adds the new-book nodes. Afterwards a fixed sequence
    of removal passes runs, followed by the scaling and labelling passes.
    The order of the calls is significant and is kept exactly.
    """
    removeBad(G, mu+std)
    findNewBooks(G, books, num, minRecSco = mu-std)

    # Removal passes that take only the graph, applied in this exact order.
    for strip in (removeUnread, removeUselessReadBooks, removeTags,
                  removeTopLists, removeSeries, removeEdge):
        strip(G)
    removeDangling(G, alsoBooks=True)

    # Presentation passes: sizes first, then the label suffixes.
    for decorate in (scaleBooksByRating, scaleOpinionsByRating, addScoreToLabels):
        decorate(G)
|
||||||
|
|
||||||
|
|
||||||
|
def findNewBooks(G, books, num, minRecSco=5):
    """Add books from ``mrb_db.csv`` that are not in *books* to the graph.

    Everything happens in place on *G*:

    1. ``removeBad(G, 0.1, groups=['recommender'])`` and
       ``removeThinRecs(G, 2)`` prune the recommender nodes.
    2. For each remaining recommender node that has a ``score``, the MRB rows
       whose ``recommender`` field contains the node's label (literal
       substring) are collected. Rows whose title is not among *books*
       count as new.
    3. Every new book becomes an ``n/<title>`` node of type ``newBook``,
       connected to an ``r/<label>`` recommender node and an
       ``a/<author>`` author node.
    4. New-book nodes with fewer than two recommender neighbours are
       removed; the others receive ``score``, ``fake_se``, ``value`` and a
       ``" (score±se)"`` label suffix.
    5. ``removeKeepBest(G, num, 10, 'newBook')`` trims the result.

    Args:
        G: the graph to extend (nodes carry ``t``, ``label`` and, where
            applicable, ``score``/``se`` attributes).
        books: iterable of book dicts with a ``title`` entry.
        num: forwarded to ``removeKeepBest``.
        minRecSco: accepted for caller compatibility; not used here.
    """
    removeBad(G, 0.1, groups=['recommender'])
    removeThinRecs(G, 2)
    mrbdf = pd.read_csv('mrb_db.csv')

    # Built once: rebuilding the title list for every candidate row made the
    # scan quadratic in the library size.
    knownTitles = {b['title'] for b in books}

    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            freshBooks = []
            # Literal match (labels are names, not patterns); na=False keeps
            # rows with a missing recommender from breaking the boolean mask.
            mask = mrbdf['recommender'].str.contains(node['label'], regex=False, na=False)
            for book in mrbdf[mask].to_dict(orient='records'):
                entry = {'title': book['title'], 'author': book['author']}
                if book['title'] in knownTitles:
                    oldBooks.append(entry)
                else:
                    freshBooks.append(entry)
            recs.append({'name': node['label'], 'rec': node, 'newBooks': freshBooks, 'oldBooks': oldBooks})

    for rec in recs:
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'])

            G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score'])
            G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue')

            G.add_node('a/'+book['author'], color='green', t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')

    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'newBook':
            ses = []
            scores = []
            for m in list(G.adj[n]):
                adj = G.nodes[m]
                if adj['t'] == 'recommender':
                    scores.append(adj['score'])
                    # NOTE(review): assumes every recommender reached here
                    # carries an 'se' attribute -- confirm where it is set.
                    ses.append(adj['se'])
            if len(scores) < 2:
                # A single recommendation is not enough evidence to keep it.
                G.remove_node(n)
            else:
                # Mean recommender score, with a penalty that shrinks as more
                # recommenders agree.
                node['score'] = sum(scores)/len(scores) - 0.1/math.sqrt(len(scores))
                # Heuristic stand-in, not a real standard error.
                node['fake_se'] = sum(ses)/(len(ses)*1.2)
                node['value'] = 20 + 5 * float(node['score'])
                node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
    removeKeepBest(G, num, 10, 'newBook')
|
||||||
|
|
||||||
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
|
# while batchSize is implemented, we only get a good convergence when we disable it (batchSize=-1)
|
||||||
# but might be necessary to enable later for a larger library for better training performance...
|
# but might be necessary to enable later for a larger library for better training performance...
|
||||||
# maybe try again for 128 books?
|
# maybe try again for 128 books?
|
||||||
@ -1039,6 +1133,7 @@ def cliInterface():
|
|||||||
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
|
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
|
||||||
p_rec.add_argument('--tag-based', action="store_true")
|
p_rec.add_argument('--tag-based', action="store_true")
|
||||||
p_rec.add_argument('--recommender-based', action="store_true")
|
p_rec.add_argument('--recommender-based', action="store_true")
|
||||||
|
p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend')
|
||||||
|
|
||||||
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
|
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
|
||||||
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend')
|
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend')
|
||||||
@ -1064,6 +1159,9 @@ def cliInterface():
|
|||||||
|
|
||||||
p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
|
p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
|
||||||
|
|
||||||
|
p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
|
||||||
|
p_new.add_argument('-n', type=int, default=10, help='number of books to recommend')
|
||||||
|
|
||||||
p_full = cmds.add_parser('full', description="TODO", aliases=[])
|
p_full = cmds.add_parser('full', description="TODO", aliases=[])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -1078,7 +1176,12 @@ def cliInterface():
|
|||||||
if not args.keep_whitepapers:
|
if not args.keep_whitepapers:
|
||||||
removeWhitepapers(G)
|
removeWhitepapers(G)
|
||||||
|
|
||||||
|
|
||||||
if args.cmd=="recommend":
|
if args.cmd=="recommend":
|
||||||
|
if args.new==-1:
|
||||||
|
args.new = int(args.n / 5)
|
||||||
|
if args.new != 0:
|
||||||
|
findNewBooks(G, books, args.new, minRecSco = mu-std)
|
||||||
if args.tag_based:
|
if args.tag_based:
|
||||||
if args.recommender_based:
|
if args.recommender_based:
|
||||||
raise Exception('tag-based and recommender-based can not be be combined')
|
raise Exception('tag-based and recommender-based can not be be combined')
|
||||||
@ -1105,6 +1208,8 @@ def cliInterface():
|
|||||||
elif args.cmd=="progress":
|
elif args.cmd=="progress":
|
||||||
progress(G, args.m)
|
progress(G, args.m)
|
||||||
return
|
return
|
||||||
|
elif args.cmd=="newBooks":
|
||||||
|
newBooks(G, books, args.n, mu, std)
|
||||||
else:
|
else:
|
||||||
raise Exception("Bad")
|
raise Exception("Bad")
|
||||||
|
|
||||||
|
3004
mrb_db.csv
Normal file
3004
mrb_db.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user