New feature: Dissonance

This commit is contained in:
Dominik Moritz Roth 2022-09-11 18:56:47 +02:00
parent 6af38c686f
commit 1c34d2876f

View File

@ -18,9 +18,11 @@ import plotly.graph_objects as go
import wikipedia import wikipedia
class Error(Exception): class Error(Exception):
pass pass
def getAllAuthors(books): def getAllAuthors(books):
authors = set() authors = set()
for book in books: for book in books:
@ -131,6 +133,7 @@ def removePriv(G):
if 'priv' in node['tags']: if 'priv' in node['tags']:
G.remove_node(n) G.remove_node(n)
def removeWhitepapers(G): def removeWhitepapers(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -146,6 +149,7 @@ def removeDangling(G, alsoBooks=False):
if not len(G.adj[n]): if not len(G.adj[n]):
G.remove_node(n) G.remove_node(n)
def removeThinRecs(G, minCons=3): def removeThinRecs(G, minCons=3):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -153,6 +157,7 @@ def removeThinRecs(G, minCons=3):
if not len(G.adj[n]) >= minCons: if not len(G.adj[n]) >= minCons:
G.remove_node(n) G.remove_node(n)
def removeEdge(G): def removeEdge(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -256,6 +261,7 @@ def pruneRecommenderCons(G, maxCons=5):
if foundCon < 2: if foundCon < 2:
G.remove_node(m) G.remove_node(m)
def pruneAuthorCons(G, maxCons=3): def pruneAuthorCons(G, maxCons=3):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -281,6 +287,7 @@ def pruneAuthorCons(G, maxCons=3):
if foundCon < 2: if foundCon < 2:
G.remove_node(m) G.remove_node(m)
def removeHighSpanTags(G, maxCons=5): def removeHighSpanTags(G, maxCons=5):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -303,24 +310,28 @@ def removeTopLists(G):
if node['t'] == 'topList': if node['t'] == 'topList':
G.remove_node(n) G.remove_node(n)
def removeRecommenders(G): def removeRecommenders(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] == 'recommender': if node['t'] == 'recommender':
G.remove_node(n) G.remove_node(n)
def removeAuthors(G): def removeAuthors(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] == 'author': if node['t'] == 'author':
G.remove_node(n) G.remove_node(n)
def removeSeries(G): def removeSeries(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] == 'series': if node['t'] == 'series':
G.remove_node(n) G.remove_node(n)
def removeRestOfSeries(G): def removeRestOfSeries(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -336,6 +347,7 @@ def removeRestOfSeries(G):
if adjNode['series_index'] > seriesState + 1.0001: if adjNode['series_index'] > seriesState + 1.0001:
G.remove_node(adj) G.remove_node(adj)
def removeUnusedRecommenders(G): def removeUnusedRecommenders(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -347,6 +359,7 @@ def removeUnusedRecommenders(G):
else: # No unrated recommendation else: # No unrated recommendation
G.remove_node(n) G.remove_node(n)
def removeUselessReadBooks(G): def removeUselessReadBooks(G):
minForce = 1.5 minForce = 1.5
minContact = 2 minContact = 2
@ -368,6 +381,7 @@ def removeUselessReadBooks(G):
if force < minForce or contacts < minContact: if force < minForce or contacts < minContact:
G.remove_node(n) G.remove_node(n)
def removeUselessTags(G, minUnread=1): def removeUselessTags(G, minUnread=1):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -380,6 +394,7 @@ def removeUselessTags(G, minUnread=1):
if foundUnread < minUnread: if foundUnread < minUnread:
G.remove_node(n) G.remove_node(n)
def removeUselessSeries(G, minSco=0): def removeUselessSeries(G, minSco=0):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -387,6 +402,7 @@ def removeUselessSeries(G, minSco=0):
if len(G.adj[n]) < 2 or node['score'] < minSco: if len(G.adj[n]) < 2 or node['score'] < minSco:
G.remove_node(n) G.remove_node(n)
def scoreOpinions(G, globMu, globStd): def scoreOpinions(G, globMu, globStd):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -406,6 +422,7 @@ def scoreOpinions(G, globMu, globStd):
else: else:
node['score'] = None node['score'] = None
def scoreUnread(G, globMu, globStd): def scoreUnread(G, globMu, globStd):
for n in list(G.nodes): for n in list(G.nodes):
feedbacks = [globMu] feedbacks = [globMu]
@ -417,13 +434,15 @@ def scoreUnread(G, globMu, globStd):
for adj in adjacens: for adj in adjacens:
adjNode = G.nodes[adj] adjNode = G.nodes[adj]
if 'score' in adjNode and adjNode['score'] != None: if 'score' in adjNode and adjNode['score'] != None:
w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1] w = [adjNode['t'], G[n][adj]['weight']
if 'weight' in G[n][adj] else 1]
for fb in adjNode['feedbacks']: for fb in adjNode['feedbacks']:
feedbacks.append(fb) feedbacks.append(fb)
ws.append(w) ws.append(w)
if len(feedbacks): if len(feedbacks):
node['mean'], node['std'] = norm.fit(feedbacks) node['mean'], node['std'] = norm.fit(feedbacks)
node['median'] = np.percentile(feedbacks, [50], method='linear')[0] node['median'] = np.percentile(
feedbacks, [50], method='linear')[0]
node['se'] = globStd / math.sqrt(len(feedbacks)) node['se'] = globStd / math.sqrt(len(feedbacks))
feedbacks.append(node['pagerank_score']) feedbacks.append(node['pagerank_score'])
ws.append(['pagerank']) ws.append(['pagerank'])
@ -437,15 +456,18 @@ def scoreUnread(G, globMu, globStd):
# ws.append(['se']) # ws.append(['se'])
feedbacks.append(globMu) feedbacks.append(globMu)
ws.append(['bias']) ws.append(['bias'])
node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws]) node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w) > 1 else 1) for fb, w in zip(
feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws])
node['_act'] = feedbacks node['_act'] = feedbacks
node['_wgh'] = ws node['_wgh'] = ws
else: else:
node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001 node['score'] = globMu + errorFac * \
globStd + len(feedbacks)*0.0000000001
if 'series' in node: if 'series' in node:
if node['series_index'] == 1.0: if node['series_index'] == 1.0:
node['score'] += 0.000000001 node['score'] += 0.000000001
def getWeightForType(nodeType, edgeWeight=1): def getWeightForType(nodeType, edgeWeight=1):
global weights global weights
w = weights[nodeType] w = weights[nodeType]
@ -454,6 +476,7 @@ def getWeightForType(nodeType, edgeWeight=1):
else: else:
return w return w
def printBestList(G, t='book', num=-1): def printBestList(G, t='book', num=-1):
bestlist = [] bestlist = []
for n in list(G.nodes): for n in list(G.nodes):
@ -461,10 +484,12 @@ def printBestList(G, t='book', num=-1):
if node['t'] == t: if node['t'] == t:
if 'score' in node and node['score'] != None: if 'score' in node and node['score'] != None:
bestlist.append(node) bestlist.append(node)
bestlist.sort(key=lambda node: node['score'] + 0.00001*(node['se'] if 'se' in node else 0), reverse=True) bestlist.sort(key=lambda node: node['score'] + 0.00001 *
(node['se'] if 'se' in node else 0), reverse=True)
for i, book in enumerate(bestlist): for i, book in enumerate(bestlist):
if t == 'book': if t == 'book':
line = book['title'] + " ("+" & ".join(book['authors'])+")"+": {:.5f}".format(book['score']) line = book['title'] + " ("+" & ".join(book['authors'])+")" + \
": {:.5f}".format(book['score'])
else: else:
line = book['label'] line = book['label']
print("["+str(i+1).zfill(int((math.log10(num) if num != -1 else 3)+1))+"] "+line) print("["+str(i+1).zfill(int((math.log10(num) if num != -1 else 3)+1))+"] "+line)
@ -478,12 +503,14 @@ def readColor(book):
else: else:
return 'gray' return 'gray'
def loadBooksFromDB(): def loadBooksFromDB():
books = calibreDB.getBooks() books = calibreDB.getBooks()
infuseDataFromMRB(books) infuseDataFromMRB(books)
# infuseDataFromTGB(books) # infuseDataFromTGB(books)
return books return books
def mrbGetBook(mrbdf, title, authors): def mrbGetBook(mrbdf, title, authors):
title = title.split('(')[0] title = title.split('(')[0]
title = title.replace('*', '') title = title.replace('*', '')
@ -499,6 +526,7 @@ def mrbGetBook(mrbdf, title, authors):
return d return d
return False return False
def tgbGetBook(df, title, authors): def tgbGetBook(df, title, authors):
title = title.split('(')[0] title = title.split('(')[0]
title = title.replace('*', '') title = title.replace('*', '')
@ -514,6 +542,7 @@ def tgbGetBook(df, title, authors):
return d return d
return False return False
def infuseDataFromMRB(books): def infuseDataFromMRB(books):
mrbdf = pd.read_csv('rec_dbs/mrb_db.csv') mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
for book in books: for book in books:
@ -522,6 +551,7 @@ def infuseDataFromMRB(books):
for rec in str(mrb['recommender']).split('|'): for rec in str(mrb['recommender']).split('|'):
book['tags'] += [rec + ':MRB'] book['tags'] += [rec + ':MRB']
def infuseDataFromTGB(books): def infuseDataFromTGB(books):
for i in range(1, 3): for i in range(1, 3):
df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv') df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv')
@ -530,12 +560,14 @@ def infuseDataFromTGB(books):
if tgb: if tgb:
book['tgb_rank'] = int(tgb['id']) book['tgb_rank'] = int(tgb['id'])
class calibreDB(): class calibreDB():
@classmethod @classmethod
def _getTxt(cls, request): def _getTxt(cls, request):
ret = os.popen("calibredb "+request).read() ret = os.popen("calibredb "+request).read()
if not ret: if not ret:
raise Error('Unable to connect to CalibreDB. Please close all open instances of Calibre.') raise Error(
'Unable to connect to CalibreDB. Please close all open instances of Calibre.')
return ret return ret
@classmethod @classmethod
@ -557,7 +589,8 @@ class calibreDB():
cols = cls.getCustomColumns() cols = cls.getCustomColumns()
avai = ['calice_score' in cols, 'calice_rating' in cols] avai = ['calice_score' in cols, 'calice_rating' in cols]
if not any(avai): if not any(avai):
raise Error('Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.') raise Error(
'Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.')
return avai return avai
@classmethod @classmethod
@ -586,9 +619,12 @@ class calibreDB():
cls._getTxt('set_custom calice_score '+str(bookId)+' ""') cls._getTxt('set_custom calice_score '+str(bookId)+' ""')
else: else:
if sco: if sco:
cls._getTxt('set_custom calice_score '+str(bookId)+' '+str(round(score,5))) cls._getTxt('set_custom calice_score ' +
str(bookId)+' '+str(round(score, 5)))
if rat: if rat:
cls._getTxt('set_custom calice_rating '+str(bookId)+' '+str(int(round(score)))) cls._getTxt('set_custom calice_rating ' +
str(bookId)+' '+str(int(round(score))))
def calice(G): def calice(G):
scores = {} scores = {}
@ -602,10 +638,12 @@ def calice(G):
calibreDB.writeCaliceColumnMultiple(scores) calibreDB.writeCaliceColumnMultiple(scores)
print('Done.') print('Done.')
def remove_html_tags(text): def remove_html_tags(text):
clean = re.compile('<.*?>') clean = re.compile('<.*?>')
return re.sub(clean, '', text) return re.sub(clean, '', text)
def getKeywords(txt, rake): def getKeywords(txt, rake):
txt = remove_html_tags(txt) txt = remove_html_tags(txt)
k = [] k = []
@ -624,6 +662,7 @@ def getKeywords(txt,rake):
return k return k
return [] return []
def runPagerank(G): def runPagerank(G):
try: try:
scores = nx.pagerank(G=G) scores = nx.pagerank(G=G)
@ -634,6 +673,7 @@ def runPagerank(G):
for n in list(G.nodes): for n in list(G.nodes):
G.nodes[n]['pagerank_score'] = scores[n] if n in scores else 0 G.nodes[n]['pagerank_score'] = scores[n] if n in scores else 0
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True): def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
G = nx.Graph() G = nx.Graph()
if extractKeywords: if extractKeywords:
@ -652,7 +692,8 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
else: else:
desc = '' desc = ''
if 'comments' in book and extractKeywords: if 'comments' in book and extractKeywords:
sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n',' ') sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+',
'', book['comments']).replace('\n', ' ')
keywords = getKeywords(sanitized, rake) keywords = getKeywords(sanitized, rake)
else: else:
keywords = [] keywords = []
@ -664,10 +705,12 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
else: else:
series = None series = None
series_index = None series_index = None
G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords, desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index, calibreID=book['id']) G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords,
desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index, calibreID=book['id'])
return G return G
def getWikiImage(search_term): def getWikiImage(search_term):
from fuzzywuzzy import fuzz from fuzzywuzzy import fuzz
WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=' WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
@ -681,12 +724,14 @@ def getWikiImage(search_term):
title = wkpage.title title = wkpage.title
response = requests.get(WIKI_REQUEST+title) response = requests.get(WIKI_REQUEST+title)
json_data = json.loads(response.text) json_data = json.loads(response.text)
img_link = list(json_data['query']['pages'].values())[0]['original']['source'] img_link = list(json_data['query']['pages'].values())[
0]['original']['source']
return img_link return img_link
except: except:
print('[!] No match for '+search_term+' on WikiPedia...') print('[!] No match for '+search_term+' on WikiPedia...')
return None return None
def graphAddAuthors(G, books, darkMode=False): def graphAddAuthors(G, books, darkMode=False):
for author in getAllAuthors(books): for author in getAllAuthors(books):
G.add_node('a/'+author, color='green', t='author', label=author) G.add_node('a/'+author, color='green', t='author', label=author)
@ -695,6 +740,7 @@ def graphAddAuthors(G, books, darkMode=False):
G.add_edge('a/'+author, book['id'], color=readColor(book)) G.add_edge('a/'+author, book['id'], color=readColor(book))
return G return G
def graphAddRecommenders(G, books, darkMode=False): def graphAddRecommenders(G, books, darkMode=False):
for rec in getAllRecommenders(books): for rec in getAllRecommenders(books):
G.add_node('r/'+rec, color='orange', t='recommender', label=rec) G.add_node('r/'+rec, color='orange', t='recommender', label=rec)
@ -703,6 +749,7 @@ def graphAddRecommenders(G, books, darkMode=False):
G.add_edge('r/'+rec, book['id'], color=readColor(book)) G.add_edge('r/'+rec, book['id'], color=readColor(book))
return G return G
def graphAddTopLists(G, books, darkMode=False): def graphAddTopLists(G, books, darkMode=False):
for tl in getAllTopLists(books): for tl in getAllTopLists(books):
G.add_node('t/'+tl, color='yellow', t='topList', label=tl) G.add_node('t/'+tl, color='yellow', t='topList', label=tl)
@ -715,7 +762,8 @@ def graphAddTopLists(G, books, darkMode=False):
def graphAddSeries(G, books, darkMode=False): def graphAddSeries(G, books, darkMode=False):
for series in getAllSeries(books): for series in getAllSeries(books):
G.add_node('s/'+series, color='red', t='series', label=series, shape='triangle') G.add_node('s/'+series, color='red', t='series',
label=series, shape='triangle')
for book in books: for book in books:
if 'series' in book: if 'series' in book:
G.add_edge('s/'+book['series'], book['id'], color=readColor(book)) G.add_edge('s/'+book['series'], book['id'], color=readColor(book))
@ -724,7 +772,8 @@ def graphAddSeries(G, books, darkMode=False):
def graphAddTags(G, books, darkMode=False): def graphAddTags(G, books, darkMode=False):
for tag in getAllTags(books): for tag in getAllTags(books):
G.add_node('t/'+tag, color=['lightGray','darkgray'][darkMode], t='tag', label=tag, shape='box') G.add_node('t/'+tag, color=['lightGray', 'darkgray']
[darkMode], t='tag', label=tag, shape='box')
for book in books: for book in books:
for tag in getTags(book): for tag in getTags(book):
G.add_edge('t/'+tag, book['id'], color=readColor(book)) G.add_edge('t/'+tag, book['id'], color=readColor(book))
@ -770,7 +819,8 @@ def addScoreToLabels(G):
node['label'] += " ("+str(node['rating'])+")" node['label'] += " ("+str(node['rating'])+")"
else: else:
if 'score' in node and node['score'] != None and 'se' in node: if 'score' in node and node['score'] != None and 'se' in node:
node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['se']) node['label'] += " ({:.2f}±{:.1f})".format(
node['score'], node['se'])
else: else:
node['label'] += " (0±∞)" node['label'] += " (0±∞)"
@ -870,6 +920,7 @@ def genAndShow3D(G, darkMode=False):
fig.show() fig.show()
def buildFullGraph(darkMode=False): def buildFullGraph(darkMode=False):
books = loadBooksFromDB() books = loadBooksFromDB()
G = buildBookGraph(books, darkMode=darkMode) G = buildBookGraph(books, darkMode=darkMode)
@ -890,6 +941,7 @@ def genScores(G, books, calcPagerank=True):
scoreUnread(G, globMu, globStd) scoreUnread(G, globMu, globStd)
return globMu, globStd return globMu, globStd
def addImageToNode(node, cache, shape='circularImage'): def addImageToNode(node, cache, shape='circularImage'):
name = node['label'].split(' (')[0].replace('*', '') name = node['label'].split(' (')[0].replace('*', '')
if not name in cache or (cache[name] == False and random.random() < 0.05): if not name in cache or (cache[name] == False and random.random() < 0.05):
@ -906,6 +958,7 @@ def addImageToNode(node, cache, shape='circularImage'):
node['image'] = img node['image'] = img
node['shape'] = shape node['shape'] = shape
def addImagesToNodes(G): def addImagesToNodes(G):
try: try:
with open('.imgLinkCache.json', 'r') as cf: with open('.imgLinkCache.json', 'r') as cf:
@ -915,10 +968,12 @@ def addImagesToNodes(G):
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
if node['t'] in ['recommender', 'author']: if node['t'] in ['recommender', 'author']:
addImageToNode(node, cache, ['circularImage','image'][node['t']=='author']) addImageToNode(
node, cache, ['circularImage', 'image'][node['t'] == 'author'])
with open('.imgLinkCache.json', 'w') as cf: with open('.imgLinkCache.json', 'w') as cf:
cf.write(json.dumps(cache)) cf.write(json.dumps(cache))
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True): def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
removeRestOfSeries(G) removeRestOfSeries(G)
removeBad(G, mu-std*2-1) removeBad(G, mu-std*2-1)
@ -976,6 +1031,7 @@ def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True):
scaleOpinionsByRating(G) scaleOpinionsByRating(G)
addScoreToLabels(G) addScoreToLabels(G)
def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False): def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False):
removeRestOfSeries(G) removeRestOfSeries(G)
removeBad(G, mu-std-0.5) removeBad(G, mu-std-0.5)
@ -1035,6 +1091,7 @@ def fullGraph(G, removeTopListsB=True):
scaleOpinionsByRating(G) scaleOpinionsByRating(G)
addScoreToLabels(G) addScoreToLabels(G)
def recommenderCompetence(G): def recommenderCompetence(G):
# removeRead(G) # removeRead(G)
removeUnread(G) removeUnread(G)
@ -1060,6 +1117,7 @@ def recommenderCompetence(G):
node['score'] = 0 node['score'] = 0
node['score'] /= 2 node['score'] /= 2
def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True): def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True):
removeUnread(G) removeUnread(G)
removeBad(G, minRating) removeBad(G, minRating)
@ -1075,6 +1133,7 @@ def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False,
scaleOpinionsByRating(G) scaleOpinionsByRating(G)
addScoreToLabels(G) addScoreToLabels(G)
def progress(G, books, mu, minimum=3.5): def progress(G, books, mu, minimum=3.5):
findNewBooks(G, books, mu, -1, minRecSco=minimum) findNewBooks(G, books, mu, -1, minRecSco=minimum)
bookCount = 0 bookCount = 0
@ -1138,6 +1197,7 @@ def analyze(G, books, mu, type_name, name, dist=2.1):
addScoreToLabels(G) addScoreToLabels(G)
match['label'] = "*"+match['label']+"*" match['label'] = "*"+match['label']+"*"
def waveFlow(G, node, n, dist, menge, firstEdge=False): def waveFlow(G, node, n, dist, menge, firstEdge=False):
if dist <= 0: if dist <= 0:
return return
@ -1167,7 +1227,8 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
book['score'] = 0 book['score'] = 0
bestlist.append(book) bestlist.append(book)
bestlist.sort(key=lambda node: node['score'], reverse=True) bestlist.sort(key=lambda node: node['score'], reverse=True)
toKeep = min(int(dist*10), math.ceil(len(bestlist) * dist - len(keeplist)*0.5)) toKeep = min(int(dist*10), math.ceil(len(bestlist)
* dist - len(keeplist)*0.5))
if toKeep <= 0: if toKeep <= 0:
keeplist.sort(key=lambda node: node['rating'], reverse=True) keeplist.sort(key=lambda node: node['rating'], reverse=True)
keeplist = keeplist[:min(int(dist*10), int(len(keeplist) * dist))] keeplist = keeplist[:min(int(dist*10), int(len(keeplist) * dist))]
@ -1180,6 +1241,7 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
if node in bestlist or node in keeplist: if node in bestlist or node in keeplist:
waveFlow(G, node, m, dist, menge, firstEdge=firstEdge) waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
def gensimTokensForLines(lines): def gensimTokensForLines(lines):
for i, line in enumerate(lines): for i, line in enumerate(lines):
tokens = gensim.utils.simple_preprocess(line) tokens = gensim.utils.simple_preprocess(line)
@ -1189,6 +1251,7 @@ def gensimTokensForLines(lines):
# For training data, add tags # For training data, add tags
yield gensim.models.doc2vec.TaggedDocument(tokens, [i]) yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def buildDoc2Vec(books): def buildDoc2Vec(books):
import gensim import gensim
for n in list(G.nodes): for n in list(G.nodes):
@ -1197,10 +1260,12 @@ def buildDoc2Vec(books):
pass pass
gensimTokensForLines(lines) gensimTokensForLines(lines)
def shell(G, books, mu, std): def shell(G, books, mu, std):
from ptpython.repl import embed from ptpython.repl import embed
embed(globals(), locals()) embed(globals(), locals())
def newBooks(G, books, num, mu, std): def newBooks(G, books, num, mu, std):
removeBad(G, mu-std*2) removeBad(G, mu-std*2)
findNewBooks(G, books, mu, num, minRecSco=mu-std) findNewBooks(G, books, mu, num, minRecSco=mu-std)
@ -1226,21 +1291,29 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
if node['t'] == 'recommender' and 'score' in node: if node['t'] == 'recommender' and 'score' in node:
oldBooks = [] oldBooks = []
newBooks = [] newBooks = []
recBooks = mrbdf[mrbdf['recommender'].str.contains(node['label'])].to_dict(orient='records') recBooks = mrbdf[mrbdf['recommender'].str.contains(
node['label'])].to_dict(orient='records')
for book in recBooks: for book in recBooks:
if book['title'] in [b['title'] for b in books]: if book['title'] in [b['title'] for b in books]:
oldBooks.append({'title': book['title'], 'author': book['author']}) oldBooks.append(
{'title': book['title'], 'author': book['author']})
else: else:
newBooks.append({'title': book['title'], 'author': book['author']}) newBooks.append(
recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks}) {'title': book['title'], 'author': book['author']})
recs.append({'name': node['label'], 'rec': node,
'newBooks': newBooks, 'oldBooks': oldBooks})
for rec in recs: for rec in recs:
for book in rec['newBooks']: for book in rec['newBooks']:
G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'], author=book['author']) G.add_node('n/'+book['title'], color='blue', t='newBook',
label=book['title'], author=book['author'])
G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score']) G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender',
G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue') label=rec['rec']['label'], score=rec['rec']['score'])
G.add_edge('r/'+rec['rec']['label'], 'n/' +
book['title'], color='blue')
G.add_node('a/'+book['author'], color='green', t='author', label=book['author']) G.add_node('a/'+book['author'], color='green',
t='author', label=book['author'])
G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue') G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')
for n in list(G.nodes): for n in list(G.nodes):
node = G.nodes[n] node = G.nodes[n]
@ -1257,12 +1330,16 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
else: else:
ses.append(min(ses)) ses.append(min(ses))
scores.append(mu) scores.append(mu)
node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==2) # This is not how SE works. DILLIGAF? # This is not how SE works. DILLIGAF?
node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*1.6 + 0.5 - 0.1/math.sqrt(len(scores)) node['fake_se'] = sum(ses)/(len(ses)**1.2) + \
0.5 + 0.5 * (len(scores) == 2)
node['score'] = sum(
scores)/len(scores)*1.2 - node['fake_se']*1.6 + 0.5 - 0.1/math.sqrt(len(scores))
if len(scores) == 2: if len(scores) == 2:
node['score'] *= 0.80 node['score'] *= 0.80
node['value'] = 20 + 5 * float(node['score']) node['value'] = 20 + 5 * float(node['score'])
node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se']) node['label'] += " ({:.2f}±{:.1f})".format(node['score'],
node['fake_se'])
node['label'] += '\n ' + node['author'] node['label'] += '\n ' + node['author']
if num != -1: if num != -1:
removeKeepBest(G, num, 10, 'newBook') removeKeepBest(G, num, 10, 'newBook')
@ -1270,6 +1347,8 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5):
# while batchSize is implemented, we only get a good gonvergence when we disable it (batchSize=-1) # while batchSize is implemented, we only get a good gonvergence when we disable it (batchSize=-1)
# but might be necessary to enable later for a larger libary for better training performance... # but might be necessary to enable later for a larger libary for better training performance...
# maybe try again for 128 books? # maybe try again for 128 books?
def evaluateFitness(books, batchSize=-1, debugPrint=False): def evaluateFitness(books, batchSize=-1, debugPrint=False):
global weights global weights
G = buildBookGraph(books) G = buildBookGraph(books)
@ -1280,7 +1359,8 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
graphAddTags(G, books) graphAddTags(G, books)
runPagerank(G) runPagerank(G)
ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None] ratedBooks = [n for n in list(
G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
boundsLoss = 0 boundsLoss = 0
linSepLoss = [] linSepLoss = []
errSq = [] errSq = []
@ -1289,7 +1369,8 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
gradient[wt] = 0 gradient[wt] = 0
mu, sigma = genScores(G, books) mu, sigma = genScores(G, books)
for b in G.nodes: for b in G.nodes:
batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks batch = random.sample(ratedBooks, batchSize) if batchSize != - \
1 and len(ratedBooks) > batchSize else ratedBooks
if b in batch: if b in batch:
rating = G.nodes[b]['rating'] rating = G.nodes[b]['rating']
G.nodes[b]['rating'] = None G.nodes[b]['rating'] = None
@ -1300,9 +1381,12 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
errSq.append((rating - G.nodes[b]['score'])**2) errSq.append((rating - G.nodes[b]['score'])**2)
G.nodes[b]['rating'] = rating G.nodes[b]['rating'] = rating
for wt in weights: for wt in weights:
scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']]) scoreB = sum([a*(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1) for a, w in zip(G.nodes[b]['_act'],
gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000 G.nodes[b]['_wgh'])])/sum([(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1) for w in G.nodes[b]['_wgh']])
regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1 gradient[wt] += ((rating - G.nodes[b]['score'])
** 2 - (rating - scoreB)**2)*1000
# no punishment if w within -1 and 1
regressionLoss = sum([max(0, abs(w)-1)**2 for w in weights.values()])
for wt in weights: for wt in weights:
if abs(weights[wt]) > 1.0: if abs(weights[wt]) > 1.0:
gradient[wt] -= weights[wt]*10 gradient[wt] -= weights[wt]*10
@ -1315,6 +1399,55 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False):
fit = sum(errSq)/len(errSq) + 0.001*regressionLoss fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
return fit, gradient return fit, gradient
def calcDissonance(books):
global weights
G = buildBookGraph(books)
graphAddAuthors(G, books)
graphAddRecommenders(G, books)
graphAddTopLists(G, books)
graphAddSeries(G, books)
graphAddTags(G, books)
runPagerank(G)
ratedBooks = [n for n in list(
G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
errSq = []
gradient = {}
for wt in weights:
gradient[wt] = 0
mu, sigma = genScores(G, books)
for b in G.nodes:
batch = ratedBooks
if b in batch:
rating = G.nodes[b]['rating']
G.nodes[b]['rating'] = None
_, _ = genScores(G, books, calcPagerank=False)
G.nodes[b]['_test_score'] = G.nodes[b]['score']
G.nodes[b]['rating'] = rating
G.nodes[b]['dissonance_off'] = rating - G.nodes[b]['score']
G.nodes[b]['dissonance_abs'] = abs(rating - G.nodes[b]['score'])
return G
def describeDissonance(books, num=-1, sortKey='dissonance_abs', sortDir=True):
bestlist = []
G = calcDissonance(books)
for n in list(G.nodes):
node = G.nodes[n]
if'dissonance_abs' in node:
bestlist.append(node)
bestlist.sort(key=lambda node: node[sortKey], reverse=sortDir)
for i, book in enumerate(bestlist):
line = book['title'] + " ("+" & ".join(book['authors'])+")" + \
": You: {:.5f}, AI: {:.5f}, Delta: {:.5f}".format(
book['rating'], book['_test_score'], book['dissonance_off'])
print("["+str(i+1).zfill(int((math.log10(num) if num != -1 else 3)+1))+"] "+line)
if num != -1 and i == num-1:
break
def train(initGamma, full=True): def train(initGamma, full=True):
global weights global weights
if full: if full:
@ -1367,18 +1500,22 @@ def train(initGamma, full=True):
break break
print('Done.') print('Done.')
def saveWeights(weights): def saveWeights(weights):
with open('neuralWeights.json', 'w') as f: with open('neuralWeights.json', 'w') as f:
f.write(json.dumps(weights)) f.write(json.dumps(weights))
def loadWeights(): def loadWeights():
try: try:
with open('neuralWeights.json', 'r') as f: with open('neuralWeights.json', 'r') as f:
weights = json.loads(f.read()) weights = json.loads(f.read())
except IOError: except IOError:
weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "median": 0.10} #, "tgb_rank": 0.10} weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05,
"pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "median": 0.10} # , "tgb_rank": 0.10}
return weights return weights
def cliInterface(imgDef=False): def cliInterface(imgDef=False):
import argparse import argparse
@ -1403,13 +1540,16 @@ def cliInterface(imgDef=False):
cmds = parser.add_subparsers(required=True, dest='cmd') cmds = parser.add_subparsers(required=True, dest='cmd')
p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec']) p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend') p_rec.add_argument('-n', type=int, default=20,
help='number of books to recommend')
p_rec.add_argument('--tag-based', action="store_true") p_rec.add_argument('--tag-based', action="store_true")
p_rec.add_argument('--recommender-based', action="store_true") p_rec.add_argument('--recommender-based', action="store_true")
p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend') p_rec.add_argument('--new', type=int, default=-1,
help='number of new books to recommend')
p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls']) p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend') p_rec.add_argument('-n', type=int, default=50,
help='number of books to recommend')
p_read = cmds.add_parser('read', description="TODO", aliases=[]) p_read = cmds.add_parser('read', description="TODO", aliases=[])
p_read.add_argument('--min-rating', type=int, default=0) p_read.add_argument('--min-rating', type=int, default=0)
@ -1417,27 +1557,40 @@ def cliInterface(imgDef=False):
p_read.add_argument('--only-connected', action="store_true") p_read.add_argument('--only-connected', action="store_true")
p_show = cmds.add_parser('analyze', description="TODO", aliases=[]) p_show = cmds.add_parser('analyze', description="TODO", aliases=[])
p_show.add_argument('type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag']) p_show.add_argument(
'type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag'])
p_show.add_argument('name', type=str) p_show.add_argument('name', type=str)
p_show.add_argument('-d', type=float, default=2.1, help='depth of expansion') p_show.add_argument('-d', type=float, default=2.1,
help='depth of expansion')
p_train = cmds.add_parser('train', description="TODO", aliases=[]) p_train = cmds.add_parser('train', description="TODO", aliases=[])
p_train.add_argument('-g', type=float, default=0.2, help='learning rate gamma') p_train.add_argument('-g', type=float, default=0.2,
help='learning rate gamma')
p_train.add_argument('--full', action="store_true") p_train.add_argument('--full', action="store_true")
p_prog = cmds.add_parser('progress', description="TODO", aliases=[]) p_prog = cmds.add_parser('progress', description="TODO", aliases=[])
p_prog.add_argument('-m', type=float, default=7, help='Minimum Score to read') p_prog.add_argument('-m', type=float, default=7,
help='Minimum Score to read')
p_comp = cmds.add_parser('competence', description="TODO", aliases=[]) p_comp = cmds.add_parser('competence', description="TODO", aliases=[])
p_shell = cmds.add_parser('shell', description="TODO", aliases=[]) p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
p_new = cmds.add_parser('newBooks', description="TODO", aliases=[]) p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
p_new.add_argument('-n', type=int, default=10, help='number of books to recommend') p_new.add_argument('-n', type=int, default=10,
help='number of books to recommend')
p_col = cmds.add_parser('calice', description="TODO", aliases=[]) p_cal = cmds.add_parser('calice', description="TODO", aliases=[])
p_createCol = cmds.add_parser('createCaliceColumn', description="TODO", aliases=[]) p_dis = cmds.add_parser('dissonance', description="TODO", aliases=['dis'])
p_dis.add_argument('-n', type=int, default=-1,
help='Maximum number of books to list')
p_dis.add_argument(
'--sort', choices=['dissonance_abs', 'dissonance_off', 'score'], default='dissonance_abs', const='dissonance_abs', nargs='?')
p_dis.add_argument('--reversed', action="store_true")
p_createCol = cmds.add_parser(
'createCaliceColumn', description="TODO", aliases=[])
p_createCol.add_argument('type', choices=['score', 'rating', 'both']) p_createCol.add_argument('type', choices=['score', 'rating', 'both'])
p_full = cmds.add_parser('full', description="TODO", aliases=[]) p_full = cmds.add_parser('full', description="TODO", aliases=[])
@ -1452,6 +1605,7 @@ def cliInterface(imgDef=False):
else: else:
mainCLI(args) mainCLI(args)
def perfTestCLI(args): def perfTestCLI(args):
import time import time
from pycallgraph import PyCallGraph from pycallgraph import PyCallGraph
@ -1466,6 +1620,7 @@ def perfTestCLI(args):
with PyCallGraph(output=GraphvizOutput(output_file='perfTests/' + str(int(time.time())) + '.png'), config=config): with PyCallGraph(output=GraphvizOutput(output_file='perfTests/' + str(int(time.time())) + '.png'), config=config):
mainCLI(args) mainCLI(args)
def mainCLI(args): def mainCLI(args):
if args.cmd == "train": if args.cmd == "train":
train(args.g, args.full) train(args.g, args.full)
@ -1482,7 +1637,6 @@ def mainCLI(args):
if not args.keep_whitepapers: if not args.keep_whitepapers:
removeWhitepapers(G) removeWhitepapers(G)
if args.cmd == "recommend": if args.cmd == "recommend":
if args.new == -1: if args.new == -1:
args.new = int(args.n / 5) args.new = int(args.n / 5)
@ -1490,16 +1644,21 @@ def mainCLI(args):
findNewBooks(G, books, mu, args.new, minRecSco=mu-std) findNewBooks(G, books, mu, args.new, minRecSco=mu-std)
if args.tag_based: if args.tag_based:
if args.recommender_based: if args.recommender_based:
raise Exception('tag-based and recommender-based can not be combined') raise Exception(
recommendNBooksTagBased(G, mu, std, args.n, not args.keep_top_lists) 'tag-based and recommender-based can not be combined')
recommendNBooksTagBased(
G, mu, std, args.n, not args.keep_top_lists)
elif args.recommender_based: elif args.recommender_based:
recommendNBooksRecommenderBased(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders) recommendNBooksRecommenderBased(
G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders)
else: else:
recommendNBooks(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders, args.v3d) recommendNBooks(G, mu, std, args.n, not args.keep_top_lists,
not args.keep_useless_recommenders, args.v3d)
elif args.cmd == "listScores": elif args.cmd == "listScores":
listScores(G, mu, std, args.n) listScores(G, mu, std, args.n)
elif args.cmd == "read": elif args.cmd == "read":
readBooksAnalysis(G, args.min_rating, args.all_tags, args.only_connected, not args.keep_top_lists) readBooksAnalysis(G, args.min_rating, args.all_tags,
args.only_connected, not args.keep_top_lists)
elif args.cmd == "analyze": elif args.cmd == "analyze":
analyze(G, books, mu, args.type, args.name, args.d) analyze(G, books, mu, args.type, args.name, args.d)
elif args.cmd == "full": elif args.cmd == "full":
@ -1518,6 +1677,9 @@ def mainCLI(args):
elif args.cmd == "calice": elif args.cmd == "calice":
calice(G) calice(G)
exit() exit()
elif args.cmd == "dissonance":
describeDissonance(books, args.n, args.sort, not args.reversed)
exit()
elif args.cmd == "createCaliceColumn": elif args.cmd == "createCaliceColumn":
if args.type in ['score', 'both']: if args.type in ['score', 'both']:
calibreDB.createCaliceScoreColumn() calibreDB.createCaliceScoreColumn()
@ -1525,12 +1687,12 @@ def mainCLI(args):
if args.type in ['rating', 'both']: if args.type in ['rating', 'both']:
calibreDB.createCaliceRatingColumn() calibreDB.createCaliceRatingColumn()
print('[*] Column "Calice Rating" was created.') print('[*] Column "Calice Rating" was created.')
print('[i] To allow displaying half-stars, please activate them manually in the calibre-settings.') print(
'[i] To allow displaying half-stars, please activate them manually in the calibre-settings.') exit()
exit() exit()
else: else:
raise Exception("Bad") raise Exception("Bad")
if not args.keep_priv: if not args.keep_priv:
removePriv(G) removePriv(G)
if args.remove_read: if args.remove_read: