diff --git a/caliGraph.py b/caliGraph.py index 4745747..dd1cfff 100755 --- a/caliGraph.py +++ b/caliGraph.py @@ -18,9 +18,11 @@ import plotly.graph_objects as go import wikipedia + class Error(Exception): pass + def getAllAuthors(books): authors = set() for book in books: @@ -131,6 +133,7 @@ def removePriv(G): if 'priv' in node['tags']: G.remove_node(n) + def removeWhitepapers(G): for n in list(G.nodes): node = G.nodes[n] @@ -146,6 +149,7 @@ def removeDangling(G, alsoBooks=False): if not len(G.adj[n]): G.remove_node(n) + def removeThinRecs(G, minCons=3): for n in list(G.nodes): node = G.nodes[n] @@ -153,6 +157,7 @@ def removeThinRecs(G, minCons=3): if not len(G.adj[n]) >= minCons: G.remove_node(n) + def removeEdge(G): for n in list(G.nodes): node = G.nodes[n] @@ -256,6 +261,7 @@ def pruneRecommenderCons(G, maxCons=5): if foundCon < 2: G.remove_node(m) + def pruneAuthorCons(G, maxCons=3): for n in list(G.nodes): node = G.nodes[n] @@ -281,6 +287,7 @@ def pruneAuthorCons(G, maxCons=3): if foundCon < 2: G.remove_node(m) + def removeHighSpanTags(G, maxCons=5): for n in list(G.nodes): node = G.nodes[n] @@ -293,7 +300,7 @@ def removeHighSpanReadBooks(G, maxCons=8): for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'book' and node['rating'] != None: - if sum([1 for adj in G.adj[n] if G.nodes[adj]['t']=='recommender']) > maxCons: + if sum([1 for adj in G.adj[n] if G.nodes[adj]['t'] == 'recommender']) > maxCons: G.remove_node(n) @@ -303,24 +310,28 @@ def removeTopLists(G): if node['t'] == 'topList': G.remove_node(n) + def removeRecommenders(G): for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'recommender': G.remove_node(n) + def removeAuthors(G): for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'author': G.remove_node(n) + def removeSeries(G): for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'series': G.remove_node(n) + def removeRestOfSeries(G): for n in list(G.nodes): node = G.nodes[n] @@ -336,17 +347,19 @@ def removeRestOfSeries(G): if adjNode['series_index'] > seriesState + 1.0001: G.remove_node(adj) + def removeUnusedRecommenders(G): for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'recommender': for adj in G.adj[n]: adjNode = G.nodes[adj] - if adjNode['t']=='book' and 'score' in adjNode: + if adjNode['t'] == 'book' and 'score' in adjNode: break - else: # No unrated recommendation + else: # No unrated recommendation G.remove_node(n) + def removeUselessReadBooks(G): minForce = 1.5 minContact = 2 @@ -360,14 +373,15 @@ def removeUselessReadBooks(G): contacts += 1 for cousin in G.adj[adj]: cousinNode = G.nodes[cousin] - if cousinNode['t']=='book' and 'score' in cousinNode or cousinNode['t'] == 'newBook': - if adjNode['t']=='recommender': + if cousinNode['t'] == 'book' and 'score' in cousinNode or cousinNode['t'] == 'newBook': + if adjNode['t'] == 'recommender': force += 0.5 else: force += 1 if force < minForce or contacts < minContact: G.remove_node(n) + def removeUselessTags(G, minUnread=1): for n in list(G.nodes): node = G.nodes[n] @@ -375,11 +389,12 @@ def removeUselessTags(G, minUnread=1): foundUnread = 0 for adj in G.adj[n]: adjNode = G.nodes[adj] - if adjNode['t']=='book' and 'score' in adjNode: + if adjNode['t'] == 'book' and 'score' in adjNode: foundUnread += 1 if foundUnread < minUnread: G.remove_node(n) + def removeUselessSeries(G, minSco=0): for n in list(G.nodes): node = G.nodes[n] @@ -387,6 +402,7 @@ def removeUselessSeries(G, minSco=0): if len(G.adj[n]) < 2 or node['score'] < minSco: G.remove_node(n) + def scoreOpinions(G, globMu, globStd): 
for n in list(G.nodes): node = G.nodes[n] @@ -406,6 +422,7 @@ def scoreOpinions(G, globMu, globStd): else: node['score'] = None + def scoreUnread(G, globMu, globStd): for n in list(G.nodes): feedbacks = [globMu] @@ -417,35 +434,40 @@ def scoreUnread(G, globMu, globStd): for adj in adjacens: adjNode = G.nodes[adj] if 'score' in adjNode and adjNode['score'] != None: - w = [adjNode['t'], G[n][adj]['weight'] if 'weight' in G[n][adj] else 1] + w = [adjNode['t'], G[n][adj]['weight'] + if 'weight' in G[n][adj] else 1] for fb in adjNode['feedbacks']: feedbacks.append(fb) ws.append(w) if len(feedbacks): node['mean'], node['std'] = norm.fit(feedbacks) - node['median'] = np.percentile(feedbacks, [50], method='linear')[0] + node['median'] = np.percentile( + feedbacks, [50], method='linear')[0] node['se'] = globStd / math.sqrt(len(feedbacks)) feedbacks.append(node['pagerank_score']) ws.append(['pagerank']) #feedbacks.append(10/math.ln10(10+node['tgb_rank']) if 'tgb_rank' in node else 0) - #ws.append(['tgb_rank']) + # ws.append(['tgb_rank']) feedbacks.append(node['std']) ws.append(['sigma']) feedbacks.append(node['median']) ws.append(['median']) - #feedbacks.append(node['se']) - #ws.append(['se']) + # feedbacks.append(node['se']) + # ws.append(['se']) feedbacks.append(globMu) ws.append(['bias']) - node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w)>1 else 1) for fb, w in zip(feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w)>1 else 1) for w in ws]) + node['score'] = sum([fb*getWeightForType(w[0], w[1] if len(w) > 1 else 1) for fb, w in zip( + feedbacks, ws)])/sum([getWeightForType(w[0], w[1] if len(w) > 1 else 1) for w in ws]) node['_act'] = feedbacks node['_wgh'] = ws else: - node['score'] = globMu + errorFac*globStd + len(feedbacks)*0.0000000001 + node['score'] = globMu + errorFac * \ + globStd + len(feedbacks)*0.0000000001 if 'series' in node: if node['series_index'] == 1.0: node['score'] += 0.000000001 + def getWeightForType(nodeType, edgeWeight=1): global weights w = weights[nodeType] @@ -454,6 +476,7 @@ def getWeightForType(nodeType, edgeWeight=1): else: return w + def printBestList(G, t='book', num=-1): bestlist = [] for n in list(G.nodes): @@ -461,14 +484,16 @@ def printBestList(G, t='book', num=-1): if node['t'] == t: if 'score' in node and node['score'] != None: bestlist.append(node) - bestlist.sort(key=lambda node: node['score'] + 0.00001*(node['se'] if 'se' in node else 0), reverse=True) + bestlist.sort(key=lambda node: node['score'] + 0.00001 * + (node['se'] if 'se' in node else 0), reverse=True) for i, book in enumerate(bestlist): - if t=='book': - line = book['title'] + " ("+" & ".join(book['authors'])+")"+": {:.5f}".format(book['score']) + if t == 'book': + line = book['title'] + " ("+" & ".join(book['authors'])+")" + \ + ": {:.5f}".format(book['score']) else: line = book['label'] - print("["+str(i+1).zfill(int((math.log10(num) if num!=-1 else 3)+1))+"] "+line) - if num!=-1 and i == num-1: + print("["+str(i+1).zfill(int((math.log10(num) if num != -1 else 3)+1))+"] "+line) + if num != -1 and i == num-1: break @@ -478,42 +503,46 @@ def readColor(book): else: return 'gray' + def loadBooksFromDB(): books = calibreDB.getBooks() infuseDataFromMRB(books) - #infuseDataFromTGB(books) + # infuseDataFromTGB(books) return books + def mrbGetBook(mrbdf, title, authors): title = title.split('(')[0] - title = title.replace('*','') + title = title.replace('*', '') pot = mrbdf[mrbdf['title'].str.contains(title)] dic = pot.to_dict(orient='records') for d in dic: for author in authors: 
parts = author.split(" ") for part in [parts[0], parts[-1]]: - if d['author'].find(part)==-1: + if d['author'].find(part) == -1: break else: return d return False + def tgbGetBook(df, title, authors): title = title.split('(')[0] - title = title.replace('*','') + title = title.replace('*', '') pot = df[df['title'].str.contains(title)] dic = pot.to_dict(orient='records') for d in dic: for author in authors: parts = author.split(" ") for part in [parts[0], parts[-1]]: - if d['author'].find(part)==-1: + if d['author'].find(part) == -1: break else: return d return False + def infuseDataFromMRB(books): mrbdf = pd.read_csv('rec_dbs/mrb_db.csv') for book in books: @@ -522,20 +551,23 @@ def infuseDataFromMRB(books): for rec in str(mrb['recommender']).split('|'): book['tags'] += [rec + ':MRB'] + def infuseDataFromTGB(books): - for i in range(1,3): + for i in range(1, 3): df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv') for book in books: tgb = tgbGetBook(df, book['title'], book['authors']) if tgb: book['tgb_rank'] = int(tgb['id']) + class calibreDB(): @classmethod def _getTxt(cls, request): ret = os.popen("calibredb "+request).read() if not ret: - raise Error('Unable to connect to CalibreDB. Please close all open instances of Calibre.') + raise Error( + 'Unable to connect to CalibreDB. Please close all open instances of Calibre.') return ret @classmethod @@ -557,7 +589,8 @@ class calibreDB(): cols = cls.getCustomColumns() avai = ['calice_score' in cols, 'calice_rating' in cols] if not any(avai): - raise Error('Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.') + raise Error( + 'Custom Columns missing from CalibreDB. Create columns for "Calice Score" and/or "Calice Rating" using the "createCaliceColumn" command.') return avai @classmethod @@ -586,9 +619,12 @@ class calibreDB(): cls._getTxt('set_custom calice_score '+str(bookId)+' ""') else: if sco: - cls._getTxt('set_custom calice_score '+str(bookId)+' '+str(round(score,5))) + cls._getTxt('set_custom calice_score ' + + str(bookId)+' '+str(round(score, 5))) if rat: - cls._getTxt('set_custom calice_rating '+str(bookId)+' '+str(int(round(score)))) + cls._getTxt('set_custom calice_rating ' + + str(bookId)+' '+str(int(round(score)))) + def calice(G): scores = {} @@ -602,28 +638,31 @@ def calice(G): calibreDB.writeCaliceColumnMultiple(scores) print('Done.') + def remove_html_tags(text): clean = re.compile('<.*?>') return re.sub(clean, '', text) -def getKeywords(txt,rake): + +def getKeywords(txt, rake): txt = remove_html_tags(txt) k = [] rake.extract_keywords_from_text(txt) kws = rake.get_ranked_phrases_with_scores() - for i,(score,kw) in enumerate(kws): + for i, (score, kw) in enumerate(kws): l = len(kw.split(' ')) - if kw.lower() not in ['p', 'die', 'best', 'known', 'fk', 'p pp', 'one'] and len(kw)>3 and kw.find('div')==-1 and kw.lower().find('p p')==-1: - k.append((score**(1/(l*0.4)),kw)) - k.sort(key=lambda x: x[0],reverse=True) + if kw.lower() not in ['p', 'die', 'best', 'known', 'fk', 'p pp', 'one'] and len(kw) > 3 and kw.find('div') == -1 and kw.lower().find('p p') == -1: + k.append((score**(1/(l*0.4)), kw)) + k.sort(key=lambda x: x[0], reverse=True) if k: minSco = k[0][0]/3*2 - for i,kw in enumerate(k): + for i, kw in enumerate(k): if kw[0] < minSco: - return [(sco,word.title()) for sco,word in k[:i]] + return [(sco, word.title()) for sco, word in k[:i]] return k return [] + def runPagerank(G): try: scores = nx.pagerank(G=G) @@ -634,6 +673,7 @@ def runPagerank(G): 
for n in list(G.nodes): G.nodes[n]['pagerank_score'] = scores[n] if n in scores else 0 + def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True): G = nx.Graph() if extractKeywords: @@ -652,8 +692,9 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True): else: desc = '' if 'comments' in book and extractKeywords: - sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n',' ') - keywords = getKeywords(sanitized,rake) + sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', + '', book['comments']).replace('\n', ' ') + keywords = getKeywords(sanitized, rake) else: keywords = [] if mergeTags: @@ -664,29 +705,33 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True): else: series = None series_index = None - G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords, desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index, calibreID=book['id']) + G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords, + desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index, calibreID=book['id']) return G + def getWikiImage(search_term): from fuzzywuzzy import fuzz WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=' try: print('[i] Searching for >'+search_term+'< on WikiPedia...') - result = wikipedia.search(search_term, results = 1) + result = wikipedia.search(search_term, results=1) if fuzz.ratio(search_term, result) < 50: raise Exception('blub') wikipedia.set_lang('en') - wkpage = wikipedia.WikipediaPage(title = result[0]) + wkpage = wikipedia.WikipediaPage(title=result[0]) title = wkpage.title - response = requests.get(WIKI_REQUEST+title) + response = requests.get(WIKI_REQUEST+title) json_data = json.loads(response.text) - img_link = list(json_data['query']['pages'].values())[0]['original']['source'] + img_link = list(json_data['query']['pages'].values())[ + 0]['original']['source'] return img_link except: print('[!] 
No match for '+search_term+' on WikiPedia...') return None + def graphAddAuthors(G, books, darkMode=False): for author in getAllAuthors(books): G.add_node('a/'+author, color='green', t='author', label=author) @@ -695,6 +740,7 @@ def graphAddAuthors(G, books, darkMode=False): G.add_edge('a/'+author, book['id'], color=readColor(book)) return G + def graphAddRecommenders(G, books, darkMode=False): for rec in getAllRecommenders(books): G.add_node('r/'+rec, color='orange', t='recommender', label=rec) @@ -703,6 +749,7 @@ def graphAddRecommenders(G, books, darkMode=False): G.add_edge('r/'+rec, book['id'], color=readColor(book)) return G + def graphAddTopLists(G, books, darkMode=False): for tl in getAllTopLists(books): G.add_node('t/'+tl, color='yellow', t='topList', label=tl) @@ -715,7 +762,8 @@ def graphAddTopLists(G, books, darkMode=False): def graphAddSeries(G, books, darkMode=False): for series in getAllSeries(books): - G.add_node('s/'+series, color='red', t='series', label=series, shape='triangle') + G.add_node('s/'+series, color='red', t='series', + label=series, shape='triangle') for book in books: if 'series' in book: G.add_edge('s/'+book['series'], book['id'], color=readColor(book)) @@ -724,7 +772,8 @@ def graphAddSeries(G, books, darkMode=False): def graphAddTags(G, books, darkMode=False): for tag in getAllTags(books): - G.add_node('t/'+tag, color=['lightGray','darkgray'][darkMode], t='tag', label=tag, shape='box') + G.add_node('t/'+tag, color=['lightGray', 'darkgray'] + [darkMode], t='tag', label=tag, shape='box') for book in books: for tag in getTags(book): G.add_edge('t/'+tag, book['id'], color=readColor(book)) @@ -770,7 +819,8 @@ def addScoreToLabels(G): node['label'] += " ("+str(node['rating'])+")" else: if 'score' in node and node['score'] != None and 'se' in node: - node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['se']) + node['label'] += " ({:.2f}±{:.1f})".format( + node['score'], node['se']) else: node['label'] += " (0±∞)" @@ -778,7 +828,7 @@ def addScoreToLabels(G): def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False): net = Network('1050px', '1900px', directed=arrows, - bgcolor=['#FFFFFF','#181818'][darkMode]) + bgcolor=['#FFFFFF', '#181818'][darkMode]) if showButtons: net.show_buttons(filter_=['configure', 'layout', 'interaction', 'physics', 'edges']) @@ -792,10 +842,10 @@ def genAndShow3D(G, darkMode=False): node_cols = [] for n in G.nodes: node = G.nodes[n] - if node['t']=='tag': + if node['t'] == 'tag': node_cols.append('gray') - elif node['t']=='book': - if 'score' in node: # unread book + elif node['t'] == 'book': + if 'score' in node: # unread book node_cols.append('lightblue') else: node_cols.append('magenta') @@ -807,69 +857,70 @@ def genAndShow3D(G, darkMode=False): node_labels.append(node['label']) node_sizes.append((node['value']/8)**1.5) - spring = nx.spring_layout(G,dim=3, seed=random.randint(0, 65536)) - x_nodes = [spring[p][0] for p in spring]# x-coordinates of nodes - y_nodes = [spring[p][1] for p in spring]# y-coordinates - z_nodes = [spring[p][2] for p in spring]# z-coordinates + spring = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536)) + x_nodes = [spring[p][0] for p in spring] # x-coordinates of nodes + y_nodes = [spring[p][1] for p in spring] # y-coordinates + z_nodes = [spring[p][2] for p in spring] # z-coordinates - x_edges=[] - y_edges=[] - z_edges=[] + x_edges = [] + y_edges = [] + z_edges = [] for edge in G.edges(): - x_coords = [spring[edge[0]][0],spring[edge[1]][0],None] + x_coords = [spring[edge[0]][0], 
spring[edge[1]][0], None] x_edges += x_coords - y_coords = [spring[edge[0]][1],spring[edge[1]][1],None] + y_coords = [spring[edge[0]][1], spring[edge[1]][1], None] y_edges += y_coords - z_coords = [spring[edge[0]][2],spring[edge[1]][2],None] + z_coords = [spring[edge[0]][2], spring[edge[1]][2], None] z_edges += z_coords trace_edges = go.Scatter3d(x=x_edges, - y=y_edges, - z=z_edges, - mode='lines', - line=dict(color='black', width=2), - hoverinfo='none') + y=y_edges, + z=z_edges, + mode='lines', + line=dict(color='black', width=2), + hoverinfo='none') trace_nodes = go.Scatter3d(x=x_nodes, - y=y_nodes, - z=z_nodes, - mode='markers', - marker=dict(symbol='circle', - size=node_sizes, - color=node_cols, #color the nodes according to their community - #colorscale=['lightgreen','magenta'], #either green or mageneta - line=dict(color='gray', width=0.5)), - text=node_labels, - hoverinfo='text') + y=y_nodes, + z=z_nodes, + mode='markers', + marker=dict(symbol='circle', + size=node_sizes, + color=node_cols, # color the nodes according to their community + # colorscale=['lightgreen','magenta'], #either green or mageneta + line=dict(color='gray', width=0.5)), + text=node_labels, + hoverinfo='text') axis = dict(showbackground=False, - showline=False, - zeroline=False, - showgrid=False, - showticklabels=False, - title='') + showline=False, + zeroline=False, + showgrid=False, + showticklabels=False, + title='') layout = go.Layout(title="", - width=1920, - height=1080, - plot_bgcolor=['#FFFFFF','#181818'][darkMode], - paper_bgcolor=['#FFFFFF','#181818'][darkMode], - showlegend=False, - scene=dict(xaxis=dict(axis), - yaxis=dict(axis), - zaxis=dict(axis), - ), - margin=dict(l=0, r=0, b=0, t=0), - hovermode='closest') + width=1920, + height=1080, + plot_bgcolor=['#FFFFFF', '#181818'][darkMode], + paper_bgcolor=['#FFFFFF', '#181818'][darkMode], + showlegend=False, + scene=dict(xaxis=dict(axis), + yaxis=dict(axis), + zaxis=dict(axis), + ), + margin=dict(l=0, r=0, b=0, t=0), + hovermode='closest') data = [trace_edges, trace_nodes] fig = go.Figure(data=data, layout=layout) fig.show() + def buildFullGraph(darkMode=False): books = loadBooksFromDB() G = buildBookGraph(books, darkMode=darkMode) @@ -890,9 +941,10 @@ def genScores(G, books, calcPagerank=True): scoreUnread(G, globMu, globStd) return globMu, globStd + def addImageToNode(node, cache, shape='circularImage'): - name = node['label'].split(' (')[0].replace('*','') - if not name in cache or (cache[name]==False and random.random()<0.05): + name = node['label'].split(' (')[0].replace('*', '') + if not name in cache or (cache[name] == False and random.random() < 0.05): term = name img = getWikiImage(term) if img: @@ -903,8 +955,9 @@ def addImageToNode(node, cache, shape='circularImage'): img = cache[name] if img: #node['imagePadding'] = '100px' - node['image']=img - node['shape']=shape + node['image'] = img + node['shape'] = shape + def addImagesToNodes(G): try: @@ -915,10 +968,12 @@ def addImagesToNodes(G): for n in list(G.nodes): node = G.nodes[n] if node['t'] in ['recommender', 'author']: - addImageToNode(node, cache, ['circularImage','image'][node['t']=='author']) + addImageToNode( + node, cache, ['circularImage', 'image'][node['t'] == 'author']) with open('.imgLinkCache.json', 'w') as cf: cf.write(json.dumps(cache)) + def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True): removeRestOfSeries(G) removeBad(G, mu-std*2-1) @@ -976,6 +1031,7 @@ def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True): 
scaleOpinionsByRating(G) addScoreToLabels(G) + def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False): removeRestOfSeries(G) removeBad(G, mu-std-0.5) @@ -1005,7 +1061,7 @@ def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommende removeUselessSeries(G, mu) removeUselessTags(G) removeUselessReadBooks(G) - removeThinRecs(G, 2 + 1 * (n>20 and not v3d)) + removeThinRecs(G, 2 + 1 * (n > 20 and not v3d)) removeKeepBest(G, n, maxDistForRead=1.25) scaleBooksByRating(G) @@ -1035,8 +1091,9 @@ def fullGraph(G, removeTopListsB=True): scaleOpinionsByRating(G) addScoreToLabels(G) + def recommenderCompetence(G): - #removeRead(G) + # removeRead(G) removeUnread(G) removeTags(G) removeAuthors(G) @@ -1049,7 +1106,7 @@ def recommenderCompetence(G): scaleBooksByRating(G) scaleOpinionsByRating(G) addScoreToLabels(G) - + for n in list(G.nodes): node = G.nodes[n] if node['t'] == 'recommender': @@ -1060,6 +1117,7 @@ def recommenderCompetence(G): node['score'] = 0 node['score'] /= 2 + def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True): removeUnread(G) removeBad(G, minRating) @@ -1075,21 +1133,22 @@ def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, scaleOpinionsByRating(G) addScoreToLabels(G) + def progress(G, books, mu, minimum=3.5): - findNewBooks(G, books, mu, -1, minRecSco = minimum) + findNewBooks(G, books, mu, -1, minRecSco=minimum) bookCount = 0 libCount = 0 readCount = 0 toReadCount = 0 for n in list(G.nodes): node = G.nodes[n] - if node['t'] in ['book','newBook']: + if node['t'] in ['book', 'newBook']: if node['t'] == 'book': - libCount +=1 + libCount += 1 bookCount += 1 if 'rating' in node and node['rating'] != None: readCount += 1 - elif 'score' in node and (node['score'] >= minimum or 'std' in node and node['std']==0.0): + elif 'score' in node and (node['score'] >= minimum or 'std' in node and node['std'] == 0.0): toReadCount += 1 perc = round(readCount / (toReadCount+readCount) * 100, 2) print('Books in library: '+str(libCount)) @@ -1107,8 +1166,8 @@ def analyze(G, books, mu, type_name, name, dist=2.1): bestRatio, match, n = 0, None, 0 for ni in list(G.nodes): node = G.nodes[ni] - if node['t'] == type_name or type_name=="any": - if name==node['label'] or full_name==node['label']: + if node['t'] == type_name or type_name == "any": + if name == node['label'] or full_name == node['label']: match, n = node, ni break ratio = fuzz.ratio(node['label'], name) @@ -1138,12 +1197,13 @@ def analyze(G, books, mu, type_name, name, dist=2.1): addScoreToLabels(G) match['label'] = "*"+match['label']+"*" + def waveFlow(G, node, n, dist, menge, firstEdge=False): if dist <= 0: return dist -= 1 - if menge==set(): - firstEdge=True + if menge == set(): + firstEdge = True if node['t'] in ['topList']: if firstEdge: menge.add(n) @@ -1151,7 +1211,7 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False): menge.add(n) if node['t'] in ['tag']: if firstEdge: - dist-=0.1 + dist -= 0.1 else: return bestlist = [] @@ -1167,7 +1227,8 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False): book['score'] = 0 bestlist.append(book) bestlist.sort(key=lambda node: node['score'], reverse=True) - toKeep = min(int(dist*10), math.ceil(len(bestlist) * dist - len(keeplist)*0.5)) + toKeep = min(int(dist*10), math.ceil(len(bestlist) + * dist - len(keeplist)*0.5)) if toKeep <= 0: keeplist.sort(key=lambda node: node['rating'], reverse=True) keeplist = keeplist[:min(int(dist*10), int(len(keeplist) * 
dist))] @@ -1180,6 +1241,7 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False): if node in bestlist or node in keeplist: waveFlow(G, node, m, dist, menge, firstEdge=firstEdge) + def gensimTokensForLines(lines): for i, line in enumerate(lines): tokens = gensim.utils.simple_preprocess(line) @@ -1189,6 +1251,7 @@ def gensimTokensForLines(lines): # For training data, add tags yield gensim.models.doc2vec.TaggedDocument(tokens, [i]) + def buildDoc2Vec(books): import gensim for n in list(G.nodes): @@ -1197,13 +1260,15 @@ def buildDoc2Vec(books): pass gensimTokensForLines(lines) + def shell(G, books, mu, std): from ptpython.repl import embed embed(globals(), locals()) + def newBooks(G, books, num, mu, std): removeBad(G, mu-std*2) - findNewBooks(G, books, mu, num, minRecSco = mu-std) + findNewBooks(G, books, mu, num, minRecSco=mu-std) removeThinRecs(G, 2) removeUnread(G) removeUselessReadBooks(G) @@ -1226,21 +1291,29 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5): if node['t'] == 'recommender' and 'score' in node: oldBooks = [] newBooks = [] - recBooks = mrbdf[mrbdf['recommender'].str.contains(node['label'])].to_dict(orient='records') + recBooks = mrbdf[mrbdf['recommender'].str.contains( + node['label'])].to_dict(orient='records') for book in recBooks: if book['title'] in [b['title'] for b in books]: - oldBooks.append({'title': book['title'], 'author': book['author']}) + oldBooks.append( + {'title': book['title'], 'author': book['author']}) else: - newBooks.append({'title': book['title'], 'author': book['author']}) - recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks}) + newBooks.append( + {'title': book['title'], 'author': book['author']}) + recs.append({'name': node['label'], 'rec': node, + 'newBooks': newBooks, 'oldBooks': oldBooks}) for rec in recs: for book in rec['newBooks']: - G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'], author=book['author']) + G.add_node('n/'+book['title'], color='blue', t='newBook', + label=book['title'], author=book['author']) - G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score']) - G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue') + G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', + label=rec['rec']['label'], score=rec['rec']['score']) + G.add_edge('r/'+rec['rec']['label'], 'n/' + + book['title'], color='blue') - G.add_node('a/'+book['author'], color='green', t='author', label=book['author']) + G.add_node('a/'+book['author'], color='green', + t='author', label=book['author']) G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue') for n in list(G.nodes): node = G.nodes[n] @@ -1249,7 +1322,7 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5): scores = [] for m in list(G.adj[n]): adj = G.nodes[m] - if adj['t'] == 'recommender' and adj['score']!=None: + if adj['t'] == 'recommender' and adj['score'] != None: scores.append(adj['score']) ses.append(adj['se']) if not len(scores): @@ -1257,19 +1330,25 @@ def findNewBooks(G, books, mu, num=-1, minRecSco=5): else: ses.append(min(ses)) scores.append(mu) - node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==2) # This is not how SE works. DILLIGAF? - node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*1.6 + 0.5 - 0.1/math.sqrt(len(scores)) - if len(scores)==2: - node['score']*=0.80 + # This is not how SE works. DILLIGAF? 
+ node['fake_se'] = sum(ses)/(len(ses)**1.2) + \ + 0.5 + 0.5 * (len(scores) == 2) + node['score'] = sum( + scores)/len(scores)*1.2 - node['fake_se']*1.6 + 0.5 - 0.1/math.sqrt(len(scores)) + if len(scores) == 2: + node['score'] *= 0.80 node['value'] = 20 + 5 * float(node['score']) - node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se']) + node['label'] += " ({:.2f}±{:.1f})".format(node['score'], + node['fake_se']) node['label'] += '\n ' + node['author'] - if num!=-1: + if num != -1: removeKeepBest(G, num, 10, 'newBook') # while batchSize is implemented, we only get a good gonvergence when we disable it (batchSize=-1) # but might be necessary to enable later for a larger libary for better training performance... # maybe try again for 128 books? + + def evaluateFitness(books, batchSize=-1, debugPrint=False): global weights G = buildBookGraph(books) @@ -1280,7 +1359,8 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False): graphAddTags(G, books) runPagerank(G) - ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None] + ratedBooks = [n for n in list( + G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None] boundsLoss = 0 linSepLoss = [] errSq = [] @@ -1289,20 +1369,24 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False): gradient[wt] = 0 mu, sigma = genScores(G, books) for b in G.nodes: - batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks + batch = random.sample(ratedBooks, batchSize) if batchSize != - \ + 1 and len(ratedBooks) > batchSize else ratedBooks if b in batch: rating = G.nodes[b]['rating'] G.nodes[b]['rating'] = None _, _ = genScores(G, books, calcPagerank=False) - if G.nodes[b]['score'] > rating: # over estimated + if G.nodes[b]['score'] > rating: # over estimated errSq.append(((rating - G.nodes[b]['score'])**2)*2) else: errSq.append((rating - G.nodes[b]['score'])**2) G.nodes[b]['rating'] = rating for wt in weights: - scoreB = sum([a*(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for a,w in zip(G.nodes[b]['_act'], G.nodes[b]['_wgh'])])/sum([(1.001 if wt==w[0] else 1)*weights[w[0]]*(w[1] if len(w)>1 else 1) for w in G.nodes[b]['_wgh']]) - gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000 - regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1 + scoreB = sum([a*(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1) for a, w in zip(G.nodes[b]['_act'], + G.nodes[b]['_wgh'])])/sum([(1.001 if wt == w[0] else 1)*weights[w[0]]*(w[1] if len(w) > 1 else 1) for w in G.nodes[b]['_wgh']]) + gradient[wt] += ((rating - G.nodes[b]['score']) + ** 2 - (rating - scoreB)**2)*1000 + # no punishment if w within -1 and 1 + regressionLoss = sum([max(0, abs(w)-1)**2 for w in weights.values()]) for wt in weights: if abs(weights[wt]) > 1.0: gradient[wt] -= weights[wt]*10 @@ -1315,6 +1399,55 @@ def evaluateFitness(books, batchSize=-1, debugPrint=False): fit = sum(errSq)/len(errSq) + 0.001*regressionLoss return fit, gradient + +def calcDissonance(books): + global weights + G = buildBookGraph(books) + graphAddAuthors(G, books) + graphAddRecommenders(G, books) + graphAddTopLists(G, books) + graphAddSeries(G, books) + graphAddTags(G, books) + runPagerank(G) + + ratedBooks = [n for n in list( + G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None] + errSq = [] + gradient = {} + for wt in weights: + gradient[wt] = 0 + mu, sigma = 
genScores(G, books) + for b in G.nodes: + batch = ratedBooks + if b in batch: + rating = G.nodes[b]['rating'] + G.nodes[b]['rating'] = None + _, _ = genScores(G, books, calcPagerank=False) + G.nodes[b]['_test_score'] = G.nodes[b]['score'] + G.nodes[b]['rating'] = rating + G.nodes[b]['dissonance_off'] = rating - G.nodes[b]['score'] + G.nodes[b]['dissonance_abs'] = abs(rating - G.nodes[b]['score']) + + return G + + +def describeDissonance(books, num=-1, sortKey='dissonance_abs', sortDir=True): + bestlist = [] + G = calcDissonance(books) + for n in list(G.nodes): + node = G.nodes[n] + if 'dissonance_abs' in node: + bestlist.append(node) + bestlist.sort(key=lambda node: node[sortKey], reverse=sortDir) + for i, book in enumerate(bestlist): + line = book['title'] + " ("+" & ".join(book['authors'])+")" + \ + ": You: {:.5f}, AI: {:.5f}, Delta: {:.5f}".format( + book['rating'], book['_test_score'], book['dissonance_off']) + print("["+str(i+1).zfill(int((math.log10(num) if num != -1 else 3)+1))+"] "+line) + if num != -1 and i == num-1: + break + + def train(initGamma, full=True): global weights if full: @@ -1367,18 +1500,22 @@ def train(initGamma, full=True): break print('Done.') + def saveWeights(weights): with open('neuralWeights.json', 'w') as f: f.write(json.dumps(weights)) + def loadWeights(): try: with open('neuralWeights.json', 'r') as f: weights = json.loads(f.read()) except IOError: - weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "median": 0.10} #, "tgb_rank": 0.10} + weights = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, + "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25, "median": 0.10} # , "tgb_rank": 0.10} return weights + def cliInterface(imgDef=False): import argparse @@ -1403,13 +1540,16 @@ def cliInterface(imgDef=False): cmds = parser.add_subparsers(required=True, dest='cmd') p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec']) - p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend') + p_rec.add_argument('-n', type=int, default=20, + help='number of books to recommend') p_rec.add_argument('--tag-based', action="store_true") p_rec.add_argument('--recommender-based', action="store_true") - p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend') + p_rec.add_argument('--new', type=int, default=-1, + help='number of new books to recommend') p_rec = cmds.add_parser('listScores', description="TODO", aliases=['ls']) - p_rec.add_argument('-n', type=int, default=50, help='number of books to recommend') + p_rec.add_argument('-n', type=int, default=50, + help='number of books to recommend') p_read = cmds.add_parser('read', description="TODO", aliases=[]) p_read.add_argument('--min-rating', type=int, default=0) @@ -1417,29 +1557,42 @@ def cliInterface(imgDef=False): p_read.add_argument('--only-connected', action="store_true") p_show = cmds.add_parser('analyze', description="TODO", aliases=[]) - p_show.add_argument('type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag']) + p_show.add_argument( + 'type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag']) p_show.add_argument('name', type=str) - p_show.add_argument('-d', type=float, default=2.1, help='depth of expansion') + p_show.add_argument('-d', type=float, default=2.1, + help='depth of expansion') p_train = cmds.add_parser('train', description="TODO", aliases=[]) -
p_train.add_argument('-g', type=float, default=0.2, help='learning rate gamma') + p_train.add_argument('-g', type=float, default=0.2, + help='learning rate gamma') p_train.add_argument('--full', action="store_true") p_prog = cmds.add_parser('progress', description="TODO", aliases=[]) - p_prog.add_argument('-m', type=float, default=7, help='Mimimum Score to read') + p_prog.add_argument('-m', type=float, default=7, + help='Minimum Score to read') p_comp = cmds.add_parser('competence', description="TODO", aliases=[]) p_shell = cmds.add_parser('shell', description="TODO", aliases=[]) - + p_new = cmds.add_parser('newBooks', description="TODO", aliases=[]) - p_new.add_argument('-n', type=int, default=10, help='number of books to recommend') - - p_col = cmds.add_parser('calice', description="TODO", aliases=[]) - - p_createCol = cmds.add_parser('createCaliceColumn', description="TODO", aliases=[]) + p_new.add_argument('-n', type=int, default=10, + help='number of books to recommend') + + p_cal = cmds.add_parser('calice', description="TODO", aliases=[]) + + p_dis = cmds.add_parser('dissonance', description="TODO", aliases=['dis']) + p_dis.add_argument('-n', type=int, default=-1, + help='Maximum number of books to list') + p_dis.add_argument( + '--sort', choices=['dissonance_abs', 'dissonance_off', 'score'], default='dissonance_abs', const='dissonance_abs', nargs='?') + p_dis.add_argument('--reversed', action="store_true") + + p_createCol = cmds.add_parser( + 'createCaliceColumn', description="TODO", aliases=[]) p_createCol.add_argument('type', choices=['score', 'rating', 'both']) - + p_full = cmds.add_parser('full', description="TODO", aliases=[]) args = parser.parse_args() @@ -1452,6 +1605,7 @@ def cliInterface(imgDef=False): else: mainCLI(args) + def perfTestCLI(args): import time from pycallgraph import PyCallGraph @@ -1466,8 +1620,9 @@ def perfTestCLI(args): with PyCallGraph(output=GraphvizOutput(output_file='perfTests/' + str(int(time.time())) + '.png'), config=config): mainCLI(args) + def mainCLI(args): - if args.cmd=="train": + if args.cmd == "train": train(args.g, args.full) exit() @@ -1482,55 +1637,62 @@ def mainCLI(args): if not args.keep_whitepapers: removeWhitepapers(G) - - if args.cmd=="recommend": - if args.new==-1: + if args.cmd == "recommend": + if args.new == -1: args.new = int(args.n / 5) if args.new != 0: - findNewBooks(G, books, mu, args.new, minRecSco = mu-std) + findNewBooks(G, books, mu, args.new, minRecSco=mu-std) if args.tag_based: if args.recommender_based: - raise Exception('tag-based and recommender-based can not be be combined') - recommendNBooksTagBased(G, mu, std, args.n, not args.keep_top_lists) + raise Exception( + 'tag-based and recommender-based cannot be combined') + recommendNBooksTagBased( + G, mu, std, args.n, not args.keep_top_lists) elif args.recommender_based: - recommendNBooksRecommenderBased(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders) + recommendNBooksRecommenderBased( + G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders) else: - recommendNBooks(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders, args.v3d) - elif args.cmd=="listScores": + recommendNBooks(G, mu, std, args.n, not args.keep_top_lists, + not args.keep_useless_recommenders, args.v3d) + elif args.cmd == "listScores": listScores(G, mu, std, args.n) - elif args.cmd=="read": - readBooksAnalysis(G, args.min_rating, args.all_tags, args.only_connected, not args.keep_top_lists) + elif args.cmd == "read": + readBooksAnalysis(G, args.min_rating, args.all_tags, + args.only_connected, not args.keep_top_lists) - elif
args.cmd=="analyze": + elif args.cmd == "read": + readBooksAnalysis(G, args.min_rating, args.all_tags, + args.only_connected, not args.keep_top_lists) + elif args.cmd == "analyze": analyze(G, books, mu, args.type, args.name, args.d) - elif args.cmd=="full": + elif args.cmd == "full": fullGraph(G, not args.keep_top_lists) - elif args.cmd=="competence": + elif args.cmd == "competence": bestListT = 'recommender' recommenderCompetence(G) - elif args.cmd=="shell": + elif args.cmd == "shell": shell(G, books, mu, std) - elif args.cmd=="progress": + elif args.cmd == "progress": progress(G, books, mu, args.m) return - elif args.cmd=="newBooks": + elif args.cmd == "newBooks": bestListT = 'newBook' newBooks(G, books, args.n, mu, std) - elif args.cmd=="calice": + elif args.cmd == "calice": calice(G) exit() - elif args.cmd=="createCaliceColumn": + elif args.cmd == "dissonance": + describeDissonance(books, args.n, args.sort, not args.reversed) + exit() + elif args.cmd == "createCaliceColumn": if args.type in ['score', 'both']: calibreDB.createCaliceScoreColumn() print('[*] Column "Calice Score" was created.') if args.type in ['rating', 'both']: calibreDB.createCaliceRatingColumn() print('[*] Column "Calice Rating" was created.') - print('[i] To allow displaying half-stars, please active them manually in the calibre-settings.') + print( + '[i] To allow displaying half-stars, please active them manually in the calibre-settings.') exit() else: raise Exception("Bad") - if not args.keep_priv: removePriv(G) if args.remove_read: