CaliGraph/caliGraph.py

#!./.venv/bin/python3.10
import os
import re
import json
import math
import copy
import random
import requests

from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import norm

import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network
import plotly.graph_objects as go

import wikipedia

def getAllAuthors(books):
    """Return every distinct author name found in `books` as a list (order unspecified)."""
    return list({name for entry in books for name in getAuthors(entry)})


def getAuthors(book):
    """Split the book's ' & '-separated `authors` string into a list of names."""
    authorField = book['authors']
    return authorField.split(' & ')


def getRecommenders(book):
    """Return the recommender names encoded in the book's tags.

    A tag names a recommender when it contains one of the markers
    " Recommendation", "s Literature Club" or ":MRB"; the name is the tag
    with that marker removed. The result is a list without duplicates
    (order unspecified).
    """
    markers = (" Recommendation", "s Literature Club", ":MRB")
    found = set()
    for tag in book['tags']:
        # Only the first marker (in the order above) present in the tag is stripped.
        for marker in markers:
            if marker in tag:
                found.add(tag.replace(marker, ""))
                break
    return list(found)


def getTags(book):
    """Yield the book's plain tags, skipping recommender and top-list marker tags."""
    special = (" Recommendation", "s Literature Club", " Top ", ":MRB")
    for tag in book['tags']:
        if any(marker in tag for marker in special):
            continue
        yield tag


def getAllRecommenders(books):
    """Return every distinct recommender name found in `books` as a list (order unspecified)."""
    collected = set()
    for entry in books:
        collected.update(getRecommenders(entry))
    return list(collected)


def getTopLists(book):
    """Return the names of the top lists the book is tagged with.

    A top-list tag contains " Top "; the list name is the text before its
    first occurrence. Duplicates are collapsed, order is unspecified.
    """
    names = {tag.split(" Top ")[0] for tag in book['tags'] if " Top " in tag}
    return list(names)


def getAllTopLists(books):
    """Return every distinct top-list name found in `books` as a list (order unspecified)."""
    collected = set()
    for entry in books:
        collected.update(getTopLists(entry))
    return list(collected)


def getAllSeries(books):
    """Return the distinct `series` values of all books that have one (order unspecified)."""
    return list({entry['series'] for entry in books if 'series' in entry})


def getAllTags(books):
    """Return every distinct plain tag found in `books` as a list (order unspecified)."""
    collected = set()
    for entry in books:
        collected.update(getTags(entry))
    return list(collected)


def getTopListWeight(book, topList):
    """Return the edge weight for `book` on the top list named `topList`.

    The book's tags are searched for "<topList> Top <N>"; the smallest N
    found (the most exclusive placement) decides the weight: 10 -> 1,
    25 -> 0.85, 100 -> 0.5, anything else -> 50 / N.

    The list name is compared exactly, the same way getTopLists() derives
    it, so a list called "NYT" is not confused with one called "BNYT".

    Raises:
        ValueError: if the book carries no tag for `topList`.
    """
    scopes = []
    for tag in book['tags']:
        parts = tag.split(" Top ")
        if len(parts) > 1 and parts[0] == topList:
            scopes.append(int(parts[1]))
    if not scopes:
        raise ValueError("Book has no '" + topList + " Top N' tag")
    minScope = min(scopes)
    presets = {10: 1, 25: 0.85, 100: 0.5}
    if minScope in presets:
        return presets[minScope]
    return 50 / minScope


def removeRead(G):
    """Remove every book node that carries a rating (i.e. has been read)."""
    readBooks = [key for key in G.nodes
                 if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is not None]
    for key in readBooks:
        G.remove_node(key)


def removeUnread(G):
    """Remove every book node whose rating is None (i.e. not yet read)."""
    unreadBooks = [key for key in G.nodes
                   if G.nodes[key]['t'] == 'book' and G.nodes[key]['rating'] is None]
    for key in unreadBooks:
        G.remove_node(key)


def removePriv(G):
    """Remove every book node whose tag list contains 'priv'."""
    private = [key for key in G.nodes
               if G.nodes[key]['t'] == 'book' and 'priv' in G.nodes[key]['tags']]
    for key in private:
        G.remove_node(key)

def removeWhitepapers(G):
    """Remove every book node whose tag list contains 'whitepaper'."""
    papers = [key for key in G.nodes
              if G.nodes[key]['t'] == 'book' and 'whitepaper' in G.nodes[key]['tags']]
    for key in papers:
        G.remove_node(key)


def removeDangling(G, alsoBooks=False):
    """Remove nodes without any neighbour.

    Book nodes are only considered when `alsoBooks` is true.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book' and not alsoBooks:
            continue
        if len(G.adj[key]) == 0:
            G.remove_node(key)

def removeThinRecs(G, minCons=3):
    """Remove recommender nodes that have fewer than `minCons` neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        if len(G.adj[key]) < minCons:
            G.remove_node(key)

def removeEdge(G):
    """Remove every non-book node that has fewer than two neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] == 'book':
            continue
        if len(G.adj[key]) < 2:
            G.remove_node(key)


def removeBad(G, threshold, groups=('book', 'topList', 'recommender', 'author', 'series', 'tag')):
    """Remove nodes whose score is None or below `threshold`.

    Only nodes whose type is listed in `groups` and which have a 'score'
    attribute are considered; nodes without a 'score' key are kept.
    The default for `groups` is an immutable tuple so it cannot be altered
    between calls; any container supporting `in` may be passed.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in groups or 'score' not in node:
            continue
        if node['score'] is None or node['score'] < threshold:
            G.remove_node(n)


def removeKeepBest(G, num, maxDistForRead=1, forType='book'):
    """Keep only the `num` best-scored nodes of type `forType`.

    Removal candidates are nodes of `forType` outside the best list, plus
    nodes of any type whose 'score' is None. A candidate survives only if
    it has a rating that is not more than `maxDistForRead` below the lowest
    score in the best list.

    Note: when no node of `forType` has a score (or `num` is 0) the best
    list is empty, and checking a rated candidate raises IndexError.
    """
    scored = [G.nodes[key] for key in list(G.nodes)
              if G.nodes[key]['t'] == forType
              and 'score' in G.nodes[key] and G.nodes[key]['score'] != None]
    keepers = sorted(scored, key=lambda attrs: attrs['score'], reverse=True)[:num]

    for key in list(G.nodes):
        attrs = G.nodes[key]
        outsideBest = attrs['t'] == forType and attrs not in keepers
        unscored = 'score' in attrs and attrs['score'] == None
        if not (outsideBest or unscored):
            continue
        rating = attrs.get('rating')
        # The threshold is only looked up for rated candidates.
        if rating is None or rating < keepers[-1]['score'] - maxDistForRead:
            G.remove_node(key)


def removeTags(G):
    """Remove every node of type 'tag'."""
    for key in [k for k in G.nodes if G.nodes[k]['t'] == 'tag']:
        G.remove_node(key)


def pruneTags(G, minCons=2):
    """Remove tag nodes whose books are already well connected.

    For each tag, every neighbour of every adjacent book contributes to a
    connection total: recommenders 0.5, tags and series 0.25, top lists
    nothing, everything else 1. The tag is removed when the total exceeds
    `minCons`. Nodes are visited in ascending order of
    score + len(feedbacks)/5 (0 when either attribute is missing).
    """
    def rank(key):
        attrs = G.nodes[key]
        if 'score' in attrs and 'feedbacks' in attrs:
            return attrs['score'] + len(attrs['feedbacks'])/5
        return 0

    weightByType = {'recommender': 0.5, 'tag': 0.25, 'series': 0.25}
    for key in sorted(list(G.nodes), key=rank):
        if G.nodes[key]['t'] != 'tag':
            continue
        strength = 0
        for book in G.adj[key]:
            for other in G.adj[book]:
                kind = G.nodes[other]['t']
                if kind != 'topList':
                    strength += weightByType.get(kind, 1)
        if strength > minCons:
            G.remove_node(key)


def pruneRecommenders(G, minCons=2):
    """Remove recommender nodes whose books are already well connected.

    For each recommender, every neighbour of every adjacent book adds to a
    connection total: recommenders 0.5, tags and series 0.25, top lists
    nothing, everything else 1. The recommender is removed when the total
    exceeds `minCons`. Nodes are visited in ascending score order (0 when
    the node has no 'score' attribute).
    """
    def rank(key):
        attrs = G.nodes[key]
        return attrs['score'] if 'score' in attrs else 0

    weightByType = {'recommender': 0.5, 'tag': 0.25, 'series': 0.25}
    for key in sorted(list(G.nodes), key=rank):
        if G.nodes[key]['t'] != 'recommender':
            continue
        strength = 0
        for book in G.adj[key]:
            for other in G.adj[book]:
                kind = G.nodes[other]['t']
                if kind != 'topList':
                    strength += weightByType.get(kind, 1)
        if strength > minCons:
            G.remove_node(key)


def pruneRecommenderCons(G, maxCons=5):
    """Thin out the books attached to over-connected recommenders.

    For every recommender with more than `maxCons` neighbours, the
    `maxCons` best-scored adjacent books are protected. Any other adjacent
    book (or adjacent node whose score is None) is removed when it is
    unrated and has fewer than two neighbours that are not top lists.
    """
    for n in list(G.nodes):
        # A book removed while handling an earlier recommender is still part
        # of the snapshot; skip it instead of failing on the lookup.
        if n not in G.nodes:
            continue
        node = G.nodes[n]
        if node['t'] != 'recommender' or len(G.adj[n]) <= maxCons:
            continue

        scored = [G.nodes[m] for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None]
        bestlist = sorted(scored, key=lambda book: book['score'], reverse=True)[:maxCons]

        for m in list(G.adj[n]):
            book = G.nodes[m]
            if book['t'] == 'book' and book not in bestlist or 'score' in book and book['score'] is None:
                if book.get('rating') is None:
                    foundCon = sum(1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
                    if foundCon < 2:
                        G.remove_node(m)

def pruneAuthorCons(G, maxCons=3):
    """Thin out the books attached to over-connected authors.

    For every author with more than `maxCons` neighbours, the `maxCons`
    best-scored adjacent books are protected. Any other adjacent book (or
    adjacent node whose score is None) is removed when it is unrated and
    has fewer than two neighbours that are not top lists.
    """
    for n in list(G.nodes):
        # A book removed while handling an earlier author is still part of
        # the snapshot; skip it instead of failing on the lookup.
        if n not in G.nodes:
            continue
        node = G.nodes[n]
        if node['t'] != 'author' or len(G.adj[n]) <= maxCons:
            continue

        scored = [G.nodes[m] for m in G.adj[n]
                  if G.nodes[m]['t'] == 'book' and G.nodes[m].get('score') is not None]
        bestlist = sorted(scored, key=lambda book: book['score'], reverse=True)[:maxCons]

        for m in list(G.adj[n]):
            book = G.nodes[m]
            if book['t'] == 'book' and book not in bestlist or 'score' in book and book['score'] is None:
                if book.get('rating') is None:
                    foundCon = sum(1 for con in G.adj[m] if G.nodes[con]['t'] != 'topList')
                    if foundCon < 2:
                        G.remove_node(m)

def removeHighSpanTags(G, maxCons=5):
    """Remove tag nodes that have more than `maxCons` neighbours."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        if len(G.adj[key]) > maxCons:
            G.remove_node(key)


def removeHighSpanReadBooks(G, maxCons=8):
    """Remove rated books that are linked to more than `maxCons` recommenders."""
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'book' or attrs['rating'] is None:
            continue
        recommenderCount = 0
        for other in G.adj[key]:
            if G.nodes[other]['t'] == 'recommender':
                recommenderCount += 1
        if recommenderCount > maxCons:
            G.remove_node(key)


def removeTopLists(G):
    """Remove every node of type 'topList'."""
    for key in [k for k in G.nodes if G.nodes[k]['t'] == 'topList']:
        G.remove_node(key)

def removeRecommenders(G):
    """Remove every node of type 'recommender'."""
    for key in [k for k in G.nodes if G.nodes[k]['t'] == 'recommender']:
        G.remove_node(key)

def removeAuthors(G):
    """Remove every node of type 'author'."""
    for key in [k for k in G.nodes if G.nodes[k]['t'] == 'author']:
        G.remove_node(key)

def removeSeries(G):
    """Remove every node of type 'series'."""
    for key in [k for k in G.nodes if G.nodes[k]['t'] == 'series']:
        G.remove_node(key)

def removeRestOfSeries(G):
    """Keep only the next unread volume of each series.

    For every series node the highest (integer-truncated) series index
    among its rated neighbours is determined (0 if none is rated); every
    neighbour whose series index lies more than one volume beyond that is
    removed.
    """
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'series':
            continue
        readIndices = [int(G.nodes[member]['series_index'])
                       for member in G.adj[key]
                       if G.nodes[member]['rating'] is not None]
        lastRead = max([0] + readIndices)
        for member in list(G.adj[key]):
            # The small epsilon tolerates float noise in the stored index.
            if G.nodes[member]['series_index'] > lastRead + 1.0001:
                G.remove_node(member)

def removeUnusedRecommenders(G):
    """Remove recommenders that have no adjacent book carrying a 'score' attribute."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'recommender':
            continue
        hasScoredBook = any(
            G.nodes[other]['t'] == 'book' and 'score' in G.nodes[other]
            for other in G.adj[key]
        )
        if not hasScoredBook:
            G.remove_node(key)

def removeUselessReadBooks(G):
    """Remove rated books that contribute little to the remaining candidates.

    For each rated book every neighbour counts as one contact. Each
    neighbour's neighbour that is a scored book or a 'newBook' adds force:
    0.5 when reached through a recommender, 1 otherwise. The book is
    removed when force is below 1.5 or it has fewer than 2 contacts.
    """
    minForce = 1.5
    minContact = 2

    def isCandidate(attrs):
        return attrs['t'] == 'book' and 'score' in attrs or attrs['t'] == 'newBook'

    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] != 'book' or attrs['rating'] is None:
            continue
        force = 0
        contacts = 0
        for link in G.adj[key]:
            contacts += 1
            step = 0.5 if G.nodes[link]['t'] == 'recommender' else 1
            for cousin in G.adj[link]:
                if isCandidate(G.nodes[cousin]):
                    force += step
        if force < minForce or contacts < minContact:
            G.remove_node(key)

def removeUselessTags(G, minUnread=1):
    """Remove tags linked to fewer than `minUnread` books that carry a 'score' attribute."""
    for key in list(G.nodes):
        if G.nodes[key]['t'] != 'tag':
            continue
        scoredBooks = sum(
            1 for other in G.adj[key]
            if G.nodes[other]['t'] == 'book' and 'score' in G.nodes[other]
        )
        if scoredBooks < minUnread:
            G.remove_node(key)

def removeUselessSeries(G, minSco=0):
    """Remove series nodes that are too small or score too low.

    A series is removed when it has fewer than two neighbours, when it has
    no usable score (missing or None, which is what scoreOpinions assigns
    to nodes without rated books), or when its score is below `minSco`.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'series':
            continue
        score = node.get('score')
        # An unscored series cannot be compared against minSco; treat it as useless.
        if len(G.adj[n]) < 2 or score is None or score < minSco:
            G.remove_node(n)

def scoreOpinions(G, globMu, globStd):
    """Score every non-book node from the ratings of its adjacent nodes.

    Nodes with at least one rated neighbour get 'mean' and 'std' (normal
    fit of the ratings), 'se' (globStd / sqrt(number of ratings)), 'score'
    (equal to the mean) and 'feedbacks' (the ratings themselves). Nodes
    without rated neighbours get score None. `globMu` is not used.
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] == 'book':
            continue
        ratings = [G.nodes[other]['rating']
                   for other in list(G.adj[key].keys())
                   if G.nodes[other]['rating'] is not None]
        if not ratings:
            attrs['score'] = None
            continue
        attrs['mean'], attrs['std'] = norm.fit(ratings)
        attrs['se'] = globStd / math.sqrt(len(ratings))
        attrs['score'] = attrs['mean']
        attrs['feedbacks'] = ratings

def scoreUnread(G, globMu, globStd):
    """Predict a score for every unrated book node.

    For each book whose rating is None, the feedbacks of all scored
    neighbours are gathered per neighbour type, together with summary
    features (mean, sigma, median, se, pagerank, optional tgbrank, bias).
    Each feature's average is weighted with getWeightForType(), normalised
    by the sum of absolute weights and squashed with 10*tanh(x/10). The
    result is stored as node['score']; the per-feature activations are
    stored as node['_act'], and 'mean', 'std', 'median' and 'se' are set
    as well.

    The accumulators are created per book, so one book's neighbourhood
    does not leak into the prediction of the next.
    """
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] != 'book' or node['rating'] is not None:
            continue

        neuralBins = defaultdict(list)
        # Two pseudo-ratings around the global mean act as a weak prior.
        feedbacks = [globMu-globStd, globMu+globStd]
        for adj in G.adj[n]:
            adjNode = G.nodes[adj]
            if adjNode.get('score') is not None:
                for fb in adjNode['feedbacks']:
                    neuralBins[adjNode['t']].append(fb)
                    feedbacks.append(fb)

        node['mean'], node['std'] = norm.fit(feedbacks)
        node['median'] = np.percentile(feedbacks, [50], method='linear')[0]
        node['se'] = globStd / math.sqrt(len(feedbacks))
        neuralBins['mean'] = [node['mean']]
        neuralBins['sigma'] = [node['std']]
        neuralBins['median'] = [node['median']]
        neuralBins['se'] = [node['se']]
        # Pagerank may have been skipped; fall back to the same 0 that
        # runPagerank assigns when it has no score for a node.
        neuralBins['pagerank'] = [node.get('pagerank_score', 0)]
        if 'tgb_rank' in node:
            # The original called the non-existent math.ln10. log10 is
            # presumably what was meant (rank 0 maps to 10) - TODO confirm.
            neuralBins['tgbrank'] = [10/math.log10(10+node['tgb_rank'])]
        neuralBins['bias'] = [globMu]

        score = 0
        act = {}
        for b, values in neuralBins.items():
            act[b] = sum(values)/len(values)
            score += act[b] * getWeightForType(b)
        score /= sum(abs(getWeightForType(b)) for b in neuralBins)
        node['score'] = math.tanh(score/10)*10
        node['_act'] = act
        if 'series' in node:
            if node['series_index'] == 1.0:
                # Tiny bonus so the first volume of a series wins ties.
                node['score'] += 0.000000001

def getWeightForType(nodeType):
    """Return the weight stored for `nodeType` in the module-level `weights`.

    An unknown type is registered with the default weight 0.1, the weights
    are persisted through saveWeights(), and a notice is printed.
    """
    global weights
    if nodeType in weights:
        return weights[nodeType]
    weights[nodeType] = 0.1
    saveWeights(weights)
    print('[i] neuralWeights-Vector extended with >'+nodeType+'<')
    return weights[nodeType]

def printBestList(G, t='book', num=-1):
    """Print the scored nodes of type `t`, best first.

    Nodes are ordered by score with a tiny 'se'-based tie-breaker. At most
    `num` entries are printed (-1 prints all). Books are shown as
    "title (authors): score", other nodes by their label; the position is
    zero-padded.
    """
    ranked = [G.nodes[key] for key in list(G.nodes)
              if G.nodes[key]['t'] == t and G.nodes[key].get('score') is not None]
    ranked.sort(key=lambda attrs: attrs['score'] + 0.00001*attrs.get('se', 0), reverse=True)
    for pos, entry in enumerate(ranked, start=1):
        if t == 'book':
            line = entry['title'] + " ("+" & ".join(entry['authors'])+")"+": {:.5f}".format(entry['score'])
        else:
            line = entry['label']
        width = int((math.log10(num) if num != -1 else 3)+1)
        print("["+str(pos).zfill(width)+"] "+line)
        if pos == num:
            break


def readColor(book):
    """Return 'green' when the book has a 'rating' key, otherwise 'gray'."""
    return 'green' if 'rating' in book else 'gray'

def loadBooksFromDB():
    """Load the Calibre library and enrich it with the MRB recommendation data.

    The TGB infusion step is currently disabled.
    """
    library = loadBooksFromCalibreDB()
    infuseDataFromMRB(library)
    #infuseDataFromTGB(library)
    return library

def mrbGetBook(mrbdf, title, authors):
    """Look up a book in the MRB dataframe by title and author.

    The title is cut at the first '(' and stripped of '*' and surrounding
    whitespace, then matched as a literal substring of the 'title' column.
    A row matches when, for at least one author, both the first and the
    last word of the author's name occur in the row's 'author' field.

    Args:
        mrbdf: DataFrame with at least 'title' and 'author' columns.
        title: Book title as stored in the library.
        authors: List of author names, or a single ' & '-joined string.

    Returns:
        The first matching row as a dict, or False if nothing matches.
    """
    title = title.split('(')[0].replace('*', '').strip()
    # A plain string would otherwise be iterated character by character,
    # making almost every row "match" on a single letter.
    if isinstance(authors, str):
        authors = authors.split(' & ')
    # regex=False: titles such as "C++" are not valid/meaningful patterns.
    # na=False: rows without a title simply do not match.
    pot = mrbdf[mrbdf['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        recAuthor = d['author']
        if not isinstance(recAuthor, str):
            continue  # missing author cell (NaN)
        for author in authors:
            parts = author.split(" ")
            if parts[0] in recAuthor and parts[-1] in recAuthor:
                return d
    return False

def tgbGetBook(df, title, authors):
    """Look up a book in a TGB dataframe by title and author.

    The title is cut at the first '(' and stripped of '*' and surrounding
    whitespace, then matched as a literal substring of the 'title' column.
    A row matches when, for at least one author, both the first and the
    last word of the author's name occur in the row's 'author' field.

    Args:
        df: DataFrame with at least 'title' and 'author' columns.
        title: Book title as stored in the library.
        authors: List of author names, or a single ' & '-joined string.

    Returns:
        The first matching row as a dict, or False if nothing matches.
    """
    title = title.split('(')[0].replace('*', '').strip()
    # A plain string would otherwise be iterated character by character,
    # making almost every row "match" on a single letter.
    if isinstance(authors, str):
        authors = authors.split(' & ')
    # regex=False: titles such as "C++" are not valid/meaningful patterns.
    # na=False: rows without a title simply do not match.
    pot = df[df['title'].str.contains(title, regex=False, na=False)]
    for d in pot.to_dict(orient='records'):
        recAuthor = d['author']
        if not isinstance(recAuthor, str):
            continue  # missing author cell (NaN)
        for author in authors:
            parts = author.split(" ")
            if parts[0] in recAuthor and parts[-1] in recAuthor:
                return d
    return False

def infuseDataFromMRB(books):
    """Add MRB recommenders to the books as '<name>:MRB' tags (in place).

    Reads 'rec_dbs/mrb_db.csv'. For every book found there, each entry of
    the '|'-separated 'recommender' field is appended to the book's tags.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    for book in books:
        # Pass the individual author names, not the raw ' & '-joined
        # string, so the author comparison works on whole names.
        mrb = mrbGetBook(mrbdf, book['title'], getAuthors(book))
        if not mrb:
            continue
        recommenders = mrb['recommender']
        if pd.isna(recommenders):
            continue  # an empty cell would otherwise become the tag "nan:MRB"
        for rec in str(recommenders).split('|'):
            book['tags'].append(rec + ':MRB')

def infuseDataFromTGB(books):
    """Store the TGB rank of each matching book as book['tgb_rank'] (in place).

    Reads 'rec_dbs/tgb_1.csv' and 'rec_dbs/tgb_2.csv' in that order; the
    rank is the integer value of the matching row's 'id' column. A match in
    the later file overwrites one from the earlier file.
    """
    for i in range(1, 3):
        df = pd.read_csv('rec_dbs/tgb_'+str(i)+'.csv')
        for book in books:
            # Pass the individual author names, not the raw ' & '-joined
            # string, so the author comparison works on whole names.
            tgb = tgbGetBook(df, book['title'], getAuthors(book))
            if tgb:
                book['tgb_rank'] = int(tgb['id'])

def loadBooksFromCalibreDB():
    """Return the Calibre library as parsed JSON.

    Runs `calibredb list --for-machine -f all` and decodes its output.
    The pipe is closed explicitly once the output has been read.
    """
    with os.popen("calibredb list --for-machine -f all") as pipe:
        return json.loads(pipe.read())

def remove_html_tags(text):
    """Return `text` with every '<...>' sequence (shortest match) removed."""
    return re.sub('<.*?>', '', text)

def getKeywords(txt,rake):
    """Extract the strongest keyword phrases from `txt` using `rake`.

    HTML tags are stripped first. Phrases from a small stop list, phrases
    of three characters or fewer and markup leftovers ('div', 'p p') are
    dropped. The remaining RAKE scores are damped by phrase length
    (score ** (1 / (0.4 * words))), and only phrases reaching two thirds of
    the best damped score are kept.

    Returns:
        A list of (score, phrase) tuples, best first, phrases title-cased.
        Empty when nothing usable was found.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)
    stopPhrases = {'p', 'die', 'best', 'known', 'fk', 'p pp', 'one'}
    k = []
    for score, kw in rake.get_ranked_phrases_with_scores():
        lowered = kw.lower()
        if lowered in stopPhrases or len(kw) <= 3 or 'div' in kw or 'p p' in lowered:
            continue
        wordCount = len(kw.split(' '))
        k.append((score**(1/(wordCount*0.4)), kw))
    k.sort(key=lambda x: x[0], reverse=True)
    if not k:
        return []
    minSco = k[0][0]/3*2
    # Title-case in every case; previously the phrases were returned
    # unmodified whenever all of them passed the threshold.
    return [(sco, word.title()) for sco, word in k if sco >= minSco]

def runPagerank(G):
    """Store each node's pagerank as its 'pagerank_score' attribute.

    If the power iteration does not converge, a warning is printed and
    every node gets 0.
    """
    try:
        ranks = nx.pagerank(G=G)
    except nx.exception.PowerIterationFailedConvergence:
        print('[!] Could not calculate pagerank-scores: Power iteration of the eigenvector calculation did not converge')
        print('[ ]                                      Recommendations will be of slighly lower quality')
        ranks = {}
    for key in list(G.nodes):
        G.nodes[key]['pagerank_score'] = ranks.get(key, 0)

def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    """Create a graph containing one 'book' node per entry of `books`.

    Args:
        books: Book dicts as delivered by Calibre.
        darkMode: Accepted for signature parity with the graphAdd* helpers;
            not used here.
        extractKeywords: Extract keywords from the book's comments with
            RAKE. rake_nltk is only imported (and instantiated) when set.
        mergeTags: Append the extracted keywords to the node's tags.

    Returns:
        The new networkx Graph.
    """
    G = nx.Graph()
    rake = None
    if extractKeywords:
        from rake_nltk.rake import Rake
        # Instantiate only here: Rake is undefined when the import is skipped.
        rake = Rake()

    # Books
    for book in books:
        tags = book['tags']
        rating = book.get('rating')
        desc = book.get('comments', '')
        if 'comments' in book and extractKeywords:
            sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n',' ')
            keywords = getKeywords(sanitized,rake)
        else:
            keywords = []
        if mergeTags:
            tags = tags + [word for (score, word) in keywords]
        if 'series' in book:
            series = book['series']
            series_index = book['series_index']
        else:
            series = None
            series_index = None
        G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords, desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index)

    return G

def getWikiImage(search_term):
    """Return the URL of the lead image of the best Wikipedia match, or None.

    The top search hit is only accepted when its title is at least 50%
    similar (fuzzywuzzy ratio) to `search_term`. Any failure (no hit, poor
    match, network error, page without image) is reported on stdout and
    results in None.
    """
    from fuzzywuzzy import fuzz
    WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php'
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        wikipedia.set_lang('en')
        result = wikipedia.search(search_term, results = 1)
        # Compare against the hit's title; the search returns a list.
        if not result or fuzz.ratio(search_term, result[0]) < 50:
            raise LookupError('no sufficiently similar Wikipedia page')
        wkpage = wikipedia.WikipediaPage(title = result[0])
        title = wkpage.title
        # Let requests encode the title; a raw '&' or '#' would corrupt the query.
        response  = requests.get(WIKI_REQUEST, params={
            'action': 'query',
            'prop': 'pageimages',
            'format': 'json',
            'piprop': 'original',
            'titles': title,
        }, timeout=10)
        json_data = json.loads(response.text)
        img_link = list(json_data['query']['pages'].values())[0]['original']['source']
        return img_link
    except Exception:
        # Best effort: a missing image must never abort graph generation.
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None

def graphAddAuthors(G, books, darkMode=False):
    """Add one 'author' node ('a/<name>') per author and link it to the author's books.

    Edges are coloured with readColor(book). `darkMode` is not used.
    Returns G.
    """
    for name in getAllAuthors(books):
        G.add_node('a/'+name, color='green', t='author', label=name)
    for entry in books:
        for name in getAuthors(entry):
            G.add_edge('a/'+name, entry['id'], color=readColor(entry))
    return G

def graphAddRecommenders(G, books, darkMode=False):
    """Add one 'recommender' node ('r/<name>') per recommender and link it to its books.

    Edges are coloured with readColor(book). `darkMode` is not used.
    Returns G.
    """
    for name in getAllRecommenders(books):
        G.add_node('r/'+name, color='orange', t='recommender', label=name)
    for entry in books:
        for name in getRecommenders(entry):
            G.add_edge('r/'+name, entry['id'], color=readColor(entry))
    return G

def graphAddTopLists(G, books, darkMode=False):
    """Add one 'topList' node ('t/<name>') per top list and link it to its books.

    Each edge carries the weight from getTopListWeight() and the colour
    from readColor(book). `darkMode` is not used. Returns G.
    """
    for name in getAllTopLists(books):
        G.add_node('t/'+name, color='yellow', t='topList', label=name)
    for entry in books:
        for name in getTopLists(entry):
            edgeWeight = getTopListWeight(entry, name)
            G.add_edge('t/'+name, entry['id'], weight=edgeWeight, color=readColor(entry))
    return G


def graphAddSeries(G, books, darkMode=False):
    """Add one 'series' node ('s/<name>') per series and link it to its books.

    Edges are coloured with readColor(book). `darkMode` is not used.
    Returns G.
    """
    for name in getAllSeries(books):
        G.add_node('s/'+name, color='red', t='series', label=name, shape='triangle')
    for entry in books:
        if 'series' not in entry:
            continue
        G.add_edge('s/'+entry['series'], entry['id'], color=readColor(entry))
    return G


def graphAddTags(G, books, darkMode=False):
    """Add one 'tag' node ('t/<tag>') per plain tag and link it to its books.

    Tag nodes are drawn as boxes, light gray normally and dark gray in
    dark mode. Edges are coloured with readColor(book). Returns G.
    """
    palette = ['lightGray','darkgray']
    for name in getAllTags(books):
        G.add_node('t/'+name, color=palette[darkMode], t='tag', label=name, shape='box')
    for entry in books:
        for name in getTags(entry):
            G.add_edge('t/'+name, entry['id'], color=readColor(entry))
    return G


def calcRecDist(G, books):
    """Fit a normal distribution to all ratings and return (mean, std).

    The ratings are read from the graph nodes of `books`; nodes whose
    rating is None are ignored.
    """
    ratings = [G.nodes[entry['id']]['rating']
               for entry in books
               if G.nodes[entry['id']]['rating'] is not None]
    return norm.fit(ratings)


def scaleBooksByRating(G):
    """Set each node's 'value' from its rating or score.

    Rated nodes get 20 + 5*int(rating), otherwise scored nodes get
    20 + int(5*score), everything else gets 15. The type filter is
    currently empty, so every node is processed.
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] in []:
            continue
        if attrs.get('rating') is not None:
            attrs['value'] = 20 + 5 * int(attrs['rating'])
        elif attrs.get('score') is not None:
            attrs['value'] = 20 + int(5 * float(attrs['score']))
        else:
            attrs['value'] = 15


def scaleOpinionsByRating(G):
    """Set 'value' for topList, recommender, author and series nodes.

    Scored nodes get 20 + 5*int(score); nodes without a usable score get 20.
    """
    opinionTypes = ['topList', 'recommender', 'author', 'series']
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] not in opinionTypes:
            continue
        if attrs.get('score') is not None:
            attrs['value'] = 20 + 5 * int(attrs['score'])
        else:
            attrs['value'] = 20


def addScoreToLabels(G):
    """Append the rating or predicted score to node labels.

    Tags and 'newBook' nodes are left alone. Rated nodes get " (rating)",
    nodes with a score and an 'se' value get " (score±se)", all others get
    " (0±∞)".
    """
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] in ['tag', 'newBook']:
            continue
        if attrs.get('rating') is not None:
            suffix = " ("+str(attrs['rating'])+")"
        elif attrs.get('score') is not None and 'se' in attrs:
            suffix = " ({:.2f}±{:.1f})".format(attrs['score'], attrs['se'])
        else:
            suffix = " (0±∞)"
        attrs['label'] += suffix


def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
    """Render G with pyvis into 'nx.html' and show it.

    `showButtons` adds the pyvis configuration panels, `darkMode` switches
    the background colour, `arrows` makes the network directed.
    """
    background = ['#FFFFFF','#181818'][darkMode]
    net = Network('1050px', '1900px', directed=arrows, bgcolor=background)
    if showButtons:
        panels = ['configure', 'layout', 'interaction', 'physics', 'edges']
        net.show_buttons(filter_=panels)
    net.from_nx(G)
    net.show('nx.html')


def genAndShow3D(G, darkMode=False):
    """Show G as an interactive 3D plotly scatter plot.

    Node positions come from a 3D spring layout with a random seed. Tags
    are gray, books with a 'score' attribute light blue, other books
    magenta; remaining nodes use their 'color' attribute or black. Marker
    size is derived from the node's 'value'.
    """
    def pickColor(attrs):
        if attrs['t'] == 'tag':
            return 'gray'
        if attrs['t'] == 'book':
            # A 'score' attribute marks an unread book.
            return 'lightblue' if 'score' in attrs else 'magenta'
        if 'color' in attrs:
            return attrs['color']
        return 'black'

    node_cols, node_labels, node_sizes = [], [], []
    for key in G.nodes:
        attrs = G.nodes[key]
        node_cols.append(pickColor(attrs))
        node_labels.append(attrs['label'])
        node_sizes.append((attrs['value']/8)**1.5)

    spring = nx.spring_layout(G,dim=3, seed=random.randint(0, 65536))
    x_nodes = [pos[0] for pos in spring.values()]
    y_nodes = [pos[1] for pos in spring.values()]
    z_nodes = [pos[2] for pos in spring.values()]

    # Each edge becomes a two-point segment; None separates the segments.
    x_edges, y_edges, z_edges = [], [], []
    for start, end in G.edges():
        x_edges.extend([spring[start][0], spring[end][0], None])
        y_edges.extend([spring[start][1], spring[end][1], None])
        z_edges.extend([spring[start][2], spring[end][2], None])

    trace_edges = go.Scatter3d(
        x=x_edges,
        y=y_edges,
        z=z_edges,
        mode='lines',
        line=dict(color='black', width=2),
        hoverinfo='none',
    )

    node_marker = dict(
        symbol='circle',
        size=node_sizes,
        color=node_cols,
        line=dict(color='gray', width=0.5),
    )
    trace_nodes = go.Scatter3d(
        x=x_nodes,
        y=y_nodes,
        z=z_nodes,
        mode='markers',
        marker=node_marker,
        text=node_labels,
        hoverinfo='text',
    )

    axis = dict(
        showbackground=False,
        showline=False,
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='',
    )
    background = ['#FFFFFF','#181818'][darkMode]
    layout = go.Layout(
        title="",
        width=1920,
        height=1080,
        plot_bgcolor=background,
        paper_bgcolor=background,
        showlegend=False,
        scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
        margin=dict(l=0, r=0, b=0, t=0),
        hovermode='closest',
    )

    fig = go.Figure(data=[trace_edges, trace_nodes], layout=layout)
    fig.show()

def buildFullGraph(darkMode=False):
    """Load the library and build the complete graph.

    Book nodes are created first, then authors, recommenders, top lists,
    series and tags are added in that order. Returns (G, books).
    """
    books = loadBooksFromDB()
    G = buildBookGraph(books, darkMode=darkMode)

    layers = (graphAddAuthors, graphAddRecommenders, graphAddTopLists,
              graphAddSeries, graphAddTags)
    for addLayer in layers:
        addLayer(G, books, darkMode=darkMode)
    return G, books


def genScores(G, books, calcPagerank=True):
    """Score the whole graph and return the global rating (mean, std).

    Optionally computes pagerank first, then scores the non-book nodes and
    finally the unread books.
    """
    mean, deviation = calcRecDist(G, books)
    if calcPagerank:
        runPagerank(G)
    for scorer in (scoreOpinions, scoreUnread):
        scorer(G, mean, deviation)
    return mean, deviation

def addImageToNode(node, cache, shape='circularImage'):
    """Attach a Wikipedia image to `node`, using and updating `cache`.

    The lookup name is the node label up to the first ' (' with '*'
    removed. Names missing from the cache are looked up via
    getWikiImage(); names cached as False (earlier miss) are retried with
    5% probability. On success the node's 'image' and 'shape' are set.
    """
    name = node['label'].split(' (')[0].replace('*','')
    retryMiss = name in cache and cache[name]==False and random.random()<0.05
    if name not in cache or retryMiss:
        img = getWikiImage(name)
        cache[name] = img if img else False
    else:
        img = cache[name]
    if not img:
        return
    node['image']=img
    node['shape']=shape

def addImagesToNodes(G):
    """Add Wikipedia images to all recommender and author nodes.

    Image links are cached in '.imgLinkCache.json'; the cache is loaded if
    readable and written back afterwards. Authors use the 'image' shape,
    recommenders 'circularImage'.
    """
    cachePath = '.imgLinkCache.json'
    try:
        with open(cachePath, 'r') as cf:
            cache = json.loads(cf.read())
    except IOError:
        cache = {}
    for key in list(G.nodes):
        attrs = G.nodes[key]
        if attrs['t'] == 'author':
            addImageToNode(attrs, cache, 'image')
        elif attrs['t'] == 'recommender':
            addImageToNode(attrs, cache, 'circularImage')
    with open(cachePath, 'w') as cf:
        cf.write(json.dumps(cache))

def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
    """Prune G in place to a recommender-centred view of the `n` best unread books.

    Args:
        G: Scored graph; modified in place.
        mu, std: Values used to derive the score thresholds for removeBad.
        n: Number of books kept by the final removeKeepBest call.
        removeTopListsB: Remove all top-list nodes during pruning.
        removeUselessRecommenders: Also remove recommenders without any
            scored book, followed by the nodes left dangling by that.

    The steps are order-dependent: each pass works on what the previous
    ones left over.
    """
    removeRestOfSeries(G)
    # Coarse cut: drop everything scoring far below the mean.
    removeBad(G, mu-std*2-1)
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 6)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 10)
    # From here on only books scoring at least the mean remain.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 6)
    pruneRecommenderCons(G, int(n/7)+1)
    pruneAuthorCons(G, int(n/15))
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    # Narrow down in stages so supporting nodes can be cleaned up in between.
    removeKeepBest(G, n+math.ceil(n/20), maxDistForRead=1.5)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)
    removeUselessReadBooks(G)
    if removeUselessRecommenders:
        removeUnusedRecommenders(G)
        removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation: node sizes and labels.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)


def recommendNBooksTagBased(G, mu, std, n, removeTopListsB=True):
    """Prune G in place to a tag-centred view of the `n` best unread books.

    Args:
        G: Scored graph; modified in place.
        mu, std: Values used to derive the score thresholds for removeBad.
        n: Number of books kept by the final removeKeepBest call.
        removeTopListsB: Remove all top-list nodes during pruning.

    Recommender nodes are removed entirely. The steps are order-dependent:
    each pass works on what the previous ones left over.
    """
    removeRestOfSeries(G)
    # Coarse cut: drop everything scoring far below the mean.
    removeBad(G, mu-std*2-1)
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 12)
    removeDangling(G, alsoBooks=False)
    pruneTags(G, 24)
    # From here on only books scoring at least the mean remain.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneTags(G, 16)
    pruneAuthorCons(G, int(n/5))
    removeRecommenders(G)
    removeUselessTags(G)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    removeKeepBest(G, n+math.ceil(n/20), maxDistForRead=1.5)
    removeUselessReadBooks(G)
    removeUselessTags(G)
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation: node sizes and labels.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

def recommendNBooks(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True, v3d=False):
    """Prune G in place to a mixed view of the `n` best unread books.

    Args:
        G: Scored graph; modified in place.
        mu, std: Values used to derive the score thresholds for removeBad.
        n: Number of books kept by the final removeKeepBest call.
        removeTopListsB: Remove all top-list nodes during pruning.
        removeUselessRecommenders: Not used in this function body.
        v3d: When true, the last removeThinRecs pass keeps its minimum at
            2 connections even for n > 20.

    The steps are order-dependent: each pass works on what the previous
    ones left over.
    """
    removeRestOfSeries(G)
    removeBad(G, mu-std-0.5)
    # Recommenders have to score clearly above the mean to stay.
    removeBad(G, mu+std/2, groups=['recommender'])
    removeThinRecs(G, 3)
    removeKeepBest(G, int(n*2) + 5, maxDistForRead=2)
    removeEdge(G)
    removeHighSpanTags(G, 8)
    pruneTags(G, 7)
    removeHighSpanReadBooks(G, 14)
    removeDangling(G, alsoBooks=False)
    pruneRecommenders(G, 12)
    removeThinRecs(G, 3)
    # From here on only books scoring at least the mean remain.
    removeBad(G, mu, groups=['book'])
    removeUselessReadBooks(G)
    pruneAuthorCons(G, int(n/5)+3)
    # The recommender limit tightens with n, by at most 4.
    pruneRecommenders(G, 12 - min(4, n/20))
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    pruneTags(G, 6)
    if removeTopListsB:
        removeTopLists(G)
    removeDangling(G, alsoBooks=True)
    # Narrow down in stages so supporting nodes can be cleaned up in between.
    removeKeepBest(G, n+math.ceil(n/20)+3, maxDistForRead=1.5)
    removeEdge(G)
    removeKeepBest(G, n+1, maxDistForRead=1.25)
    removeUselessSeries(G, mu)
    removeUselessTags(G)
    removeUselessReadBooks(G)
    # Minimum connections: 3 for n > 20 outside 3D mode, otherwise 2.
    removeThinRecs(G, 2 + 1 * (n>20 and not v3d))
    removeKeepBest(G, n, maxDistForRead=1.25)

    # Presentation: node sizes and labels.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)


def listScores(G, mu, std, n):
    """Reduce G in place to the `n` best-scored books, then size and label the nodes.

    `mu` and `std` are accepted but not used in this function body.
    """
    removeRestOfSeries(G)
    removeKeepBest(G, n, maxDistForRead=10)

    # Presentation: node sizes and labels.
    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)


def fullGraph(G, removeTopListsB=True):
    """Prepare the (almost) complete graph for display with only light pruning.

    Applies removeEdge, removeHighSpanTags, removeDangling, optionally
    removeTopLists, pruneTags and a final removeDangling, followed by the usual
    presentation passes. The helpers are defined elsewhere in this file and
    presumably mutate G in place.

    Args:
        G: the graph to prepare.
        removeTopListsB: when true, removeTopLists(G) is called.
    """
    removeEdge(G)
    removeHighSpanTags(G, 7)
    # First pass leaves books alone; the second pass below also handles books.
    removeDangling(G, alsoBooks=False)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 3)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

def recommenderCompetence(G):
    """Reduce G to a recommender-centred view and penalise uncertain recommender scores.

    After stripping unread books, tags, authors, series and top lists (helpers
    defined elsewhere in this file), every recommender node has its score
    lowered: by its 'se' value when one is present, otherwise the score
    (treated as 0 when falsy) is halved.
    """
    #removeRead(G)
    removeUnread(G)
    removeTags(G)
    removeAuthors(G)
    removeSeries(G)
    removeTopLists(G)

    removeEdge(G)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

    for nodeId in list(G.nodes):
        attrs = G.nodes[nodeId]
        if attrs['t'] != 'recommender':
            continue
        if 'se' in attrs:
            # A known standard error is subtracted once from the score.
            attrs['score'] = attrs['score'] - attrs['se'] * 1
            continue
        # No error estimate available: a missing/zero score counts as 0, then halve.
        attrs['score'] = (attrs['score'] if attrs['score'] else 0) / 2

def readBooksAnalysis(G, minRating=0, showAllTags=True, removeUnconnected=False, removeTopListsB=True):
    """Prepare a view of G for analysing books that have already been read.

    Calls removeUnread and removeBad first, then applies tag / dangling-node
    pruning and the usual presentation passes. The helpers are defined
    elsewhere in this file and presumably mutate G in place.

    Args:
        G: the graph to prune.
        minRating: threshold passed to removeBad.
        showAllTags: when false, removeEdge(G) is called before tag pruning.
        removeUnconnected: forwarded as alsoBooks to removeDangling.
        removeTopListsB: when true, removeTopLists(G) is called.
    """
    removeUnread(G)
    removeBad(G, minRating)
    if not showAllTags:
        removeEdge(G)
    removeHighSpanTags(G, 15)
    removeDangling(G, alsoBooks=removeUnconnected)
    if removeTopListsB:
        removeTopLists(G)
    pruneTags(G, 8)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)

def progress(G, books, mu, minimum=3.5):
    """Print reading-progress statistics for the library.

    First calls findNewBooks (without a limit) so that 'newBook' nodes exist in
    G, then counts nodes of type 'book' / 'newBook':

    * read: the node has a non-None 'rating'
    * recommended: not read, has a non-None 'score' that is >= minimum, or a
      'std' of exactly 0.0

    Progress is read / (read + recommended) in percent. When both counts are
    zero the progress is reported as 0.0 instead of raising ZeroDivisionError,
    and nodes whose 'score' is None are skipped instead of raising TypeError.

    Args:
        G: the book graph; it is extended by findNewBooks.
        books: book records, forwarded to findNewBooks.
        mu: forwarded to findNewBooks.
        minimum: minimal score for an unread book to count as recommended; also
            forwarded to findNewBooks as minRecSco.
    """
    findNewBooks(G, books, mu, -1, minRecSco = minimum)
    bookCount = 0
    libCount = 0
    readCount = 0
    toReadCount = 0
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] not in ['book', 'newBook']:
            continue
        if node['t'] == 'book':
            libCount += 1
        bookCount += 1
        score = node.get('score')
        if node.get('rating') is not None:
            readCount += 1
        elif score is not None and (score >= minimum or node.get('std') == 0.0):
            toReadCount += 1
    relevant = toReadCount + readCount
    # Guard against an empty library / nothing recommended yet.
    perc = round(readCount / relevant * 100, 2) if relevant else 0.0
    print('Books in library: '+str(libCount))
    print('Books in CaliGraph: '+str(bookCount))
    print('Read Books: '+str(readCount))
    print('Unread Books: '+str(bookCount-readCount))
    print('Recommended Books (score > '+str(round(minimum, 2))+'): '+str(toReadCount))
    print('Progress: '+str(perc)+'%')


def analyze(G, books, mu, type_name, name, dist=2.1):
    """Reduce G to the neighbourhood of the node that best matches `name`.

    The node is searched among nodes whose 't' equals type_name (or among all
    nodes for type_name == "any"). An exact label match (either `name` or
    "<first letter of type_name>/<name>") wins immediately; otherwise the label
    with the highest fuzzywuzzy ratio is used, and a "Best Match" notice is
    printed when that ratio is below 70.

    After adding new-book candidates via findNewBooks, waveFlow collects the
    node ids to keep; everything else is removed from G. The matched node is
    given a star shape (unless it already has a shape) and its label is wrapped
    in asterisks.

    Args:
        G: the book graph; modified in place.
        books: book records, forwarded to findNewBooks.
        mu: forwarded to findNewBooks.
        type_name: node type to search in, or "any".
        name: label (or approximate label) of the node to analyse.
        dist: expansion depth handed to waveFlow.

    Raises:
        ValueError: if no candidate node could be matched at all (previously
            this surfaced as a TypeError on `None['label']`).
    """
    from fuzzywuzzy import fuzz
    type_ident = type_name[0]
    full_name = type_ident + "/" + name
    bestRatio, match, matchId = 0, None, None
    for ni in list(G.nodes):
        node = G.nodes[ni]
        if node['t'] == type_name or type_name=="any":
            if name==node['label'] or full_name==node['label']:
                # An exact hit is a perfect match; do not report it as a fuzzy guess.
                bestRatio, match, matchId = 100, node, ni
                break
            ratio = fuzz.ratio(node['label'], name)
            if ratio > bestRatio:
                bestRatio, match, matchId = ratio, node, ni
    if match is None:
        raise ValueError("No node of type '"+type_name+"' matches '"+name+"'")
    if bestRatio < 70:
        print("Best Match: "+match['label'])

    findNewBooks(G, books, mu, num=-1, minRecSco=1)

    menge = set()
    waveFlow(G, match, matchId, dist, menge)
    for other in list(G.nodes):
        if other not in menge:
            G.remove_node(other)
    if dist >= 2:
        removeThinRecs(G, 2)
    removeHighSpanTags(G, 12)
    if dist > 1:
        removeDangling(G, True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    #match['value'] = 100
    if 'shape' not in match:
        match['shape'] = 'star'
    addScoreToLabels(G)
    # Highlight the analysed node after the scores were appended to the labels.
    match['label'] = "*"+match['label']+"*"

def waveFlow(G, node, n, dist, menge, firstEdge=False):
    """Recursively collect into `menge` the node ids reachable from n within a shrinking budget.

    Each call costs 1 unit of `dist`. Top lists are only collected (never
    expanded) and only on a first-edge walk; tags are only expanded on a
    first-edge walk, at an additional cost of 0.1. The walk counts as
    "first edge" when `menge` is still empty on entry or the caller says so.

    Neighbours with a usable 'score' are ranked by score, neighbours that only
    have a 'rating' are kept as a separate group, and neighbours with neither
    get a 'score' of 0 assigned. How many of them are followed depends on the
    remaining `dist`.

    Args:
        G: graph exposing `adj[n]` and `nodes[m]`.
        node: attribute dict of node n.
        n: id of the node to expand.
        dist: remaining expansion budget.
        menge: set of collected node ids; filled in place.
        firstEdge: whether the walk is still treated as a first-edge walk.
    """
    if dist <= 0:
        return
    dist -= 1
    if menge == set():
        firstEdge = True

    kind = node['t']
    if kind == 'topList':
        if firstEdge:
            menge.add(n)
        return
    menge.add(n)
    if kind == 'tag':
        if not firstEdge:
            return
        dist -= 0.1

    neighbourIds = list(G.adj[n])
    scored = []
    rated = []
    for m in neighbourIds:
        adj = G.nodes[m]
        if adj['t'] == 'NOTHING':
            continue
        if 'score' in adj and adj['score'] is not None:
            scored.append(adj)
        elif 'rating' in adj and adj['rating'] is not None:
            rated.append(adj)
        else:
            # Neither scored nor rated: treat as score 0 so it can be ranked.
            adj['score'] = 0
            scored.append(adj)

    scored = sorted(scored, key=lambda entry: entry['score'], reverse=True)
    cap = int(dist*10)
    toKeep = min(cap, math.ceil(len(scored) * dist - len(rated)*0.5))
    if toKeep > 0:
        scored = scored[:toKeep]
    else:
        # No room for scored neighbours: follow only the best rated ones.
        rated = sorted(rated, key=lambda entry: entry['rating'], reverse=True)
        rated = rated[:min(cap, int(len(rated) * dist))]
        scored = []

    for m in neighbourIds:
        candidate = G.nodes[m]
        if candidate in scored or candidate in rated:
            waveFlow(G, candidate, m, dist, menge, firstEdge=firstEdge)

def gensimTokensForLines(lines, tokens_only=False):
    """Yield gensim-preprocessed tokens for every line.

    Args:
        lines: iterable of text lines.
        tokens_only: when true, yield the plain token list per line; otherwise
            yield a gensim TaggedDocument tagged with the line index (the form
            needed as Doc2Vec training data). This used to be an undefined
            name; it is now a parameter that defaults to the training form.

    Yields:
        A list of tokens, or a TaggedDocument(tokens, [index]).
    """
    # gensim is only imported inside functions in this file, so it has to be
    # imported here as well; previously this generator raised NameError.
    import gensim

    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

def buildDoc2Vec(books):
    """Unfinished stub for building a Doc2Vec model over the books.

    NOTE(review): this function cannot run as written -- `G` and `lines` are
    neither parameters nor assigned locally, and no module-level definition of
    them is visible in this part of the file. The `books` argument is unused,
    the loop body does nothing, and the generator returned by
    gensimTokensForLines is discarded.
    """
    import gensim
    # NOTE(review): `G` is not defined in this scope.
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            pass
    # NOTE(review): `lines` is not defined in this scope; the result is not used.
    gensimTokensForLines(lines)

def shell(G, books, mu, std):
    """Open an interactive ptpython REPL for inspecting the graph.

    The REPL receives this module's globals and this function's locals, so
    G, books, mu and std are available by name inside the session.
    """
    from ptpython.repl import embed
    # locals() is evaluated here, so it holds the four parameters plus `embed`.
    embed(globals(), locals())

def newBooks(G, books, num, mu, std):
    """Prepare a view of G that shows new-book candidates found by findNewBooks.

    Low-scored entries are removed first (threshold mu - 2*std), then
    findNewBooks adds up to `num` 'newBook' nodes, and the remaining passes
    strip unread library books, tags, top lists and series. The helpers are
    defined elsewhere in this file and presumably mutate G in place.

    Args:
        G: the graph to modify.
        books: book records, forwarded to findNewBooks.
        num: number of new books, forwarded to findNewBooks.
        mu: base value for the thresholds; the CLI passes the first value
            returned by genScores.
        std: spread used for the thresholds; the CLI passes the second value
            returned by genScores.
    """
    removeBad(G, mu-std*2)
    findNewBooks(G, books, mu, num, minRecSco = mu-std)
    removeThinRecs(G, 2)
    removeUnread(G)
    removeUselessReadBooks(G)
    removeTags(G)
    removeTopLists(G)
    removeSeries(G)
    removeEdge(G)
    removeDangling(G, alsoBooks=True)

    scaleBooksByRating(G)
    scaleOpinionsByRating(G)
    addScoreToLabels(G)


def findNewBooks(G, books, mu, num=-1, minRecSco=5):
    """Add books from the recommendation database that are not yet in the library.

    Reads 'rec_dbs/mrb_db.csv' (columns used: 'recommender', 'title',
    'author'). For every recommender node in G that has a 'score', the rows
    whose 'recommender' cell contains the node label are looked up. Titles not
    present in `books` become 'newBook' nodes connected to the recommender and
    to an author node. Each new book then gets a heuristic 'score', 'fake_se'
    and 'value' derived from the scores/'se' of its adjacent recommenders and
    from mu, and its label is extended by score and author.

    The recommender label is matched as a literal substring (not as a regular
    expression), and rows with a missing recommender are ignored.

    Args:
        G: the book graph; modified in place.
        books: iterable of book records with a 'title' key.
        mu: mean score, mixed into every new book's score as one extra vote.
        num: when not -1, removeKeepBest keeps only this many 'newBook' nodes.
        minRecSco: NOTE(review): accepted but never read in this function.
    """
    mrbdf = pd.read_csv('rec_dbs/mrb_db.csv')
    # Built once: membership is tested for every candidate row below.
    knownTitles = {b['title'] for b in books}
    recs = []
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'recommender' and 'score' in node:
            oldBooks = []
            newBooks = []
            # regex=False: labels are names, not patterns (e.g. '(' or '+' must
            # not be interpreted); na=False: empty cells never match.
            mask = mrbdf['recommender'].str.contains(node['label'], regex=False, na=False)
            for book in mrbdf[mask].to_dict(orient='records'):
                entry = {'title': book['title'], 'author': book['author']}
                if book['title'] in knownTitles:
                    oldBooks.append(entry)
                else:
                    newBooks.append(entry)
            recs.append({'name': node['label'], 'rec': node, 'newBooks': newBooks, 'oldBooks': oldBooks})
    for rec in recs:
        for book in rec['newBooks']:
            G.add_node('n/'+book['title'], color='blue', t='newBook', label=book['title'], author=book['author'])

            G.add_node('r/'+rec['rec']['label'], color='orange', t='recommender', label=rec['rec']['label'], score=rec['rec']['score'])
            G.add_edge('r/'+rec['rec']['label'], 'n/'+book['title'], color='blue')

            G.add_node('a/'+book['author'], color='green', t='author', label=book['author'])
            G.add_edge('a/'+book['author'], 'n/'+book['title'], color='blue')
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'newBook':
            ses = []
            scores = []
            for m in list(G.adj[n]):
                adj = G.nodes[m]
                if adj['t'] == 'recommender' and adj['score'] is not None:
                    scores.append(adj['score'])
                    # NOTE(review): assumes every scored recommender carries an 'se' value.
                    ses.append(adj['se'])
            if not len(scores):
                G.remove_node(n)
            else:
                # mu takes part as one additional vote, with the smallest observed se.
                ses.append(min(ses))
                scores.append(mu)
                # Heuristic uncertainty, not a statistically valid standard error;
                # a single recommender (len(scores)==2 including mu) is penalised extra.
                node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==2)
                node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2 + 0.5 - 0.1/math.sqrt(len(scores))
                if len(scores)==2:
                    node['score']*=0.80
                node['value'] = 20 + 5 * float(node['score'])
                node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
                node['label'] += '\n ' + node['author']
    if num!=-1:
        removeKeepBest(G, num, 10, 'newBook')

# While batchSize is implemented, we only get good convergence when we disable it (batchSize=-1),
# but it might be necessary to enable it later for a larger library, for better training performance...
# Maybe try again for 128 books?
def evaluateFitness(books, batchSize=16, debugPrint=False):
    """Compute the prediction error of the current global weights and a gradient estimate.

    A fresh graph is built from `books`. For every rated book its rating is
    hidden temporarily, scores are regenerated with genScores, and the squared
    difference between the predicted score and the real rating is recorded
    (doubled when the score over-estimates the rating). For the books in the
    batch a finite-difference gradient per weight is accumulated from the
    node's '_act' values.

    Args:
        books: book records used to build the graph.
        batchSize: number of rated books sampled for the gradient; with -1, or
            when there are not more rated books than batchSize, all are used.
        debugPrint: when true, print the mean squared error and the weighted
            regularisation term.

    Returns:
        (fit, gradient): fit is the mean of the squared errors plus 0.001 times
        the weight penalty; gradient maps each weight name to its estimate.

    NOTE(review): with no rated books, len(batch) and len(errSq) are 0 and the
    divisions below raise ZeroDivisionError.
    """
    global weights
    G = buildBookGraph(books)
    graphAddAuthors(G, books)
    graphAddRecommenders(G, books)
    graphAddTopLists(G, books)
    graphAddSeries(G, books)
    graphAddTags(G, books)
    runPagerank(G)

    # Ids of all nodes that carry an actual (non-None) rating.
    ratedBooks = [n for n in list(G.nodes) if 'rating' in G.nodes[n] and G.nodes[n]['rating'] != None]
    # NOTE(review): boundsLoss and linSepLoss are never used below.
    boundsLoss = 0
    linSepLoss = []
    errSq = []
    gradient = {}
    for w in weights:
        gradient[w] = 0
    # NOTE(review): mu and sigma are not used afterwards; the call is kept for
    # whatever genScores writes into G.
    mu, sigma = genScores(G, books)
    batch = random.sample(ratedBooks, batchSize) if batchSize!=-1 and len(ratedBooks) > batchSize else ratedBooks
    for b in G.nodes:
        if b in ratedBooks:
            node = G.nodes[b]
            # Hide the rating, re-score, compare the prediction, then restore the rating.
            rating = G.nodes[b]['rating']
            G.nodes[b]['rating'] = None
            _, _ = genScores(G, books, calcPagerank=False)
            if G.nodes[b]['score'] > rating: # over estimated
                errSq.append(((rating - G.nodes[b]['score'])**2)*2)
            else:
                errSq.append((rating - G.nodes[b]['score'])**2)
            G.nodes[b]['rating'] = rating
            if b in batch:
                for wt in weights:
                    # Recompute the score with weight wt nudged by 0.001 (finite difference).
                    scoreB = 0
                    for w in node['_act']:
                        scoreB += node['_act'][w] * (getWeightForType(w) + (0.001 if wt==w else 0))
                    scoreB /= sum([abs(getWeightForType(w)) for w in node['_act']])
                    # Squash into the open interval (-10, 10).
                    scoreB = math.tanh(scoreB/10)*10
                    # Positive when the nudge reduced the squared error.
                    gradient[wt] += ((rating - G.nodes[b]['score'])**2 - (rating - scoreB)**2)*1000
    regressionLoss = sum([max(0,abs(w)-1)**2 for w in weights.values()]) # no punishment if w within -1 and 1
    # Pull weights outside [-1, 1] back towards zero.
    for wt in weights:
        if abs(weights[wt]) > 1.0:
            gradient[wt] -= weights[wt]*3
    for g in gradient:
        gradient[g] /= len(batch)
    if debugPrint:
        print(sum(errSq)/len(errSq), 0.001*regressionLoss)
    fit = sum(errSq)/len(errSq) + 0.001*regressionLoss
    return fit, gradient

def train(initGamma, full=True):
    """Optimise the global weights by normalised gradient steps with an adaptive step size.

    Each iteration moves the weights by gamma along the gradient returned by
    evaluateFitness (scaled by the gradient's Euclidean norm), grows gamma by
    25% when the error improved and quarters it otherwise. The best weights
    seen are written to disk via saveWeights. After 4 consecutive worsening
    iterations, or when the error exceeds 50, the search is either restarted
    (random weights or a slightly perturbed copy of the best weights) or
    stopped.

    Args:
        initGamma: initial step size, also used after every restart.
        full: when true, start from random weights in [0, 1), use a stricter
            initial goal, and always restart instead of stopping.
    """
    global weights
    if full:
        for wt in weights:
            weights[wt] = random.random()
        saveWeights(weights)
    gamma = initGamma
    books = loadBooksFromDB()
    bestWeights = copy.copy(weights)
    mse, gradient = evaluateFitness(books)
    # Root mean square of the gradient components.
    # NOTE(review): inside the loop delta is the plain sum of squares instead.
    delta = math.sqrt(sum(gradient[g]**2 for g in gradient)/len(gradient))
    best_mse = mse
    stagLen = 0
    goal = 1.0e-4
    if full:
        goal = 1.0e-5

    # `and` binds tighter than `or`: (gamma > goal and delta > goal) or best_mse > 15.
    while gamma > goal and delta > goal or best_mse > 15:
        # The goal is loosened by 10% per iteration.
        goal *= 1.1
        last_mse = mse
        print({'mse': mse, 'gamma': gamma, 'delta': delta})
        delta = sum(gradient[g]**2 for g in gradient)
        for wt in weights:
            if wt in gradient:
                # Step of length gamma along the normalised gradient.
                # NOTE(review): divides by zero if every gradient component is 0.
                weights[wt] += gamma*gradient[wt]/math.sqrt(delta)
            #else:
            #    del weights[wt]
        mse, gradient = evaluateFitness(books)
        if mse < last_mse:
            gamma = gamma*1.25
        else:
            gamma *= 0.25
        if mse < best_mse:
            saveWeights(weights)
            bestWeights = copy.copy(weights)
            best_mse = mse
        # Count consecutive iterations in which the error got worse.
        if mse > last_mse:
            stagLen += 1
        else:
            stagLen = 0
        if stagLen == 4 or mse > 50:
            if full or mse > 10:
                # Restart: reset the step size, then pick a new starting point.
                stagLen = 0
                gamma = initGamma
                if random.random() < 0.50:
                    # Completely random weights in [-0.5, 1.5).
                    for wt in weights:
                        weights[wt] = random.random()*2-0.5
                else:
                    # Best weights so far, each scaled by a factor in [0.975, 1.025).
                    weights = copy.copy(bestWeights)
                    for wt in weights:
                        weights[wt] *= 0.975+0.05*random.random()
            else:
                break
    print('Done.')

def saveWeights(weights):
    """Serialise `weights` as JSON into 'neuralWeights.json' in the current directory."""
    serialised = json.dumps(weights)
    with open('neuralWeights.json', 'w') as handle:
        handle.write(serialised)

def loadWeights():
    """Return the weights stored in 'neuralWeights.json', or built-in defaults.

    The defaults are used when the file cannot be opened or read (IOError).
    Malformed JSON is not handled here and propagates to the caller.
    """
    defaults = {"topList": 0.15, "recommender": 0.30, "author": 0.70, "series": 0.05, "tag": 0.05, "pagerank": 0.05, "mu": 0.50, "sigma": 0.30, "bias": 0.25} #, "tgb_rank": 0.10}
    try:
        with open('neuralWeights.json', 'r') as handle:
            content = handle.read()
    except IOError:
        return defaults
    return json.loads(content)

def cliInterface():
    """Parse the command line, build the graph and run the selected sub-command.

    `train` is handled before any graph is built and terminates the process.
    All other commands build the full graph, compute scores, run their
    command-specific pruning and then share the same post-processing: optional
    removal of private / read / unread entries, printing the best list and
    rendering the graph (HTML or 3D). `progress` only prints statistics and
    returns early.

    Sub-command aliases (`rec`, `ls`) are mapped to their canonical names
    before dispatching: argparse stores the name exactly as typed, which
    previously made the aliases end in the "Bad" exception.

    Raises:
        Exception: if --tag-based and --recommender-based are combined, or the
            command is unknown.
    """
    import argparse

    parser = argparse.ArgumentParser(description='TODO: Write Description.')
    parser.add_argument('--keep-priv', action="store_true")
    parser.add_argument('--keep-whitepapers', action="store_true")
    parser.add_argument('--remove-read', action="store_true")
    parser.add_argument('--remove-unread', action="store_true")
    parser.add_argument('--no-web', action="store_true")
    parser.add_argument('--no-list', action="store_true")
    parser.add_argument('--remove-edge', action="store_true")
    parser.add_argument('--keep-top-lists', action="store_true")
    parser.add_argument('--keep-useless-recommenders', action="store_true")
    parser.add_argument('--dark', action="store_true")
    parser.add_argument('--v3d', action="store_true")
    parser.add_argument('--imgs', action="store_true")
    cmds = parser.add_subparsers(required=True, dest='cmd')

    # Canonical command name for every alias accepted below.
    aliases = {'rec': 'recommend', 'ls': 'listScores'}

    p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
    p_rec.add_argument('-n', type=int, default=20, help='number of books to recommend')
    p_rec.add_argument('--tag-based', action="store_true")
    p_rec.add_argument('--recommender-based', action="store_true")
    p_rec.add_argument('--new', type=int, default=-1, help='number of new books to recommend')

    p_ls = cmds.add_parser('listScores', description="TODO", aliases=['ls'])
    p_ls.add_argument('-n', type=int, default=50, help='number of books to recommend')

    p_read = cmds.add_parser('read', description="TODO", aliases=[])
    p_read.add_argument('--min-rating', type=int, default=0)
    p_read.add_argument('--all-tags', action="store_true")
    p_read.add_argument('--only-connected', action="store_true")

    p_show = cmds.add_parser('analyze', description="TODO", aliases=[])
    p_show.add_argument('type', choices=['any', 'book', 'recommender', 'author', 'series', 'tag'])
    p_show.add_argument('name', type=str)
    p_show.add_argument('-d', type=float, default=2.1, help='depth of expansion')

    p_train = cmds.add_parser('train', description="TODO", aliases=[])
    p_train.add_argument('-g', type=float, default=0.2, help='learning rate gamma')
    p_train.add_argument('--full', action="store_true")

    p_prog = cmds.add_parser('progress', description="TODO", aliases=[])
    p_prog.add_argument('-m', type=float, default=7, help='Mimimum Score to read')

    cmds.add_parser('competence', description="TODO", aliases=[])

    cmds.add_parser('shell', description="TODO", aliases=[])

    p_new = cmds.add_parser('newBooks', description="TODO", aliases=[])
    p_new.add_argument('-n', type=int, default=10, help='number of books to recommend')

    cmds.add_parser('full', description="TODO", aliases=[])

    args = parser.parse_args()
    # Resolve aliases so that the dispatch below only deals with canonical names.
    cmd = aliases.get(args.cmd, args.cmd)

    if cmd=="train":
        # Training needs no graph; it loads the books itself.
        train(args.g, args.full)
        exit()

    # Node type shown by printBestList; some commands override it below.
    bestListT = 'book'

    G, books = buildFullGraph(darkMode=args.dark)
    mu, std = genScores(G, books)

    if not args.keep_whitepapers:
        removeWhitepapers(G)


    if cmd=="recommend":
        # By default a fifth of the recommendations may be new books.
        if args.new==-1:
            args.new = int(args.n / 5)
        if args.new != 0:
            findNewBooks(G, books, mu, args.new, minRecSco = mu-std)
        if args.tag_based:
            if args.recommender_based:
                raise Exception('tag-based and recommender-based can not be be combined')
            recommendNBooksTagBased(G, mu, std, args.n, not args.keep_top_lists)
        elif args.recommender_based:
            recommendNBooksRecommenderBased(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders)
        else:
            recommendNBooks(G, mu, std, args.n, not args.keep_top_lists, not args.keep_useless_recommenders, args.v3d)
    elif cmd=="listScores":
        listScores(G, mu, std, args.n)
    elif cmd=="read":
        readBooksAnalysis(G, args.min_rating, args.all_tags, args.only_connected, not args.keep_top_lists)
    elif cmd=="analyze":
        analyze(G, books, mu, args.type, args.name, args.d)
    elif cmd=="full":
        fullGraph(G, not args.keep_top_lists)
    elif cmd=="competence":
        bestListT = 'recommender'
        recommenderCompetence(G)
    elif cmd=="shell":
        shell(G, books, mu, std)
    elif cmd=="progress":
        # Only prints statistics; no list or graph output afterwards.
        progress(G, books, mu, args.m)
        return
    elif cmd=="newBooks":
        bestListT = 'newBook'
        newBooks(G, books, args.n, mu, std)
    else:
        raise Exception("Bad")


    if not args.keep_priv:
        removePriv(G)
    if args.remove_read:
        removeRead(G)
    elif args.remove_unread:
        removeUnread(G)

    removeDangling(G, alsoBooks=True)

    if args.remove_edge:
        removeEdge(G)

    if not args.no_list:
        printBestList(G, t=bestListT)
    # listScores is a text-only command: never render a graph for it.
    if not args.no_web and cmd not in ['listScores']:
        if args.v3d:
            genAndShow3D(G, darkMode=args.dark)
        else:
            if args.imgs:
                addImagesToNodes(G)
            genAndShowHTML(G, darkMode=args.dark)


# Module-level weights, loaded once at import time; evaluateFitness and train
# declare this name `global`, and train modifies and rebinds it.
weights = loadWeights()
# Start the command-line interface only when the file is executed as a script.
if __name__ == "__main__":
    cliInterface()