Look up images for authors and recommenders on Wikipedia

This commit is contained in:
Dominik Moritz Roth 2022-02-11 12:14:24 +01:00
parent f5c3077cb4
commit 5e6dc9ffe2
2 changed files with 147 additions and 8 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ __pycache__
.venv .venv
neuralWeights.json neuralWeights.json
neuralWeights.json.bak neuralWeights.json.bak
.imgLinkCache.json

View File

@ -5,6 +5,7 @@ import json
import math import math
import copy import copy
import random import random
import requests
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -13,7 +14,9 @@ from scipy.stats import norm
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import networkx as nx import networkx as nx
from pyvis.network import Network from pyvis.network import Network
import plotly.graph_objects as go
import wikipedia
def getAllAuthors(books): def getAllAuthors(books):
authors = set() authors = set()
@ -562,6 +565,24 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
return G return G
def getWikiImage(search_term):
    """Return the URL of the lead image of the best-matching Wikipedia page.

    The term is searched on the English Wikipedia; the top hit is accepted
    only if its title is reasonably similar to the search term (fuzzy ratio
    of at least 50). The original-size page image is then requested from the
    MediaWiki API.

    Args:
        search_term: Name to search for (e.g. an author or recommender).

    Returns:
        The image URL as a string, or None if no sufficiently similar page
        was found, the page has no image, or any lookup step failed.
    """
    from fuzzywuzzy import fuzz
    WIKI_API = 'https://en.wikipedia.org/w/api.php'
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        # Select the language before searching so search and page lookup agree.
        wikipedia.set_lang('en')
        results = wikipedia.search(search_term, results=1)
        # Compare against the title string itself, not the result list, and
        # treat an empty result list as "no match" explicitly.
        if not results or fuzz.ratio(search_term, results[0]) < 50:
            print('[!] No match for '+search_term+' on WikiPedia...')
            return None
        title = wikipedia.WikipediaPage(title=results[0]).title
        # Let requests URL-encode the title (it may contain '&', '+', ...).
        response = requests.get(
            WIKI_API,
            params={
                'action': 'query',
                'prop': 'pageimages',
                'format': 'json',
                'piprop': 'original',
                'titles': title,
            },
            timeout=10,
        )
        response.raise_for_status()
        pages = response.json()['query']['pages']
        return list(pages.values())[0]['original']['source']
    except Exception:
        # Deliberately best-effort: network errors, missing pages and pages
        # without an image all mean "no image". Unlike a bare except, this
        # still lets KeyboardInterrupt / SystemExit propagate.
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None
def graphAddAuthors(G, books, darkMode=False): def graphAddAuthors(G, books, darkMode=False):
for author in getAllAuthors(books): for author in getAllAuthors(books):
@ -571,7 +592,6 @@ def graphAddAuthors(G, books, darkMode=False):
G.add_edge('a/'+author, book['id'], color=readColor(book)) G.add_edge('a/'+author, book['id'], color=readColor(book))
return G return G
def graphAddRecommenders(G, books, darkMode=False): def graphAddRecommenders(G, books, darkMode=False):
for rec in getAllRecommenders(books): for rec in getAllRecommenders(books):
G.add_node('r/'+rec, color='orange', t='recommender', label=rec) G.add_node('r/'+rec, color='orange', t='recommender', label=rec)
@ -580,7 +600,6 @@ def graphAddRecommenders(G, books, darkMode=False):
G.add_edge('r/'+rec, book['id'], color=readColor(book)) G.add_edge('r/'+rec, book['id'], color=readColor(book))
return G return G
def graphAddTopLists(G, books, darkMode=False): def graphAddTopLists(G, books, darkMode=False):
for tl in getAllTopLists(books): for tl in getAllTopLists(books):
G.add_node('t/'+tl, color='yellow', t='topList', label=tl) G.add_node('t/'+tl, color='yellow', t='topList', label=tl)
@ -664,6 +683,90 @@ def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
net.show('nx.html') net.show('nx.html')
def genAndShow3D(G, darkMode=False):
    """Lay the graph out in 3D with a spring layout and show it via plotly.

    Every node must carry the attributes 't', 'label' and 'value'. Node
    colour is derived from 't' (falling back to the node's own 'color'
    attribute, then to black); marker size is derived from 'value'.

    Args:
        G: Graph whose nodes and edges are drawn.
        darkMode: Selects the dark ('#181818') instead of the white
            background.
    """
    sizes, labels, colours = [], [], []
    for key in G.nodes:
        attrs = G.nodes[key]
        kind = attrs['t']
        if kind == 'tag':
            colour = 'gray'
        elif kind == 'book':
            # A book carrying a 'score' is an unread one (per the original
            # author's note).
            colour = 'lightblue' if 'score' in attrs else 'magenta'
        elif 'color' in attrs:
            colour = attrs['color']
        else:
            colour = 'black'
        colours.append(colour)
        labels.append(attrs['label'])
        sizes.append((attrs['value']/8)**1.5)

    # Random seed, so every invocation produces a fresh layout.
    positions = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536))

    # One coordinate list per axis (x, y, z), in node order.
    node_xyz = [[pos[axis] for pos in positions.values()] for axis in range(3)]

    # Each edge contributes (start, end, None) per axis; the None breaks the
    # line so consecutive edges are not joined.
    edge_xyz = [[], [], []]
    for edge in G.edges():
        start = positions[edge[0]]
        end = positions[edge[1]]
        for axis in range(3):
            edge_xyz[axis].extend((start[axis], end[axis], None))

    trace_edges = go.Scatter3d(
        x=edge_xyz[0],
        y=edge_xyz[1],
        z=edge_xyz[2],
        mode='lines',
        line={'color': 'black', 'width': 2},
        hoverinfo='none',
    )

    marker_style = {
        'symbol': 'circle',
        'size': sizes,
        'color': colours,
        'line': {'color': 'gray', 'width': 0.5},
    }
    trace_nodes = go.Scatter3d(
        x=node_xyz[0],
        y=node_xyz[1],
        z=node_xyz[2],
        mode='markers',
        marker=marker_style,
        text=labels,
        hoverinfo='text',
    )

    # Shared settings that hide all axis decoration.
    hidden_axis = {
        'showbackground': False,
        'showline': False,
        'zeroline': False,
        'showgrid': False,
        'showticklabels': False,
        'title': '',
    }
    background = ['#FFFFFF', '#181818'][darkMode]
    layout = go.Layout(
        title="",
        width=1920,
        height=1080,
        plot_bgcolor=background,
        paper_bgcolor=background,
        showlegend=False,
        scene={
            'xaxis': dict(hidden_axis),
            'yaxis': dict(hidden_axis),
            'zaxis': dict(hidden_axis),
        },
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        hovermode='closest',
    )

    fig = go.Figure(data=[trace_edges, trace_nodes], layout=layout)
    fig.show()
def buildFullGraph(darkMode=False): def buildFullGraph(darkMode=False):
books = loadBooksFromDB() books = loadBooksFromDB()
G = buildBookGraph(books, darkMode=darkMode) G = buildBookGraph(books, darkMode=darkMode)
@ -684,6 +787,32 @@ def genScores(G, books, calcPagerank=True):
scoreUnread(G, globMu, globStd) scoreUnread(G, globMu, globStd)
return globMu, globStd return globMu, globStd
def addImageToNode(node, cache, shape='circularImage'):
    """Attach an image to a graph node, using and filling a URL cache.

    The lookup name is the node's label up to (not including) the first
    ' ('. On a cache miss the image is looked up via getWikiImage and only
    successful lookups are stored in the cache.

    Args:
        node: Node attribute dict; must contain 'label'. On success it gains
            'imagePadding', 'image' and 'shape'.
        cache: Dict mapping names to image URLs; updated in place.
        shape: Value stored under the node's 'shape' key.
    """
    key = node['label'].split(' (')[0]
    if key in cache:
        url = cache[key]
    else:
        url = getWikiImage(key)
        if url:
            cache[key] = url
    if not url:
        # No image known for this name: leave the node untouched.
        return
    node['imagePadding'] = '100px'
    node['image'] = url
    node['shape'] = shape
def addImagesToNodes(G):
    """Add Wikipedia images to all recommender and author nodes of G.

    Image URLs are cached in '.imgLinkCache.json' in the current working
    directory. A missing, unreadable, corrupt or non-dict cache file is
    treated as an empty cache. The cache is written back even if a lookup
    raises or the run is interrupted, so completed lookups are not lost.

    Args:
        G: Graph whose node attribute dicts carry a 't' key; nodes with
            t == 'author' get shape 'image', nodes with t == 'recommender'
            get shape 'circularImage'.
    """
    CACHE_FILE = '.imgLinkCache.json'
    try:
        with open(CACHE_FILE, 'r') as cf:
            cache = json.load(cf)
    except (IOError, ValueError):
        # ValueError covers json.JSONDecodeError (truncated / corrupt file).
        cache = {}
    if not isinstance(cache, dict):
        cache = {}
    try:
        # addImageToNode only mutates attribute dicts, never the node set,
        # so iterating G.nodes directly is safe.
        for n in G.nodes:
            node = G.nodes[n]
            if node['t'] in ('recommender', 'author'):
                shape = 'image' if node['t'] == 'author' else 'circularImage'
                addImageToNode(node, cache, shape)
    finally:
        # Persist whatever was gathered, even on error or Ctrl-C.
        with open(CACHE_FILE, 'w') as cf:
            json.dump(cache, cf)
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True): def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
removeRestOfSeries(G) removeRestOfSeries(G)
@ -1013,11 +1142,13 @@ def findNewBooks(G, books, num, minRecSco=5):
scores.append(adj['score']) scores.append(adj['score'])
ses.append(adj['se']) ses.append(adj['se'])
ses.append(min(ses)) ses.append(min(ses))
if len(scores) < 2: if False and len(scores) < 2:
G.remove_node(n) G.remove_node(n)
else: else:
node['fake_se'] = sum(ses)/(len(ses)**1.2) # This is not how SE works. DILLIGAF? node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==1) # This is not how SE works. DILLIGAF?
node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2.5 + 0.5 - 0.1/math.sqrt(len(scores)) node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2 + 0.5 - 0.1/math.sqrt(len(scores))
if len(scores)==1:
node['score']*=0.80
node['value'] = 20 + 5 * float(node['score']) node['value'] = 20 + 5 * float(node['score'])
node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se']) node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
node['label'] += '\n ' + node['author'] node['label'] += '\n ' + node['author']
@ -1145,7 +1276,9 @@ def cliInterface():
parser.add_argument('--remove-edge', action="store_true") parser.add_argument('--remove-edge', action="store_true")
parser.add_argument('--keep-top-lists', action="store_true") parser.add_argument('--keep-top-lists', action="store_true")
parser.add_argument('--keep-useless-recommenders', action="store_true") parser.add_argument('--keep-useless-recommenders', action="store_true")
parser.add_argument('--dark-mode', action="store_true") parser.add_argument('--dark', action="store_true")
parser.add_argument('--v3d', action="store_true")
parser.add_argument('--imgs', action="store_true")
cmds = parser.add_subparsers(required=True, dest='cmd') cmds = parser.add_subparsers(required=True, dest='cmd')
p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec']) p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
@ -1191,7 +1324,7 @@ def cliInterface():
bestListT = 'book' bestListT = 'book'
G, books = buildFullGraph(darkMode=args.dark_mode) G, books = buildFullGraph(darkMode=args.dark)
mu, std = genScores(G, books) mu, std = genScores(G, books)
if not args.keep_whitepapers: if not args.keep_whitepapers:
@ -1249,7 +1382,12 @@ def cliInterface():
if not args.no_list: if not args.no_list:
printBestList(G, t=bestListT) printBestList(G, t=bestListT)
if not args.no_web and not args.cmd in ['listScores']: if not args.no_web and not args.cmd in ['listScores']:
genAndShowHTML(G, darkMode=args.dark_mode) if args.v3d:
genAndShow3D(G, darkMode=args.dark)
else:
if args.imgs:
addImagesToNodes(G)
genAndShowHTML(G, darkMode=args.dark)
weights = loadWeights() weights = loadWeights()