Look up images for authors and recommenders on Wikipedia

This commit is contained in:
Dominik Moritz Roth 2022-02-11 12:14:24 +01:00
parent f5c3077cb4
commit 5e6dc9ffe2
2 changed files with 147 additions and 8 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ __pycache__
.venv
neuralWeights.json
neuralWeights.json.bak
.imgLinkCache.json

View File

@ -5,6 +5,7 @@ import json
import math
import copy
import random
import requests
import numpy as np
import pandas as pd
@ -13,7 +14,9 @@ from scipy.stats import norm
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network
import plotly.graph_objects as go
import wikipedia
def getAllAuthors(books):
authors = set()
@ -562,6 +565,24 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
return G
def getWikiImage(search_term):
    """Return the URL of the lead Wikipedia image for search_term, or None.

    The top search hit is rejected when it is too dissimilar from the
    search term (fuzzy ratio < 50). Any failure — network error, no
    search results, page without an image — yields None.
    """
    from fuzzywuzzy import fuzz
    WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
    try:
        print('[i] Searching for >'+search_term+'< on WikiPedia...')
        results = wikipedia.search(search_term, results=1)
        # Bug fix: compare against the first hit (a string), not the result
        # list itself, and guard against an empty result set.
        if not results or fuzz.ratio(search_term, results[0]) < 50:
            raise ValueError('no sufficiently similar Wikipedia page')
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=results[0])
        response = requests.get(WIKI_REQUEST + wkpage.title)
        json_data = json.loads(response.text)
        img_link = list(json_data['query']['pages'].values())[0]['original']['source']
        return img_link
    except Exception:
        # Broad catch is intentional: any failure just means "no image".
        print('[!] No match for '+search_term+' on WikiPedia...')
        return None
def graphAddAuthors(G, books, darkMode=False):
for author in getAllAuthors(books):
@ -571,7 +592,6 @@ def graphAddAuthors(G, books, darkMode=False):
G.add_edge('a/'+author, book['id'], color=readColor(book))
return G
def graphAddRecommenders(G, books, darkMode=False):
for rec in getAllRecommenders(books):
G.add_node('r/'+rec, color='orange', t='recommender', label=rec)
@ -580,7 +600,6 @@ def graphAddRecommenders(G, books, darkMode=False):
G.add_edge('r/'+rec, book['id'], color=readColor(book))
return G
def graphAddTopLists(G, books, darkMode=False):
for tl in getAllTopLists(books):
G.add_node('t/'+tl, color='yellow', t='topList', label=tl)
@ -664,6 +683,90 @@ def genAndShowHTML(G, showButtons=False, darkMode=False, arrows=False):
net.show('nx.html')
def genAndShow3D(G, darkMode=False):
    """Render the graph as an interactive 3D scatter figure via plotly."""
    sizes, labels, colors = [], [], []
    for key in G.nodes:
        nd = G.nodes[key]
        if nd['t'] == 'tag':
            colors.append('gray')
        elif nd['t'] == 'book':
            # books carrying a score are the unread ones
            colors.append('lightblue' if 'score' in nd else 'magenta')
        elif 'color' in nd:
            colors.append(nd['color'])
        else:
            colors.append('black')
        labels.append(nd['label'])
        sizes.append((nd['value'] / 8) ** 1.5)
    pos = nx.spring_layout(G, dim=3, seed=random.randint(0, 65536))
    x_nodes = [pos[p][0] for p in pos]
    y_nodes = [pos[p][1] for p in pos]
    z_nodes = [pos[p][2] for p in pos]
    # Edge segments are encoded as (start, end, None) triples per axis so
    # plotly draws disconnected line pieces in a single trace.
    x_edges, y_edges, z_edges = [], [], []
    for u, v in G.edges():
        x_edges.extend([pos[u][0], pos[v][0], None])
        y_edges.extend([pos[u][1], pos[v][1], None])
        z_edges.extend([pos[u][2], pos[v][2], None])
    trace_edges = go.Scatter3d(x=x_edges,
                               y=y_edges,
                               z=z_edges,
                               mode='lines',
                               line=dict(color='black', width=2),
                               hoverinfo='none')
    trace_nodes = go.Scatter3d(x=x_nodes,
                               y=y_nodes,
                               z=z_nodes,
                               mode='markers',
                               marker=dict(symbol='circle',
                                           size=sizes,
                                           color=colors,  # color the nodes according to their community
                                           line=dict(color='gray', width=0.5)),
                               text=labels,
                               hoverinfo='text')
    # A fully invisible axis: the 3D plot floats on a bare background.
    axis = dict(showbackground=False,
                showline=False,
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title='')
    layout = go.Layout(title="",
                       width=1920,
                       height=1080,
                       plot_bgcolor=['#FFFFFF', '#181818'][darkMode],
                       paper_bgcolor=['#FFFFFF', '#181818'][darkMode],
                       showlegend=False,
                       scene=dict(xaxis=dict(axis),
                                  yaxis=dict(axis),
                                  zaxis=dict(axis),
                                  ),
                       margin=dict(l=0, r=0, b=0, t=0),
                       hovermode='closest')
    fig = go.Figure(data=[trace_edges, trace_nodes], layout=layout)
    fig.show()
def buildFullGraph(darkMode=False):
books = loadBooksFromDB()
G = buildBookGraph(books, darkMode=darkMode)
@ -684,6 +787,32 @@ def genScores(G, books, calcPagerank=True):
scoreUnread(G, globMu, globStd)
return globMu, globStd
def addImageToNode(node, cache, shape='circularImage'):
    """Attach a Wikipedia image link to node, consulting/updating cache.

    The lookup key is the node label with any ' (...)' suffix stripped.
    Successful fresh lookups are written back into cache; when no image
    is available the node is left untouched.
    """
    name = node['label'].split(' (')[0]
    if name in cache:
        img = cache[name]
    else:
        img = getWikiImage(name)
        if img:
            cache[name] = img
    if img:
        node['imagePadding'] = '100px'
        node['image'] = img
        node['shape'] = shape
def addImagesToNodes(G):
    """Attach Wikipedia image links to every author/recommender node in G.

    Looked-up links are persisted in .imgLinkCache.json so repeated runs
    avoid hitting Wikipedia again for the same names.
    """
    try:
        with open('.imgLinkCache.json', 'r') as cf:
            cache = json.load(cf)
    except (IOError, ValueError):
        # Missing file OR corrupt JSON: start over with an empty cache.
        # (The original only caught IOError, so a truncated cache file
        # crashed the run; json.JSONDecodeError subclasses ValueError.)
        cache = {}
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] in ['recommender', 'author']:
            # authors get rectangular images, recommenders circular ones
            addImageToNode(node, cache, ['circularImage', 'image'][node['t'] == 'author'])
    with open('.imgLinkCache.json', 'w') as cf:
        json.dump(cache, cf)
def recommendNBooksRecommenderBased(G, mu, std, n, removeTopListsB=True, removeUselessRecommenders=True):
removeRestOfSeries(G)
@ -1013,11 +1142,13 @@ def findNewBooks(G, books, num, minRecSco=5):
scores.append(adj['score'])
ses.append(adj['se'])
ses.append(min(ses))
if len(scores) < 2:
if False and len(scores) < 2:
G.remove_node(n)
else:
node['fake_se'] = sum(ses)/(len(ses)**1.2) # This is not how SE works. DILLIGAF?
node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2.5 + 0.5 - 0.1/math.sqrt(len(scores))
node['fake_se'] = sum(ses)/(len(ses)**1.2) + 0.5 + 0.5 * (len(scores)==1) # This is not how SE works. DILLIGAF?
node['score'] = sum(scores)/len(scores)*1.2 - node['fake_se']*2 + 0.5 - 0.1/math.sqrt(len(scores))
if len(scores)==1:
node['score']*=0.80
node['value'] = 20 + 5 * float(node['score'])
node['label'] += " ({:.2f}±{:.1f})".format(node['score'], node['fake_se'])
node['label'] += '\n ' + node['author']
@ -1145,7 +1276,9 @@ def cliInterface():
parser.add_argument('--remove-edge', action="store_true")
parser.add_argument('--keep-top-lists', action="store_true")
parser.add_argument('--keep-useless-recommenders', action="store_true")
parser.add_argument('--dark-mode', action="store_true")
parser.add_argument('--dark', action="store_true")
parser.add_argument('--v3d', action="store_true")
parser.add_argument('--imgs', action="store_true")
cmds = parser.add_subparsers(required=True, dest='cmd')
p_rec = cmds.add_parser('recommend', description="TODO", aliases=['rec'])
@ -1191,7 +1324,7 @@ def cliInterface():
bestListT = 'book'
G, books = buildFullGraph(darkMode=args.dark_mode)
G, books = buildFullGraph(darkMode=args.dark)
mu, std = genScores(G, books)
if not args.keep_whitepapers:
@ -1249,7 +1382,12 @@ def cliInterface():
if not args.no_list:
printBestList(G, t=bestListT)
if not args.no_web and not args.cmd in ['listScores']:
genAndShowHTML(G, darkMode=args.dark_mode)
if args.v3d:
genAndShow3D(G, darkMode=args.dark)
else:
if args.imgs:
addImagesToNodes(G)
genAndShowHTML(G, darkMode=args.dark)
weights = loadWeights()