Better Tag-Pruning (keep more good tags)

This commit is contained in:
Dominik Moritz Roth 2021-06-23 15:45:32 +02:00
parent a8d9f96e70
commit cc6606f468

View File

@@ -28,11 +28,13 @@ def getRecommenders(book):
for tag in book['tags']: for tag in book['tags']:
if tag.find(" Recommendation") != -1: if tag.find(" Recommendation") != -1:
yield tag.replace(" Recommendation", "") yield tag.replace(" Recommendation", "")
elif tag.find("s Literature Club") != -1:
yield tag.replace("s Literature Club", "")
def getTags(book): def getTags(book):
for tag in book['tags']: for tag in book['tags']:
if tag.find(" Recommendation") == -1 and tag.find(" Top ") == -1: if tag.find(" Recommendation") == -1 and tag.find("s Literature Club") == -1 and tag.find(" Top ") == -1:
yield tag yield tag
@@ -166,8 +168,14 @@ def pruneTags(G, minCons=2):
foundCon = 0 foundCon = 0
for book in G.adj[n]: for book in G.adj[n]:
for con in G.adj[book]: for con in G.adj[book]:
if G.nodes[con]['t'] not in ['tag', 'topList', 'series']: conType = G.nodes[con]['t']
foundCon += 1 if conType not in ['topList']:
if conType in ['recommender']:
foundCon += 0.5
elif conType in ['tag', 'series']:
foundCon += 0.25
else:
foundCon += 1
if foundCon > minCons: if foundCon > minCons:
G.remove_node(n) G.remove_node(n)
@@ -483,9 +491,9 @@ def recommendNBooks(G, mu, std, n):
removeEdge(G) removeEdge(G)
removeHighSpanTags(G, 9) removeHighSpanTags(G, 9)
removeDangling(G, alsoBooks=False) removeDangling(G, alsoBooks=False)
pruneTags(G, 4) pruneTags(G, 6)
removeBad(G, mu, groups=['book']) removeBad(G, mu, groups=['book'])
pruneTags(G, 3) pruneTags(G, 4.25)
pruneRecommenderCons(G, int(n/7)+1) pruneRecommenderCons(G, int(n/7)+1)
pruneAuthorCons(G, int(n/15)) pruneAuthorCons(G, int(n/15))
removeTopLists(G) removeTopLists(G)
@@ -544,7 +552,7 @@ def analyze(G, type_name, name, dist=2.7):
print("Best Match: "+match['label']) print("Best Match: "+match['label'])
menge = set() menge = set()
pruneDist(G, match, n, dist, menge) waveFlow(G, match, n, dist, menge)
for n in list(G.nodes): for n in list(G.nodes):
if n not in menge: if n not in menge:
G.remove_node(n) G.remove_node(n)
@@ -560,7 +568,7 @@ def analyze(G, type_name, name, dist=2.7):
addScoreToLabels(G) addScoreToLabels(G)
match['label'] = "*"+match['label']+"*" match['label'] = "*"+match['label']+"*"
def pruneDist(G, node, n, dist, menge, firstEdge=False): def waveFlow(G, node, n, dist, menge, firstEdge=False):
if dist <= 0: if dist <= 0:
return return
dist -= 1 dist -= 1
@@ -600,7 +608,7 @@ def pruneDist(G, node, n, dist, menge, firstEdge=False):
for m in list(G.adj[n]): for m in list(G.adj[n]):
node = G.nodes[m] node = G.nodes[m]
if node in bestlist or node in keeplist: if node in bestlist or node in keeplist:
pruneDist(G, node, m, dist, menge, firstEdge=firstEdge) waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
def cliInterface(): def cliInterface():
import argparse import argparse