New shell-command, nltk for keyword extraction from description
This commit is contained in:
parent
36baf1aaec
commit
199fab7875
64
caliGraph.py
64
caliGraph.py
@ -1,5 +1,6 @@
|
|||||||
#!./.venv/bin/python3.10
|
#!./.venv/bin/python3.10
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import copy
|
import copy
|
||||||
@ -454,28 +455,55 @@ def readColor(book):
|
|||||||
def loadBooksFromDB():
|
def loadBooksFromDB():
|
||||||
return json.loads(os.popen("calibredb list --for-machine -f all").read())
|
return json.loads(os.popen("calibredb list --for-machine -f all").read())
|
||||||
|
|
||||||
|
# Compiled once at import time. [^>]* (unlike .*?) also matches newlines, so
# tags whose attributes are wrapped over several lines are removed as well.
_HTML_TAG_PATTERN = re.compile(r'<[^>]*>')


def remove_html_tags(text):
    """Return text with everything that looks like an HTML/XML tag removed.

    Every '<...>' span (shortest match, may span several lines) is deleted;
    the text between tags is kept unchanged. HTML entities such as '&amp;'
    are NOT decoded.

    text: the markup string to clean; None or '' yields ''.
    """
    if not text:
        return ''
    return _HTML_TAG_PATTERN.sub('', text)
|
||||||
|
|
||||||
def buildBookGraph(books, darkMode=False):
|
def getKeywords(txt, rake):
    """Extract the highest ranked keyword phrases from a (HTML) description.

    txt: description text; HTML tags are stripped via remove_html_tags first.
    rake: keyword extractor offering extract_keywords_from_text() and
        get_ranked_phrases_with_scores() (buildBookGraph passes a
        rake_nltk Rake instance).

    Returns a list of (score, phrase) tuples sorted by descending score,
    with every phrase title-cased. Only phrases scoring at least two thirds
    of the best score are kept. Returns [] when no phrase was found.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)

    scored = []
    for score, phrase in rake.get_ranked_phrases_with_scores():
        wordCount = len(phrase.split(' '))
        # Re-scale to score ** (2 / wordCount): single words get squared,
        # long phrases get a root taken. Presumably this compensates for
        # longer phrases collecting higher raw scores -- TODO confirm.
        scored.append((score**(1/(wordCount*0.5)), phrase))

    # Nothing extracted (e.g. empty description): avoid indexing scored[0].
    if not scored:
        return []

    scored.sort(key=lambda entry: entry[0], reverse=True)
    minSco = scored[0][0]/3*2
    # The list is sorted, so filtering equals cutting at the first entry
    # below the threshold. Phrases are title-cased on every path.
    return [(sco, phrase.title()) for sco, phrase in scored if sco >= minSco]
|
||||||
|
|
||||||
|
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    """Create a graph holding one node of type 'book' per entry in books.

    books: iterable of book dicts (as produced by loadBooksFromDB).
    darkMode: accepted but not used inside this function.
    extractKeywords: if True, keywords are extracted from each book's
        'comments' via getKeywords and stored on the node.
    mergeTags: if True, the extracted keyword phrases are appended to the
        node's tags (the book's own tag list is not modified).

    Returns the populated nx.Graph.
    """
    G = nx.Graph()
    if extractKeywords:
        # Only imported when keyword extraction is actually requested.
        from rake_nltk.rake import Rake
        rake = Rake()

    # Books
    for book in books:
        tags = book['tags']
        rating = book.get('rating')
        hasComments = 'comments' in book
        desc = book['comments'] if hasComments else ''

        if hasComments and extractKeywords:
            keywords = getKeywords(book['comments'], rake)
        else:
            keywords = []

        if mergeTags:
            # Builds a new list, leaving book['tags'] untouched.
            tags = tags + [word for (_, word) in keywords]

        if 'series' in book:
            series, series_index = book['series'], book['series_index']
        else:
            series, series_index = None, None

        G.add_node(
            book['id'],
            t='book',
            label=book['title'],
            title=book['title'],
            shape='image',
            image=book['cover'],
            rating=rating,
            tags=tags,
            keywords=keywords,
            desc=desc,
            isbn=book['isbn'],
            files=book['formats'],
            authors=getAuthors(book),
            series=series,
            series_index=series_index,
        )

    return G
|
||||||
|
|
||||||
@ -837,6 +865,27 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
|
|||||||
if node in bestlist or node in keeplist:
|
if node in bestlist or node in keeplist:
|
||||||
waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
|
waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
|
||||||
|
|
||||||
|
def gensimTokensForLines(lines, tokens_only=False):
    """Lazily tokenize text lines with gensim for use with Doc2Vec.

    lines: iterable of strings.
    tokens_only: if True, yield the plain token list of each line;
        otherwise (default) yield a gensim TaggedDocument whose single tag
        is the index of the line within lines.
    """
    # Imported locally, like the file's other optional third-party
    # dependencies (rake_nltk, ptpython).
    import gensim

    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


def buildDoc2Vec(books):
    """Build the Doc2Vec training corpus from the books' descriptions.

    books: iterable of book dicts; entries without a non-empty 'comments'
        value are skipped.

    Returns a list with one TaggedDocument per remaining book. Each tag is
    the position within that filtered list, not the book id. Returns []
    (without loading gensim) when no book has a description.

    TODO: no Doc2Vec model is trained here yet; only the corpus is prepared.
    """
    lines = [remove_html_tags(book['comments'])
             for book in books if book.get('comments')]
    if not lines:
        return []
    return list(gensimTokensForLines(lines))
|
||||||
|
|
||||||
|
def shell(G, books, mu, std):
    """Drop into an interactive ptpython REPL.

    The REPL is given the module's globals and this function's locals, so
    G, books, mu and std are available by name inside the session.
    """
    from ptpython.repl import embed
    embed(globals=globals(), locals=locals())
|
||||||
|
|
||||||
def evaluateFitness(books, debugPrint=False):
|
def evaluateFitness(books, debugPrint=False):
|
||||||
global weights
|
global weights
|
||||||
G = buildBookGraph(books)
|
G = buildBookGraph(books)
|
||||||
@ -985,6 +1034,8 @@ def cliInterface():
|
|||||||
|
|
||||||
p_comp = cmds.add_parser('competence', description="TODO", aliases=[])
|
p_comp = cmds.add_parser('competence', description="TODO", aliases=[])
|
||||||
|
|
||||||
|
p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
|
||||||
|
|
||||||
p_full = cmds.add_parser('full', description="TODO", aliases=[])
|
p_full = cmds.add_parser('full', description="TODO", aliases=[])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -1018,7 +1069,10 @@ def cliInterface():
|
|||||||
fullGraph(G, not args.keep_top_lists)
|
fullGraph(G, not args.keep_top_lists)
|
||||||
elif args.cmd=="competence":
|
elif args.cmd=="competence":
|
||||||
recommenderCompetence(G)
|
recommenderCompetence(G)
|
||||||
elif args.cmd=="progress":
|
elif args.cmd=="shell":
|
||||||
|
shell(G, books, mu, std)
|
||||||
|
elif args.cmd=="competence":
|
||||||
|
recommenderCompetence(G)
|
||||||
progress(G, args.m)
|
progress(G, args.m)
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user