New shell command; nltk for keyword extraction from description

Dominik Moritz Roth 2021-12-11 13:58:01 +01:00
parent 36baf1aaec
commit 199fab7875


@@ -1,5 +1,6 @@
#!./.venv/bin/python3.10
import os
import re
import json
import math
import copy
@@ -454,28 +455,55 @@ def readColor(book):
def loadBooksFromDB():
    return json.loads(os.popen("calibredb list --for-machine -f all").read())

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
def getKeywords(txt, rake):
    txt = remove_html_tags(txt)
    k = []
    rake.extract_keywords_from_text(txt)
    kws = rake.get_ranked_phrases_with_scores()
    for score, kw in kws:
        # Normalize the RAKE score by phrase length so long phrases don't dominate
        l = len(kw.split(' '))
        k.append((score**(1/(l*0.5)), kw))
    k.sort(key=lambda x: x[0], reverse=True)
    if not k:
        return []
    minSco = k[0][0]/3*2
    for i, kw in enumerate(k):
        # Keep only keywords scoring at least two thirds of the best score
        if kw[0] < minSco:
            return [(sco, word.title()) for sco, word in k[:i]]
    return k
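A minimal usage sketch for the RAKE-based extraction above; it assumes rake_nltk and the NLTK stopword/punkt data it needs are installed, and the sample text is invented:

from rake_nltk import Rake

rake = Rake()
sample = "<p>A sweeping space opera about a generation ship and its reluctant captain.</p>"
for score, phrase in getKeywords(sample, rake):
    print(round(score, 2), phrase)  # phrases come back title-cased, best-scoring first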
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
    G = nx.Graph()
    if extractKeywords:
        from rake_nltk.rake import Rake
        rake = Rake()
    # Books
    for book in books:
        tags = book['tags']
        if 'rating' in book:
            rating = book['rating']
        else:
            rating = None
        if 'comments' in book:
            desc = book['comments']
        else:
            desc = ''
        if 'comments' in book and extractKeywords:
            keywords = getKeywords(book['comments'], rake)
        else:
            keywords = []
        if mergeTags:
            # Extracted keywords are folded into the regular tag list
            tags = tags + [word for (score, word) in keywords]
        if 'series' in book:
            series = book['series']
            series_index = book['series_index']
        else:
            series = None
            series_index = None
        G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating,
                   tags=tags, keywords=keywords, desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index)
    return G
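A rough usage sketch for the extended graph builder; it assumes a working calibredb setup so loadBooksFromDB() returns data:

books = loadBooksFromDB()
# extractKeywords=False skips RAKE entirely; mergeTags=True folds keywords into the tag list
G_plain = buildBookGraph(books, extractKeywords=False)
G_rich = buildBookGraph(books, extractKeywords=True, mergeTags=True)
print(len(G_rich.nodes), "book nodes")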
@@ -837,6 +865,27 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
    if node in bestlist or node in keeplist:
        waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
def gensimTokensForLines(lines, tokens_only=False):
    for i, line in enumerate(lines):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def buildDoc2Vec(books):
    import gensim
    # Still a stub in this commit: 'G' and 'lines' are not defined in this scope yet
    for n in list(G.nodes):
        node = G.nodes[n]
        if node['t'] == 'book':
            pass
    gensimTokensForLines(lines)
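buildDoc2Vec is still a stub here; below is a sketch of how it might be completed with gensim's Doc2Vec API, assuming the book descriptions are the training corpus. The helper name buildDoc2VecSketch and the hyperparameters are illustrative, not taken from this commit:

import gensim

def buildDoc2VecSketch(G):
    # Collect the plain-text descriptions of all book nodes as training lines
    lines = [remove_html_tags(G.nodes[n]['desc']) for n in G.nodes
             if G.nodes[n]['t'] == 'book' and G.nodes[n]['desc']]
    corpus = list(gensimTokensForLines(lines))
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model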
def shell(G, books, mu, std):
    from ptpython.repl import embed
    embed(globals(), locals())
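The shell subcommand just embeds a ptpython REPL with the caller's globals and locals, so G, books, mu and std are available interactively; the queries below are only illustrative:

>>> sum(1 for n in G.nodes if G.nodes[n]['t'] == 'book')
>>> G.nodes[books[0]['id']]['keywords'][:3]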
def evaluateFitness(books, debugPrint=False):
    global weights
    G = buildBookGraph(books)
@@ -985,6 +1034,8 @@ def cliInterface():
    p_comp = cmds.add_parser('competence', description="TODO", aliases=[])
    p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
    p_full = cmds.add_parser('full', description="TODO", aliases=[])

    args = parser.parse_args()
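With the new subparser registered, shell mode is invoked like the other subcommands; the script name below is a placeholder, not taken from this diff:

./script.py shell   # placeholder name; drops into the ptpython REPL
./script.py full    # existing subcommand, unchanged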
@@ -1018,7 +1069,10 @@ def cliInterface():
        fullGraph(G, not args.keep_top_lists)
    elif args.cmd=="competence":
        recommenderCompetence(G)
    elif args.cmd=="shell":
        shell(G, books, mu, std)
    elif args.cmd=="progress":
        progress(G, args.m)
        return
    else: