From 199fab78754f35649183b6ff308249468fc7f2d9 Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Sat, 11 Dec 2021 13:58:01 +0100
Subject: [PATCH] New shell-command, nltk for keyword extraction from description

---
Review notes (git-am ignores everything from the "---" line up to the
first "diff --git" line):

* Line structure restored: the patch had been flattened onto two lines.
  Indentation of context lines is reconstructed assuming 4-space indents;
  use `git apply --ignore-whitespace` if it does not match the target.
* cliInterface: the recorded change replaced the "progress" branch with a
  second, unreachable "competence" branch, which left `progress` without
  a branch in the visible chain. Only the "shell" branch is added now.
* getKeywords: returns [] when RAKE yields no phrases (previously an
  IndexError on k[0]) and title-cases every returned phrase, not only
  when a cut-off position was found.
* gensimTokensForLines: `gensim` and `tokens_only` were unbound names;
  gensim is now imported locally and tokens_only is a parameter that
  defaults to False.
* Still open: buildDoc2Vec reads the unbound names G and lines and
  discards the generator it creates; nothing in this patch calls it.
* Hunk offsets and the diffstat are recomputed for the amended content;
  the post-image blob id on the "index" line is the one recorded for the
  original commit.

 caliGraph.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/caliGraph.py b/caliGraph.py
index 8376ba1..2bb17e1 100755
--- a/caliGraph.py
+++ b/caliGraph.py
@@ -1,5 +1,6 @@
 #!./.venv/bin/python3.10
 import os
+import re
 import json
 import math
 import copy
@@ -454,28 +455,57 @@ def readColor(book):
 def loadBooksFromDB():
     return json.loads(os.popen("calibredb list --for-machine -f all").read())
 
+def remove_html_tags(text):
+    clean = re.compile('<.*?>')
+    return re.sub(clean, '', text)
 
-def buildBookGraph(books, darkMode=False):
+def getKeywords(txt,rake):
+    # Rank the RAKE phrases of the HTML-stripped text and keep those scoring
+    # at least 2/3 of the best one, title-cased.
+    txt = remove_html_tags(txt)
+    rake.extract_keywords_from_text(txt)
+    k = []
+    for score,kw in rake.get_ranked_phrases_with_scores():
+        l = len(kw.split(' '))
+        # The exponent 2/l shrinks as the phrase gets longer.
+        k.append((score**(1/(l*0.5)),kw))
+    if not k:
+        # No phrases extracted (e.g. empty text): nothing to rank.
+        return []
+    k.sort(key=lambda x: x[0],reverse=True)
+    minSco = k[0][0]/3*2
+    return [(sco,word.title()) for sco,word in k if sco >= minSco]
+
+def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
     G = nx.Graph()
+    if extractKeywords:
+        from rake_nltk.rake import Rake
+        rake = Rake()
 
     # Books
     for book in books:
+        tags = book['tags']
         if 'rating' in book:
             rating = book['rating']
         else:
             rating = None
         if 'comments' in book:
-            desc = '' # book['comments']
+            desc = book['comments']
         else:
             desc = ''
+        if 'comments' in book and extractKeywords:
+            keywords = getKeywords(book['comments'],rake)
+        else:
+            keywords = []
+        if mergeTags:
+            tags = tags + [word for (score, word) in keywords]
         if 'series' in book:
             series = book['series']
             series_index = book['series_index']
         else:
             series = None
             series_index = None
-        G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating,
-                   tags=book['tags'], desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index)
+        G.add_node(book['id'], t='book', label=book['title'], title=book['title'], shape='image', image=book['cover'], rating=rating, tags=tags, keywords=keywords, desc=desc, isbn=book['isbn'], files=book['formats'], authors=getAuthors(book), series=series, series_index=series_index)
 
     return G
 
@@ -837,6 +867,28 @@ def waveFlow(G, node, n, dist, menge, firstEdge=False):
         if node in bestlist or node in keeplist:
             waveFlow(G, node, m, dist, menge, firstEdge=firstEdge)
 
+def gensimTokensForLines(lines, tokens_only=False):
+    import gensim
+    for i, line in enumerate(lines):
+        tokens = gensim.utils.simple_preprocess(line)
+        if tokens_only:
+            yield tokens
+        else:
+            # For training data, add tags
+            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
+
+def buildDoc2Vec(books):
+    import gensim
+    for n in list(G.nodes):
+        node = G.nodes[n]
+        if node['t'] == 'book':
+            pass
+    gensimTokensForLines(lines)
+
+def shell(G, books, mu, std):
+    from ptpython.repl import embed
+    embed(globals(), locals())
+
 def evaluateFitness(books, debugPrint=False):
     global weights
     G = buildBookGraph(books)
@@ -985,6 +1037,8 @@ def cliInterface():
 
     p_comp = cmds.add_parser('competence', description="TODO", aliases=[])
 
+    p_shell = cmds.add_parser('shell', description="TODO", aliases=[])
+
     p_full = cmds.add_parser('full', description="TODO", aliases=[])
 
     args = parser.parse_args()
@@ -1018,7 +1072,9 @@ def cliInterface():
         fullGraph(G, not args.keep_top_lists)
     elif args.cmd=="competence":
         recommenderCompetence(G)
+    elif args.cmd=="shell":
+        shell(G, books, mu, std)
     elif args.cmd=="progress":
         progress(G, args.m)
         return
     else: