# Phrases rejected outright as keywords. NOTE(review): 'p' / 'p pp' look like
# residue of HTML tags and the rest like filler words -- confirm against the
# descriptions this is run on.
_KEYWORD_STOPLIST = frozenset(['p', 'die', 'best', 'known', 'fk', 'p pp', 'one'])


def getKeywords(txt, rake):
    """Extract the strongest keyword phrases from a text.

    The text is passed through ``remove_html_tags`` and handed to ``rake``;
    every ranked phrase is re-scored by its word count, noise phrases are
    dropped, and only phrases scoring at least two thirds of the best
    score are kept.

    Args:
        txt: The text to extract keywords from.
        rake: Keyword extractor exposing ``extract_keywords_from_text(text)``
            and ``get_ranked_phrases_with_scores()``, the latter yielding
            ``(score, phrase)`` pairs.

    Returns:
        A list of ``(score, phrase)`` tuples sorted by descending score, with
        each phrase title-cased. Empty when no phrase survives filtering.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)

    scored = []
    for score, phrase in rake.get_ranked_phrases_with_scores():
        lowered = phrase.lower()
        # Drop stop-listed phrases, anything of three characters or fewer,
        # and phrases containing 'div' or 'p p'.
        if (lowered in _KEYWORD_STOPLIST or len(phrase) <= 3
                or 'div' in phrase or 'p p' in lowered):
            continue
        wordCount = len(phrase.split(' '))
        # The exponent shrinks as the word count grows, which (for scores
        # above 1) damps long phrases relative to short ones.
        scored.append((score ** (1 / (wordCount * 0.4)), phrase))

    if not scored:
        return []

    scored.sort(key=lambda entry: entry[0], reverse=True)
    minScore = scored[0][0] / 3 * 2
    # Title-case on every path: previously the phrases were returned in raw
    # form whenever no phrase fell below the cutoff.
    return [(sco, word.title()) for sco, word in scored if sco >= minScore]