Sanitize description of books before rake

This commit is contained in:
Dominik Moritz Roth 2021-12-18 18:09:45 +01:00
parent 7af5109e7f
commit a6d9a75030

View File

@ -460,19 +460,22 @@ def remove_html_tags(text):
return re.sub(clean, '', text)
def getKeywords(txt, rake):
    """Extract the strongest keyword phrases from a piece of text.

    The text is first passed through ``remove_html_tags`` and then handed to
    ``rake`` (an object exposing ``extract_keywords_from_text`` and
    ``get_ranked_phrases_with_scores``). Each phrase score is damped by the
    number of words in the phrase, phrases that look like markup residue are
    dropped, and only phrases scoring at least two thirds of the best score
    are kept.

    Args:
        txt: Raw description text, possibly containing HTML.
        rake: Keyword extractor providing the two methods named above.

    Returns:
        A list of ``(score, keyword)`` tuples sorted by descending score,
        with every keyword title-cased. Empty if no phrase survives the
        filtering.
    """
    # Phrases that are never useful as keywords (mostly leftovers of HTML
    # tags and very common words). Compared against the lower-cased phrase.
    blocked = ('p', 'die', 'best', 'known', 'fk', 'p pp', 'one')

    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)

    scored = []
    for score, phrase in rake.get_ranked_phrases_with_scores():
        lowered = phrase.lower()
        if lowered in blocked or len(phrase) <= 3:
            continue
        # Remnants of <div> / <p> tags show up as 'div' or 'p p' fragments.
        if 'div' in phrase or 'p p' in lowered:
            continue
        wordCount = len(phrase.split(' '))
        # Take a word-count dependent root of the score so that long
        # phrases do not dominate purely because of their length.
        scored.append((score ** (1 / (wordCount * 0.4)), phrase))

    if not scored:
        return []

    scored.sort(key=lambda entry: entry[0], reverse=True)
    minSco = scored[0][0] / 3 * 2

    # The list is sorted in descending order, so keeping every entry at or
    # above the threshold is the same as cutting at the first entry below
    # it. Title-casing is applied on every path so callers always receive
    # keywords in the same form.
    return [(sco, word.title()) for sco, word in scored if sco >= minSco]
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
G = nx.Graph()
@ -492,7 +495,8 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
else:
desc = ''
if 'comments' in book and extractKeywords:
keywords = getKeywords(book['comments'],rake)
sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n',' ')
keywords = getKeywords(sanitized,rake)
else:
keywords = []
if mergeTags: