Sanitize description of books before rake
This commit is contained in:
parent
7af5109e7f
commit
a6d9a75030
32
caliGraph.py
32
caliGraph.py
@ -460,19 +460,22 @@ def remove_html_tags(text):
|
||||
return re.sub(clean, '', text)
|
||||
|
||||
def getKeywords(txt,rake):
    """Extract the top-ranked keyword phrases from a book description.

    HTML tags are stripped from ``txt`` via ``remove_html_tags`` before the
    text is handed to ``rake``. Each phrase score is rescaled with the
    exponent ``1 / (0.4 * number_of_words)``; phrases that look like markup
    residue or filler words are dropped.

    Args:
        txt: Description text, possibly containing HTML.
        rake: Object providing ``extract_keywords_from_text(text)`` and
            ``get_ranked_phrases_with_scores()`` (yielding ``(score, phrase)``
            pairs).

    Returns:
        A list of ``(score, phrase)`` tuples sorted by descending score,
        limited to phrases scoring at least two thirds of the best score.
        Phrases are always title-cased. Returns ``[]`` when no phrase
        survives filtering.
    """
    txt = remove_html_tags(txt)
    rake.extract_keywords_from_text(txt)

    # Phrases that are leftovers of HTML markup or carry no meaning.
    ignored = {'p', 'die', 'best', 'known', 'fk', 'p pp', 'one'}

    k = []
    for score, kw in rake.get_ranked_phrases_with_scores():
        lowered = kw.lower()
        if lowered in ignored or len(kw) <= 3:
            continue
        # Residue of <div> / <p> tags that survived tag stripping.
        if 'div' in kw or 'p p' in lowered:
            continue
        l = len(kw.split(' '))
        k.append((score**(1/(l*0.4)), kw))

    # Guard: indexing k[0] below would raise IndexError on an empty list.
    if not k:
        return []

    k.sort(key=lambda x: x[0], reverse=True)
    minSco = k[0][0]/3*2
    # k is sorted descending, so this keeps exactly the leading run of
    # phrases above the cutoff. Title-casing is applied on every path, also
    # when no phrase falls below the cutoff.
    return [(sco, word.title()) for sco, word in k if sco >= minSco]
|
||||
|
||||
def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
|
||||
G = nx.Graph()
|
||||
@ -492,7 +495,8 @@ def buildBookGraph(books, darkMode=False, extractKeywords=True, mergeTags=True):
|
||||
else:
|
||||
desc = ''
|
||||
if 'comments' in book and extractKeywords:
|
||||
keywords = getKeywords(book['comments'],rake)
|
||||
sanitized = re.sub(r'[^a-zA-Z0-9\s\.äöü]+', '', book['comments']).replace('\n',' ')
|
||||
keywords = getKeywords(sanitized,rake)
|
||||
else:
|
||||
keywords = []
|
||||
if mergeTags:
|
||||
|
Loading…
Reference in New Issue
Block a user