Skip to content

Commit

Permalink
Fix the Medium heuristic for finding the content node
Browse files (browse the repository at this point in the history)
  • Loading branch information
idoshamun authored Oct 25, 2021
1 parent a5b15ff commit a737681
Showing 1 changed file with 10 additions and 7 deletions.
17 changes: 10 additions & 7 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,13 +1014,16 @@ def nodes_to_check(self, doc):
on like paragraphs and tables
"""
nodes_to_check = []
for tag in ['p', 'pre', 'td', 'ol', 'ul']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
for tag in ['section']:
items = self.parser.getElementsByTag(doc, tag=tag)
if len(items) > 1:
nodes_to_check = items
articles = self.parser.getElementsByTag(doc, tag='article')
if len(articles) > 0:
# Specific heuristic for Medium articles
sections = self.parser.getElementsByTag(articles[0], tag='section')
if len(sections) > 1:
nodes_to_check = sections
if len(nodes_to_check) == 0:
for tag in ['p', 'pre', 'td', 'ol', 'ul']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
return nodes_to_check

def is_table_and_no_para_exist(self, e):
Expand Down

0 comments on commit a737681

Please sign in to comment.