Skip to content

Commit

Permalink
Add fallback heuristics for when the top node cannot be found
Browse files (browse the repository at this point in the history)
  • Loading branch information
idoshamun authored Oct 27, 2021
1 parent a4dcbe4 commit 7b95eb6
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,14 @@ def parse(self):
self.top_node = self.extractor.calculate_best_node(self.doc)
if self.top_node is None:
self.top_node = self.extractor.calculate_best_node(self.clean_doc)
if self.top_node is None:
self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
if self.top_node is None:
for tag in ['article', 'main']:
nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
if len(nodes) > 0:
self.top_node = nodes[0]
break
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())
Expand Down

0 comments on commit 7b95eb6

Please sign in to comment.