Fix Medium heuristic for finding the content node

dailydotdev · Oct 25, 2021 · a737681 · a737681
1 parent a5b15ff
commit a737681
Showing 1 changed file with 10 additions and 7 deletions.
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
@@ -1014,13 +1014,16 @@ def nodes_to_check(self, doc):
         on like paragraphs and tables
         """
         nodes_to_check = []
-        for tag in ['p', 'pre', 'td', 'ol', 'ul']:
-            items = self.parser.getElementsByTag(doc, tag=tag)
-            nodes_to_check += items
-        for tag in ['section']:
-            items = self.parser.getElementsByTag(doc, tag=tag)
-            if len(items) > 1:
-                nodes_to_check = items
+        articles = self.parser.getElementsByTag(doc, tag='article')
+        if len(articles) > 0:
+            # Specific heuristic for Medium articles
+            sections = self.parser.getElementsByTag(articles[0], tag='section')
+            if len(sections) > 1:
+                nodes_to_check = sections
+        if len(nodes_to_check) == 0:
+            for tag in ['p', 'pre', 'td', 'ol', 'ul']:
+                items = self.parser.getElementsByTag(doc, tag=tag)
+                nodes_to_check += items
         return nodes_to_check
 
     def is_table_and_no_para_exist(self, e):