Pull in higher-level inner text

WHOSpeeches · Jan 4, 2021 · 0737ef2 · 0737ef2
1 parent b5874a0
commit 0737ef2
Showing 1 changed file with 9 additions and 10 deletions.
diff --git a/code/convert_speeches_text.py b/code/convert_speeches_text.py
@@ -69,12 +69,10 @@ def _get_paragraphs(tree: etree) -> t.Iterator[str]:
     Gets the paragraphs from the speech
     There are several formats we need to try in order of decreasing occurrence
     """
-    xpaths = ["//article/div/p", "//article/div/div", "//article/div/br"]
-    for xpath in xpaths:
-        paragraphs = tree.findall(xpath)
-        paragraphs = [paragraph for paragraph in _get_innertext(paragraphs)]
-        if len(paragraphs) > 0:
-            return paragraphs
+    paragraphs = tree.findall('//article/div')
+    paragraphs = [paragraph for paragraph in _get_innertext(paragraphs)]
+    if len(paragraphs) > 0:
+        return paragraphs
 
 @typechecked
 def _get_innertext(nodes: t.List[etree.Element]) -> t.Iterator[str]:
@@ -108,7 +106,8 @@ def _get_innertext(nodes: t.List[etree.Element]) -> t.Iterator[str]:
         help = 'File containing the speeches'' text',
         type = pathlib.Path,
         required = True)
-    args = parser.parse_args()
-    print(f'folder in: {args.folder_in}')
-    print(f'file out: {args.file_out}')
-    convert_speeches_text(args.folder_in, args.file_out)
+    #args = parser.parse_args()
+    #print(f'folder in: {args.folder_in}')
+    #print(f'file out: {args.file_out}')
+    #convert_speeches_text(args.folder_in, args.file_out)
+    convert_speeches_text(pathlib.Path('d:/datasets/who/raw'), pathlib.Path('d:/datasets/who/corpus.jsonl'))