Skip to content

Commit

Permalink
pull in a higher level inner text
Browse files (browse the repository at this point in the history)
  • Loading branch information
markanewman committed Jan 4, 2021
1 parent b5874a0 commit 0737ef2
Showing 1 changed file with 9 additions and 10 deletions.
19 changes: 9 additions & 10 deletions code/convert_speeches_text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,12 +69,10 @@ def _get_paragraphs(tree: etree) -> t.Iterator[str]:
Gets the paragraphs from the speech
There are several formats we need to try in order of decreasing occurrence
"""
xpaths = ["//article/div/p", "//article/div/div", "//article/div/br"]
for xpath in xpaths:
paragraphs = tree.findall(xpath)
paragraphs = [paragraph for paragraph in _get_innertext(paragraphs)]
if len(paragraphs) > 0:
return paragraphs
paragraphs = tree.findall('//article/div')
paragraphs = [paragraph for paragraph in _get_innertext(paragraphs)]
if len(paragraphs) > 0:
return paragraphs

@typechecked
def _get_innertext(nodes: t.List[etree.Element]) -> t.Iterator[str]:
Expand Down Expand Up @@ -108,7 +106,8 @@ def _get_innertext(nodes: t.List[etree.Element]) -> t.Iterator[str]:
help = 'File containing the speeches'' text',
type = pathlib.Path,
required = True)
args = parser.parse_args()
print(f'folder in: {args.folder_in}')
print(f'file out: {args.file_out}')
convert_speeches_text(args.folder_in, args.file_out)
#args = parser.parse_args()
#print(f'folder in: {args.folder_in}')
#print(f'file out: {args.file_out}')
#convert_speeches_text(args.folder_in, args.file_out)
convert_speeches_text(pathlib.Path('d:/datasets/who/raw'), pathlib.Path('d:/datasets/who/corpus.jsonl'))

0 comments on commit 0737ef2

Please sign in to comment.