Skip to content

Commit

Permalink
Connect timestamps extraction to main_driver #8
Browse files: browse the repository at this point in the history
  • Loading branch information
JetamZ committed Oct 5, 2024
1 parent a742770 commit 724ba46
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 8 deletions.
4 changes: 2 additions & 2 deletions MetadataExtraction/speechParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import argparse
import tqdm

from timestampsExtractor import timestampsExtractor
from MetadataExtraction.timestampsExtractor import timestampsExtractor

from collections import defaultdict
from lxml import etree
Expand Down Expand Up @@ -101,7 +101,7 @@ def process_file(self, filePath):
result - dictionary where keys are IDs of speakers and values are lists of speeches
"""
# Extract the information on when the speech was given
te = timestampsExtractor(self.corpus_root, "timestampsCSV.xslt")
te = timestampsExtractor(self.corpus_root, "MetadataExtraction/timestampsCSV.xslt")
result = defaultdict()
root = (etree.parse(filePath)).getroot()
timestampsInfo = te.pipeline(filePath)
Expand Down
4 changes: 2 additions & 2 deletions MetadataExtraction/timestampsExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __transformFileCSV(self, filepath):
xml_file = filepath
xml_tree = etree.parse(xml_file)
result = self.apply_script(xml_tree)
with open("transformedCSV.csv", 'wb') as f:
with open("MetadataExtraction/transformedCSV.csv", 'wb') as f:
f.write(result)


Expand Down Expand Up @@ -154,7 +154,7 @@ def __get_total_duration(self, speech_timestamps):

def pipeline(self, filepath):
self.__transformFileCSV(filepath)
return self.__process_file("transformedCSV.csv")
return self.__process_file("MetadataExtraction/transformedCSV.csv")

def main(args):
tsExtractor = timestampsExtractor(args.corpus_root, args.script)
Expand Down
9 changes: 5 additions & 4 deletions main_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def __parse_speech_files(self):
transcript_files = teiCorpus.getElementsByTagName('xi:include')
for elem in tqdm.tqdm(transcript_files, leave=False, desc="Iterationg thorugh transcript_files"):
ref = elem.getAttribute("href")
filePath = self.source + '/' + ref
contents = speech_parser.process_file(filePath)
if contents:
self.databaseInserter.insert_speeches(contents)
if ref[0:2] == "ps":
filePath = self.source + '/' + ref
contents = speech_parser.process_file(filePath)
if contents:
self.databaseInserter.insert_speeches(contents)

def __parse_persons_file(self, file, source_corpus):
person_parser = personParser(file, source_corpus)
Expand Down

0 comments on commit 724ba46

Please sign in to comment.