Connect timestamps extraction to main_driver #8

ufal · Oct 5, 2024 · 724ba46
1 parent a742770
commit 724ba46
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 8 deletions.
diff --git a/MetadataExtraction/speechParser.py b/MetadataExtraction/speechParser.py
@@ -4,7 +4,7 @@
 import argparse
 import tqdm
 
-from timestampsExtractor import timestampsExtractor
+from MetadataExtraction.timestampsExtractor import timestampsExtractor
 
 from collections import defaultdict
 from lxml import etree
@@ -101,7 +101,7 @@ def process_file(self, filePath):
             result - dictionary where keys are IDs of speakers and values are lists of speeches
         """
         # Extract the information on when the speech was given
-        te = timestampsExtractor(self.corpus_root, "timestampsCSV.xslt")
+        te = timestampsExtractor(self.corpus_root, "MetadataExtraction/timestampsCSV.xslt")
         result = defaultdict()
         root = (etree.parse(filePath)).getroot()
         timestampsInfo = te.pipeline(filePath)

diff --git a/MetadataExtraction/timestampsExtractor.py b/MetadataExtraction/timestampsExtractor.py
@@ -51,7 +51,7 @@ def __transformFileCSV(self, filepath):
         xml_file = filepath
         xml_tree = etree.parse(xml_file)
         result = self.apply_script(xml_tree)
-        with open("transformedCSV.csv", 'wb') as f:
+        with open("MetadataExtraction/transformedCSV.csv", 'wb') as f:
             f.write(result)
 
 
@@ -154,7 +154,7 @@ def __get_total_duration(self,  speech_timestamps):
 
     def pipeline(self, filepath):
         self.__transformFileCSV(filepath)
-        return self.__process_file("transformedCSV.csv")
+        return self.__process_file("MetadataExtraction/transformedCSV.csv")
 
 def main(args):
     tsExtractor = timestampsExtractor(args.corpus_root, args.script)

diff --git a/main_driver.py b/main_driver.py
@@ -35,10 +35,11 @@ def __parse_speech_files(self):
         transcript_files = teiCorpus.getElementsByTagName('xi:include')
         for elem in tqdm.tqdm(transcript_files, leave=False, desc="Iterationg thorugh transcript_files"):
             ref = elem.getAttribute("href")
-            filePath = self.source + '/' + ref
-            contents = speech_parser.process_file(filePath)
-            if contents:
-                self.databaseInserter.insert_speeches(contents)
+            if ref[0:2] == "ps":
+                filePath = self.source + '/' + ref
+                contents = speech_parser.process_file(filePath)
+                if contents:
+                    self.databaseInserter.insert_speeches(contents)
 
     def __parse_persons_file(self, file, source_corpus):
         person_parser = personParser(file, source_corpus)