Skip to content

Commit

Permalink
Incorporate the timestamps extraction into speechParser #8
Browse files Browse the repository at this point in the history
  • Loading branch information
JetamZ committed Oct 5, 2024
1 parent a51be59 commit 7a87162
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 17 deletions.
20 changes: 17 additions & 3 deletions MetadataExtraction/speechParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import argparse
import tqdm

from timestampsExtractor import timestampsExtractor

from collections import defaultdict
from lxml import etree
import xml.etree.ElementTree as ET
Expand All @@ -27,6 +29,8 @@ class Speech:
speakerID = str()
when = str()
total_duration = int()
earliest_timeline = str()
latest_timeline = str()

def __init__(self, tokens, sentences, NE_refs, role, speech_id, speaker, when):
self.tokens = tokens
Expand All @@ -36,9 +40,12 @@ def __init__(self, tokens, sentences, NE_refs, role, speech_id, speaker, when):
self.speechID = speech_id
self.speakerID = speaker
self.when = when
self.total_duration = None
self.earliest_timeline = None
self.latest_timeline = None

def __str__(self):
result = f"***\nSpeech {self.speechID} given by {self.speakerID} at {self.when}\ntokens: {self.tokens}\nsentences: {self.sentences}\nnamed entity refferences: {self.named_entity_refferences}\n***\n"
result = f"***\nSpeech {self.speechID} given by {self.speakerID} at {self.when}\ntokens: {self.tokens}\nsentences: {self.sentences}\nnamed entity refferences: {self.named_entity_refferences}\ntotal_duration: {self.total_duration}\nearliest_timeline: {self.earliest_timeline}\nlatest_timeline: {self.latest_timeline}\n***\n"
return result

class speechParser:
Expand Down Expand Up @@ -94,14 +101,16 @@ def process_file(self, filePath):
result - dictionary where keys are IDs of speakers and values are lists of speeches
"""
# Extract the information on when the speech was given
te = timestampsExtractor(self.corpus_root, "timestampsCSV.xslt")
result = defaultdict()
root = (etree.parse(filePath)).getroot()
timestampsInfo = te.pipeline(filePath)
current_speech = 0
namespace = '{'+str(list(root.nsmap.values())[0])+'}'
if root.tag == f"{namespace}TEI":
date = root.find('.//teiHeader/profileDesc/settingDesc/setting/date', root.nsmap)
when = date.get('when')


# Extract other information
utterances = root.findall(".//text/body/div/u", root.nsmap)
for u in utterances:
Expand All @@ -113,11 +122,16 @@ def process_file(self, filePath):
tokens_count, sentences_count, named_entities_count = self.__get_relevant_tags_count(u)

ut = Speech(tokens_count, sentences_count, named_entities_count, role, utterance_id, speaker, when)

if (len(timestampsInfo) > 0):
ut.earliest_timeline = timestampsInfo[current_speech][0]
ut.latest_timeline = timestampsInfo[current_speech][1]
ut.total_duration = timestampsInfo[current_speech][2]
if not speaker in result.keys():
result[speaker] = [ut]
else:
result[speaker].append(ut)

current_speech+=1
return result
return None

Expand Down
1 change: 0 additions & 1 deletion MetadataExtraction/timestampsCSV.xslt
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,4 @@
<xsl:value-of select="key('whenByID', substring($sinceRef, 2))/@absolute" />
<xsl:text>&#10;</xsl:text>
</xsl:template>

</xsl:stylesheet>
37 changes: 24 additions & 13 deletions MetadataExtraction/timestampsExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@ def __init__(self, corpus_root, script):
with open("TokensMissingAnchors.csv", 'a') as f:
f.write("ID\n")

def transformFileCSV(self, filepath):
def __transformFileCSV(self, filepath):

xml_file = filepath
xml_tree = etree.parse(xml_file)
result = self.apply_script(xml_tree)
with open("transformedCSV.csv", 'wb') as f:
f.write(result)


def extractTimestamps(self):
"""
Pass through each transcript file and extract the timestamps
Expand All @@ -64,9 +65,9 @@ def extractTimestamps(self):
for elem in tqdm.tqdm(transcript_files, leave=False, desc="Iterating thorugh transcript files."):
ref = elem.getAttribute("href")
filepath = self.corpus_dir + "/" + ref
self.transformFileCSV(filepath)
self.__transformFileCSV(filepath)

def process_speeches(self, file_out):
def __process_file(self, file_out):
"""
Method for separating the timestamps of different speeches in the output of the
transformation XSLT script.
Expand Down Expand Up @@ -112,20 +113,27 @@ def process_speeches(self, file_out):
f.write(row['ID'] + ',\n')

else:
self.speakers[speakers_in_file[current_speech]] += self.__get_total_duration(intervals)
# self.speakers[speakers_in_file[current_speech]] += self.__get_total_duration(intervals)

if (len(times) != 1):
if (len(times) >= 1):
results.append([times[0], times[-1], self.__get_total_duration(intervals)])
else:
results.append([times[0], times[0], self.__get_total_duration(intervals)])

results.append([None, None, self.__get_total_duration(intervals)])
current_speech += 1
intervals = []
times = [actual_timeline]

# print(self.speakers)
# Store the leftovers
# print(times)
if (len(times) >= 1):
results.append([times[0], times[-1], self.__get_total_duration(intervals)])

current_speech += 1
intervals = []
times = [actual_timeline]

return results

def __get_total_duration(self, speech_timestamps):
"""
Method responsible for getting the total duration of the given speech
Expand All @@ -140,14 +148,17 @@ def __get_total_duration(self, speech_timestamps):
-----------
speech_timestamps - timestamps for the currently examined speech (CSV file or raw, not decided yet)
"""
# print(speech_timestamps)
if (len(speech_timestamps)) < 1:
return 0
return abs(float(speech_timestamps[-1]) - float(speech_timestamps[0]))


# Run the complete timestamp extraction for a single transcript file.
# Step 1: __transformFileCSV parses `filepath`, applies the script to it and
# writes the output to "transformedCSV.csv".
# Step 2: __process_file reads that CSV and its return value (the per-speech
# `results` list) is passed straight back to the caller.
def pipeline(self, filepath):
self.__transformFileCSV(filepath)
# The intermediate file name is fixed and opened in 'wb' mode by the step
# above, so it is overwritten on every call.
return self.__process_file("transformedCSV.csv")

def main(args):
tsExtractor = timestampsExtractor(args.corpus_root, args.script)
tsExtractor.transformFileCSV(f"../../ParCzech.TEI.ana/ps2013-001/{args.specific_file}")
print(tsExtractor.process_speeches("transformedCSV.csv"))
print(tsExtractor.pipeline(f"../../ParCzech.TEI.ana/ps2013-001/{args.specific_file}"))
# tsExtractor.extractTimestamps()
# print(args.script)
print("Done!")
Expand Down

0 comments on commit 7a87162

Please sign in to comment.