Skip to content

Commit

Permalink
Format timelines and add absolute time of speech start and speech end #8
Browse files Browse the repository at this point in the history
  • Loading branch information
JetamZ committed Oct 27, 2024
1 parent 3257b54 commit 06ede3d
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 10 deletions.
4 changes: 3 additions & 1 deletion DatabaseCommunication/DatabaseInserter.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ def insert_speeches(self, speeches):
s.unaligned_tokens,
s.time_spoken,
s.time_silent,
s.time_unknown))
s.time_unknown,
s.time_start,
s.time_end))

self.connection.commit()
7 changes: 5 additions & 2 deletions DatabaseCommunication/DatabaseTableCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,15 @@ def create_tables(self):
role VARCHAR(100),
person_id VARCHAR(100),
total_duration REAL,
earliest_timestamp VARCHAR(100),
latest_timestamp VARCHAR(100),
earliest_timestamp TIME,
latest_timestamp TIME,
unaligned_tokens INTEGER,
time_spoken REAL,
time_silent REAL,
time_unknown REAL,
time_start TIME,
time_end TIME,
FOREIGN KEY (person_id)
REFERENCES Person (person_id)
)
Expand Down
4 changes: 2 additions & 2 deletions DatabaseCommunication/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ class OrganisationCommands(StrEnum):

class SpeechCommands(StrEnum):
INSERT_ALL = """
INSERT INTO speech(id, date, token_count, sentence_count, named_entity_count, role, person_id, total_duration, earliest_timestamp, latest_timestamp, unaligned_tokens, time_spoken, time_silent, time_unknown)
VALUES(%s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING
INSERT INTO speech(id, date, token_count, sentence_count, named_entity_count, role, person_id, total_duration, earliest_timestamp, latest_timestamp, unaligned_tokens, time_spoken, time_silent, time_unknown, time_start, time_end)
VALUES(%s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING
"""
44 changes: 39 additions & 5 deletions MetadataExtraction/speechParser2.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import xml.etree.ElementTree as ET
import xml.dom.minidom
import csv
from datetime import datetime, timedelta


args_parser = argparse.ArgumentParser()
args_parser.add_argument("--file", type=str, default="../../ParCzech.TEI.ana/ps2013-001/ps2013-001-01-000-999.ana.xml")
Expand Down Expand Up @@ -38,16 +40,38 @@ def __init__(self, tokens, sentences, NE_refs, role, speech_id, speaker, when):
self.time_spoken = None
self.time_silent = None
self.time_unknown = None
self.time_start = None
self.time_end = None

def loadTimestampsInfo(self, timestamps_info):
    """Fill in the speech's timing attributes from one timestamps record.

    ``timestamps_info`` is read positionally:

    * ``[0]`` -- pair ``(timeline timestamp, begin offset)`` for the
      earliest timeline,
    * ``[1]`` -- pair ``(timeline timestamp, end offset)`` for the latest
      timeline,
    * ``[2]`` -- total duration,
    * ``[3]`` -- number of unaligned tokens,
    * ``[4]`` -- time spoken,
    * ``[5]`` -- time silent.

    Timeline timestamps are parsed as ``%Y-%m-%dT%H:%M:%S`` and stored
    back as ``%H:%M:%S``. Each offset is divided by 1000 and added to its
    timeline as seconds to produce ``time_start`` / ``time_end``. An
    offset that is missing (``None`` or empty) leaves the corresponding
    attribute as ``None`` instead of raising ``TypeError``.
    """
    first, last = timestamps_info[0], timestamps_info[1]
    if first and last:
        self.earliest_timeline = first[0]
        self.latest_timeline = last[0]
    else:
        # No usable pair: keep the raw entries, as the plain assignment did.
        self.earliest_timeline = first
        self.latest_timeline = last

    self.total_duration = timestamps_info[2]
    self.unaligned_tokens = timestamps_info[3]
    self.time_spoken = timestamps_info[4]
    self.time_silent = timestamps_info[5]
    # Whatever is neither spoken nor silent is unaccounted for; never negative.
    self.time_unknown = max(
        0, self.total_duration - self.time_spoken - self.time_silent
    )

    if not (self.earliest_timeline and self.latest_timeline):
        return

    timeline_format = "%Y-%m-%dT%H:%M:%S"
    clock_format = "%H:%M:%S"

    def shifted_clock(anchor, offset):
        # The offset is absent when no aligned token supplied one; without
        # it the absolute time cannot be derived.
        if offset is None or offset == "":
            return None
        moved = anchor + timedelta(seconds=float(offset) / 1000)
        return moved.strftime(clock_format)

    dt_earliest = datetime.strptime(self.earliest_timeline, timeline_format)
    dt_latest = datetime.strptime(self.latest_timeline, timeline_format)

    self.time_start = shifted_clock(dt_earliest, first[1])
    self.time_end = shifted_clock(dt_latest, last[1])

    self.earliest_timeline = dt_earliest.strftime(clock_format)
    self.latest_timeline = dt_latest.strftime(clock_format)

def __str__(self):
result ="---SPEECH---\n"
result += f"ID: {self.speechID}\n"
Expand All @@ -64,6 +88,8 @@ def __str__(self):
result += f"unaligned tokens: {self.unaligned_tokens}\n"
result += f"earliest timeline: {self.earliest_timeline}\n"
result += f"latest timeline: {self.latest_timeline}\n"
result += f"time start: {self.time_start}\n"
result += f"time end: {self.time_end}"
return result

class speechParser2:
Expand Down Expand Up @@ -169,6 +195,8 @@ def __processTimestampsCSV(self):
unaligned_tokens = 0
time_silent = 0
rows = list(reader)
first_interval = None
last_interval = None
actual_timeline = None
previous_end = None
for row in rows:
Expand All @@ -187,9 +215,10 @@ def __processTimestampsCSV(self):
else:
leftovers = self.__get_total_duration_ms(intervals)
if (leftovers > 0):
last_interval = intervals[-1]
total_duration += leftovers

results.append([times[0], times[-1], total_duration,unaligned_tokens,
results.append([(times[0],first_interval), (times[-1], last_interval), total_duration,unaligned_tokens,
total_spoken, time_silent])
total_spoken = 0
intervals = []
Expand All @@ -199,6 +228,7 @@ def __processTimestampsCSV(self):
total_duration = 0
time_silent = 0
previous_end = None
first_interval, last_interval = None, None
elif row['Type'] == 'T':
if ((row['Time'] != actual_timeline) and (row['Time'] != '')):
actual_timeline = row['Time']
Expand All @@ -210,6 +240,10 @@ def __processTimestampsCSV(self):
if (row['Begin'] and row['End']):
if previous_end != None:
time_silent += float(row['Begin']) - float(previous_end)

if (first_interval == None):
first_interval = row["Begin"]

intervals.append(row['Begin'])
intervals.append(row['End'])
total_spoken += float(row['End']) - float(row['Begin'])
Expand All @@ -218,7 +252,7 @@ def __processTimestampsCSV(self):
unaligned_tokens += 1
previous_end = None


return results

def __get_total_duration_ms(self, speech_timestamps):
Expand Down

0 comments on commit 06ede3d

Please sign in to comment.