Add additional data to speech entry in database #8

ufal · Oct 17, 2024 · f99d76e · f99d76e
1 parent e22089d
commit f99d76e
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 16 deletions.
diff --git a/DatabaseCommunication/DatabaseInserter.py b/DatabaseCommunication/DatabaseInserter.py
@@ -32,7 +32,7 @@ def insert_persons(self, persons):
                 if len(p.sex) > 1:
                     p.sex = 'U'
 
-                cursor.execute(PersonCommands.INSERT_ALL,(p.personID, p.sex, p.birth,))
+                cursor.execute(PersonCommands.INSERT_ALL,(p.personID, p.sex, p.birth))
                 self.__insert_name_records(p.name_records,p.personID, cursor)
             self.connection.commit()
 
@@ -114,14 +114,19 @@ def insert_speeches(self, speeches):
         with self.connection.cursor() as cursor:
             for author in speeches:
                 for s in speeches[author]:
-                    cursor.execute(SpeechCommands.INSERT_ALL, (s.when,
+                    cursor.execute(SpeechCommands.INSERT_ALL, (s.speechID,
+                                                               s.when,
                                                                str(s.tokens),
                                                                str(s.sentences),
                                                                str(s.named_entity_refferences),
                                                                s.role[1:],
                                                                s.speakerID[1:],
                                                                s.total_duration,
                                                                s.earliest_timeline,
-                                                               s.latest_timeline))
+                                                               s.latest_timeline,
+                                                               s.unaligned_tokens,
+                                                               s.time_spoken,
+                                                               s.time_silent,
+                                                               s.time_unknown))
 
             self.connection.commit()
diff --git a/DatabaseCommunication/DatabaseTableCreator.py b/DatabaseCommunication/DatabaseTableCreator.py
@@ -48,7 +48,7 @@ def create_tables(self):
                 """,
                 """
                 CREATE TABLE IF NOT EXISTS speech (
-                    id SERIAL PRIMARY KEY,
+                    id VARCHAR(100) PRIMARY KEY,
                     date DATE,
                     token_count INTEGER, 
                     sentence_count INTEGER,
@@ -58,6 +58,10 @@ def create_tables(self):
                     total_duration REAL,
                     earliest_timestamp VARCHAR(100),
                     latest_timestamp VARCHAR(100),
+                    unaligned_tokens INTEGER,
+                    time_spoken REAL,
+                    time_silent REAL,
+                    time_unknown REAL,
                     FOREIGN KEY (person_id)
                         REFERENCES Person (person_id)
                 )

diff --git a/DatabaseCommunication/commands.py b/DatabaseCommunication/commands.py
@@ -13,6 +13,6 @@ class OrganisationCommands(StrEnum):
 
 class SpeechCommands(StrEnum):
     INSERT_ALL = """
-                 INSERT INTO speech(date, token_count, sentence_count, named_entity_count, role, person_id, total_duration, earliest_timestamp, latest_timestamp)
-                 VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)
+                 INSERT INTO speech(id, date, token_count, sentence_count, named_entity_count, role, person_id, total_duration, earliest_timestamp, latest_timestamp, unaligned_tokens, time_spoken, time_silent, time_unknown)
+                 VALUES(%s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING
                  """
diff --git a/MetadataExtraction/timestampsCSV.xslt b/MetadataExtraction/timestampsCSV.xslt
@@ -14,19 +14,24 @@
 	<!-- ID - of the speaker if the type is 'S' or of the token if the Type is 'T' -->
 	<!-- begin - marks the beginning of the token in audio -->
 	<!-- end - marks the end of the token in audio -->
-		<xsl:text>Type,ID,Speech,Begin,End,Time&#10;</xsl:text>
+		<xsl:text>Type,ID,Begin,End,Duration,Time&#10;</xsl:text>
 		<xsl:apply-templates select="tei:text/tei:body/tei:div/tei:u" />
-		<xsl:for-each select="//tei:w">
-			<xsl:call-template name="word" />
-		</xsl:for-each>
+
+		<!-- Artificially insert one "speaker" row so that the information about the last speech is stored. -->
+		<xsl:text>S,END,END,END,END,END</xsl:text>
 	</xsl:template>
 
 
 	<!-- Keep the ID of a speaker -->
 	<xsl:template match="tei:u">
 		<xsl:text>S,</xsl:text>
 		<xsl:value-of select="@who"/>
-		<xsl:text>,,&#10;</xsl:text>
+		<xsl:text>,</xsl:text>
+		<xsl:value-of select="@xml:id" />
+		<xsl:text>,,,&#10;</xsl:text>
+		<xsl:for-each select="descendant::tei:w">
+			<xsl:call-template name="word" />
+		</xsl:for-each>
 	</xsl:template>
 
 	<!-- <xsl:template match="tei:w"> -->
@@ -36,10 +41,6 @@
 		<xsl:value-of select="@xml:id" />
 		<xsl:text>,</xsl:text>
 
-		<!-- Get the utterance the tag belongs to -->
-		<xsl:value-of select="substring-before(substring-after(@xml:id, 'u'), '.p')" />
-		<xsl:text>,</xsl:text>
-
 		<!-- Get the start timestamp of the tag -->		
 		<xsl:variable name="startSynch" select="preceding-sibling::tei:anchor[1]/@synch" />
 		<xsl:value-of select="key('whenByID', substring($startSynch, 2))/@interval" />
@@ -49,7 +50,12 @@
 		<xsl:variable name="endSynch" select="following-sibling::tei:anchor[1]/@synch" />
 		<xsl:value-of select="key('whenByID', substring($endSynch, 2))/@interval" />
 		<xsl:text>,</xsl:text>
-
+
+		<!-- Get the duration of the tag -->
+		<xsl:variable name="first" select="key('whenByID', substring($endSynch, 2))/@interval " />
+		<xsl:variable name="second" select="key('whenByID', substring($startSynch, 2))/@interval " />
+		<xsl:value-of select="$first - $second" /> 
+		<xsl:text>,</xsl:text>
 		<!-- Get the time the speech was given-->
 		<xsl:variable name="sinceRef" select="key('whenByID', substring($startSynch, 2))/@since" />
 		<xsl:value-of select="key('whenByID', substring($sinceRef, 2))/@absolute" />