From cc909ebb07c9548c722c637b2e20ca33b5dd6a1e Mon Sep 17 00:00:00 2001 From: Matej Date: Thu, 24 Oct 2024 13:32:43 +0200 Subject: [PATCH] Add first version of speech validation and some tests for it #10 #9 --- MetadataExtraction/speechParser2.py | 75 +++- ...tualTimestampsMultipleTimelinesInvalid.txt | 0 .../actual/actualTimestampsValidInvalid.txt | 15 + ...ctedTimestampsMultipleTimelinesInvalid.txt | 2 + .../expectedTimestampsValidInvalid.txt | 16 + .../timestampsMultipleTimelinesInvalid.xml | 255 ++++++++++++ .../inputs/timestampsValidInvalid.xml | 385 ++++++++++++++++++ test/MetadataExtraction/tester.py | 2 + 8 files changed, 741 insertions(+), 9 deletions(-) create mode 100644 test/MetadataExtraction/examples/actual/actualTimestampsMultipleTimelinesInvalid.txt create mode 100644 test/MetadataExtraction/examples/actual/actualTimestampsValidInvalid.txt create mode 100644 test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelinesInvalid.txt create mode 100644 test/MetadataExtraction/examples/expected/expectedTimestampsValidInvalid.txt create mode 100644 test/MetadataExtraction/examples/inputs/timestampsMultipleTimelinesInvalid.xml create mode 100644 test/MetadataExtraction/examples/inputs/timestampsValidInvalid.xml diff --git a/MetadataExtraction/speechParser2.py b/MetadataExtraction/speechParser2.py index 0ae4f4c..c0caacd 100644 --- a/MetadataExtraction/speechParser2.py +++ b/MetadataExtraction/speechParser2.py @@ -77,7 +77,7 @@ def __transformFileToCSV(self, transformation, file): with open(transformation[1], "wb") as f: f.write(result) - def __processSpeechesCSV(self): + def __processSpeechesCSV(self, invalid_speeches): result = defaultdict() timestamps_info = self.__processTimestampsCSV() current_speech = 0 @@ -93,16 +93,69 @@ def __processSpeechesCSV(self): row["personID"], row["date"]) - if (len(timestamps_info) > 0): - utterance.loadTimestampsInfo(timestamps_info[current_speech]) - - if (not row["personID"] in result): - result[row["personID"]] = [utterance] - else: - result[row["personID"]].append(utterance) + if row['ID'] not in invalid_speeches: + if (len(timestamps_info) > 0): + utterance.loadTimestampsInfo(timestamps_info[current_speech]) + if (not row["personID"] in result): + result[row["personID"]] = [utterance] + else: + result[row["personID"]].append(utterance) + current_speech += 1 return result + def __validateData(self): + """ + Method for validating speech data and finding speeches (so far, later maybe just sentences) + with malformed timelines. + """ + + valid_speeches = [] + invalid_speeches = [] + with open(self.transformations[1][1], 'r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + rows = list(reader) + current_speech = None + current_timeline = None + intervals = [] + times = [] + valid = True + for row in rows: + if row['Type'] == 'S': + if current_speech == None: + current_speech = row['Begin'] + else: + if len(times) <= 1: + if all(x <= y for x,y in zip(intervals, intervals[1:])): + valid_speeches.append(current_speech) + else: + invalid_speeches.append(current_speech) + else: + if len(intervals) > 0: + valid = all(x <= y for x, y in zip(intervals, intervals[1:])) + if valid: + valid_speeches.append(current_speech) + else: + invalid_speeches.append(current_speech) + intervals = [] + times = [] + current_speech = row['Begin'] + elif row['Type'] == 'T': + + if (row['Time'] != current_timeline) and (row['Time'] != ''): + current_timeline = row['Time'] + times.append(current_timeline) + + valid = all(x <= y for x,y in zip(intervals, intervals[1:])) + + intervals = [] + + if (row['Begin'] and row['End']): + intervals.append(row['Begin']) + intervals.append(row['End']) + + return invalid_speeches + def __processTimestampsCSV(self): results = [] with open(self.transformations[1][1], 'r', encoding="utf-8") as csvfile: @@ -171,9 +224,13 @@ def __get_total_duration_ms(self, speech_timestamps): def pipeline(self, file): + invalid = [] for transformation in self.transformations: self.__transformFileToCSV(transformation, file) - result = self.__processSpeechesCSV() + for invalid_speech in self.__validateData(): + invalid.append(invalid_speech) + + result = self.__processSpeechesCSV(invalid) return result def main(args): diff --git a/test/MetadataExtraction/examples/actual/actualTimestampsMultipleTimelinesInvalid.txt b/test/MetadataExtraction/examples/actual/actualTimestampsMultipleTimelinesInvalid.txt new file mode 100644 index 0000000..e69de29 diff --git a/test/MetadataExtraction/examples/actual/actualTimestampsValidInvalid.txt b/test/MetadataExtraction/examples/actual/actualTimestampsValidInvalid.txt new file mode 100644 index 0000000..c41d616 --- /dev/null +++ b/test/MetadataExtraction/examples/actual/actualTimestampsValidInvalid.txt @@ -0,0 +1,15 @@ +---SPEECH--- +ID: timestampsValidInvalid.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 2 +total duration: 1040.0 +total spoken: 700.0 +time silent: 340.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 \ No newline at end of file diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelinesInvalid.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelinesInvalid.txt new file mode 100644 index 0000000..139597f --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelinesInvalid.txt @@ -0,0 +1,2 @@ + + diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsValidInvalid.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsValidInvalid.txt new file mode 100644 index 0000000..72d8f5b --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsValidInvalid.txt @@ -0,0 +1,16 @@ +---SPEECH--- +ID: timestampsValidInvalid.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 2 +total duration: 1040.0 +total spoken: 700.0 +time silent: 340.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 + diff --git a/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelinesInvalid.xml b/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelinesInvalid.xml new file mode 100644 index 0000000..fea74fe --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelinesInvalid.xml @@ -0,0 +1,255 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendation, but the data are slightly extended in several ways. Texts contain links to original voting and prints. Except for the 4-class named entities classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. And morphological annotation contains pdt tagsed besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + Ipsum + + + dolor + + + sit + + + amet + + , + + consecteur + + + adipiscing + + + elit + + + + sed + + + do + + + eiusmod + + + tempor + + + incididunt + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/examples/inputs/timestampsValidInvalid.xml b/test/MetadataExtraction/examples/inputs/timestampsValidInvalid.xml new file mode 100644 index 0000000..6002aeb --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsValidInvalid.xml @@ -0,0 +1,385 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendation, but the data are slightly extended in several ways. Texts contain links to original voting and prints. Except for the 4-class named entities classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. And morphological annotation contains pdt tagsed besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + + Ipsum + + + dolor + + + + sit + + + amet + + , + + consecteur + + + + adipiscing + + + + elit + + + + + + sed + + + do + + + eiusmod + + + tempor + + + incididunt + + + + + + + + + Lorem + + Ipsum + + dolor + + + sit + + + + amet + + + , + + consecteur + + + adipiscing + + + + elit + + + + + sed + + + do + + + eiusmod + + + tempor + + + + incididunt + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/tester.py b/test/MetadataExtraction/tester.py index af8e90b..f0e7708 100644 --- a/test/MetadataExtraction/tester.py +++ b/test/MetadataExtraction/tester.py @@ -13,6 +13,8 @@ ("NAMES AND DATES", "--file=examples/inputs/timestampsNamesAndDates.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsNamesAndDates.txt"), ("MISSING ANCHORS", "--file=examples/inputs/timestampsMissngAnchors.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMissngAnchors.txt"), ("MULTIPLE SPEECHES", "--file=examples/inputs/timestampsMultipleSpeeches.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMultipleSpeeces.txt"), + ("MESSY SPEECH", "--file=examples/inputs/timestampsValidInvalid.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsValidInvalid.txt"), + ("MESSY SPEECH MULTIPLE TIMELINES", "--file=examples/inputs/timestampsMultipleTimelinesInvalid.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMultipleTimelinesInvalid.txt"), ] test_cases_persons = [