From b4e37a06cff2f20e74eaed8866ff236e777b2e14 Mon Sep 17 00:00:00 2001 From: Matej Date: Tue, 22 Oct 2024 12:36:34 +0200 Subject: [PATCH] Add one more forgotten test + update tester #9 --- .../expectedTimestampsMultipleSpeeces.txt | 31 ++ .../inputs/timestampsMultipleSpeeches.xml | 385 ++++++++++++++++++ test/MetadataExtraction/tester.py | 11 +- 3 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 test/MetadataExtraction/examples/expected/expectedTimestampsMultipleSpeeces.txt create mode 100644 test/MetadataExtraction/examples/inputs/timestampsMultipleSpeeches.xml diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleSpeeces.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleSpeeces.txt new file mode 100644 index 0000000..81048a6 --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleSpeeces.txt @@ -0,0 +1,31 @@ +---SPEECH--- +ID: timestampsMultipleSpeeches.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 2 +total duration: 1040.0 +total spoken: 700.0 +time silent: 340.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 +---SPEECH--- +ID: timestampsMultipleSpeeches.u2 +author: #personY +role: #roleY +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 2 +total duration: 1040.0 +total spoken: 650.0 +time silent: 290.0 +time unknown: 100.0 +unaligned tokens: 1 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 + diff --git a/test/MetadataExtraction/examples/inputs/timestampsMultipleSpeeches.xml b/test/MetadataExtraction/examples/inputs/timestampsMultipleSpeeches.xml new file mode 100644 index 0000000..d3fc7e5 --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsMultipleSpeeches.xml @@ -0,0 +1,385 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 
ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendation, but the data are slightly extended in several ways. Texts contain links to original voting and prints. In addition to the 4-class named entity classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. The morphological annotation contains PDT tags besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + + Ipsum + + + dolor + + + + sit + + + amet + + , + + consecteur + + + + adipiscing + + + + elit + + + + + + sed + + + do + + + eiusmod + + + tempor + + + incididunt + + + + + + + + + Lorem + + Ipsum + + dolor + + + sit + + + + amet + + + , + + consecteur + + + adipiscing + + + + elit + + + + + sed + + + do + + + eiusmod + + + tempor + + + + incididunt + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/tester.py b/test/MetadataExtraction/tester.py index b6b472b..af8e90b 100644 --- a/test/MetadataExtraction/tester.py +++ b/test/MetadataExtraction/tester.py @@ -8,13 +8,11 @@ args_parser.add_argument("--p",action="store_true", help="Set this flag to test persons") test_cases_speeches = [ - # ("REAL","--file=examples/inputs/real.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedReal.txt"), - # ("MISSING TIMESTAMPS","--file=examples/inputs/missingTimestamps.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMissingTimestamps.txt"), - # ("ENCAPSULATED WORDS","--file=examples/inputs/encapsulatedWords.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedEncapsulatedWords.txt"), - # ("MULTIPLE SPEECHES", "--file=examples/inputs/multipleSpeeches.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMultipleSpeeches.txt"), ("SIMPLE TIMESTAMPS", "--file=examples/inputs/timestampsSimple.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsSimple.txt"), ("MULTIPLE TIMELINES", "--file=examples/inputs/timestampsMultipleTimelines.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMultipleTimelines.txt"), ("NAMES AND DATES", "--file=examples/inputs/timestampsNamesAndDates.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsNamesAndDates.txt"), + ("MISSING ANCHORS", "--file=examples/inputs/timestampsMissngAnchors.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMissngAnchors.txt"), + ("MULTIPLE SPEECHES", "--file=examples/inputs/timestampsMultipleSpeeches.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMultipleSpeeces.txt"), ] test_cases_persons = [ @@ -36,7 +34,10 @@ def test(what, test_cases): print(error) with open(expected_output, 'r') as f: expected = f.read().strip() - + + with open(f"examples/actual/actual{expected_output[26:]}", 'w') as f: + 
f.write(actual_output) + if (actual_output == expected): print(f"Test {test_name}: PASSED!") else: