diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelines.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelines.txt new file mode 100644 index 0000000..0a9cfb1 --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsMultipleTimelines.txt @@ -0,0 +1,16 @@ +---SPEECH--- +ID: timestampsMultipleTimelines.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 0 +total duration: 1030.0 +total spoken: 700.0 +time silent: 330.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T15:09:00 + diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsNamesAndDates.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsNamesAndDates.txt new file mode 100644 index 0000000..8e8e67f --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsNamesAndDates.txt @@ -0,0 +1,16 @@ +---SPEECH--- +ID: timestampsNamesAndDates.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 2 +total duration: 1040.0 +total spoken: 700.0 +time silent: 340.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 + diff --git a/test/MetadataExtraction/examples/expected/expectedTimestampsSimple.txt b/test/MetadataExtraction/examples/expected/expectedTimestampsSimple.txt new file mode 100644 index 0000000..10b0a41 --- /dev/null +++ b/test/MetadataExtraction/examples/expected/expectedTimestampsSimple.txt @@ -0,0 +1,16 @@ +---SPEECH--- +ID: timestampsSimple.u1 +author: #personX +role: #roleX +when: 2013-11-25 +tokens: 13 +sentences: 1 +named entity refferences: 0 +total duration: 1040.0 +total spoken: 700.0 +time silent: 340.0 +time unknown: 0 +unaligned tokens: 0 +earliest timeline: 2024-10-21T14:49:00 +latest timeline: 2024-10-21T14:49:00 + diff --git 
a/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelines.xml b/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelines.xml new file mode 100644 index 0000000..24347a9 --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsMultipleTimelines.xml @@ -0,0 +1,255 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendations, but the data are slightly extended in several ways. Texts contain links to original voting and prints. In addition to the 4-class named-entity classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. The morphological annotation also contains PDT tags besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + Ipsum + + + dolor + + + sit + + + amet + + , + + consecteur + + + adipiscing + + + elit + + + + sed + + + do + + + eiusmod + + + tempor + + + incididunt + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/examples/inputs/timestampsNamesAndDates.xml b/test/MetadataExtraction/examples/inputs/timestampsNamesAndDates.xml new file mode 100644 index 0000000..14c9f60 --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsNamesAndDates.xml @@ -0,0 +1,254 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendations, but the data are slightly extended in several ways. Texts contain links to original voting and prints. In addition to the 4-class named-entity classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. The morphological annotation also contains PDT tags besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + Ipsum + + + dolor + + + + sit + + + amet + + + , + + + consecteur + + + adipiscing + + + + elit + + + + sed + + + do + + + eiusmod + + + + tempor + + + incididunt + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/examples/inputs/timestampsSimple.xml b/test/MetadataExtraction/examples/inputs/timestampsSimple.xml new file mode 100644 index 0000000..0f096d5 --- /dev/null +++ b/test/MetadataExtraction/examples/inputs/timestampsSimple.xml @@ -0,0 +1,248 @@ + + + + + + Český parlamentní korpus, Poslanecká sněmovna, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Czech parliamentary corpus, Chamber of Deputies, 2013-11-25 ps2013-001-01-000-999 [ParCzech.ana] + Parlament České republiky, Poslanecká sněmovna, 2013-11-25, Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + Parliament of the Czech Republic, Chamber of Deputies, 2013-11-25 + Začátek schůze Poslanecké sněmovny 25. listopadu 2013 ve 14.05 hodin Přítomno: 199 poslanců + ps2013 + ps2013/001 + ps2013/001/01 + ps2013/001/999 + + Matyáš Kopp + Data retrieval + TEI XML corpus encoding + Linguistic annotation + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + + + + 4.0 + + + 1 promluv + 1 speeches + 173 slov + 173 words + + + + LINDAT/CLARIAH-CZ: Digitální výzkumná infrastruktura pro jazykové technologie, umění a humanitní vědy + LINDAT/CLARIAH-CZ: Digital Research Infrastructure for Language Technologies, Arts and Humanities + www.lindat.cz + + http://hdl.handle.net/11234/1-5360 + + https://creativecommons.org/publicdomain/zero/1.0/ +

This work is licensed under the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication.

+
+ 2024-01-26 +
+ + + Parlament České republiky, Poslanecká sněmovna + Parliament of the Czech Republic, Chamber of Deputies + https://www.psp.cz/eknih/2013ps/stenprot/001schuz/s001001.htm + 25.11.2013 + + + + + + + +
+ + +

+ ParCzech is a project on compiling Czech parliamentary data into annotated corpora. It mostly follows the ParlaMint project's recommendations, but the data are slightly extended in several ways. Texts contain links to original voting and prints. In addition to the 4-class named-entity classification, it also includes a more detailed CNEC hierarchical classification. The text in the annotated version is aligned with audio on the token level. The morphological annotation also contains PDT tags besides UD PoS and features.

+
+ + + + + + + + + + + + + + + + + + + + + + +
+ + + + Parlament České republiky - Poslanecká sněmovna + Sněmovní 176/4 + Praha + Czech Republic + 2013-11-25 + + + +
+ + +
+ + + + + Lorem + + + Ipsum + + + dolor + + + sit + + + amet + + , + + consecteur + + + adipiscing + + + elit + + + + sed + + + do + + + eiusmod + + + tempor + + + incididunt + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
diff --git a/test/MetadataExtraction/tester.py b/test/MetadataExtraction/tester.py index 7ee6496..b6b472b 100644 --- a/test/MetadataExtraction/tester.py +++ b/test/MetadataExtraction/tester.py @@ -8,11 +8,14 @@ args_parser.add_argument("--p",action="store_true", help="Set this flag to test persons") test_cases_speeches = [ - ("REAL","--file=examples/inputs/real.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedReal.txt"), - ("MISSING TIMESTAMPS","--file=examples/inputs/missingTimestamps.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMissingTimestamps.txt"), - ("ENCAPSULATED WORDS","--file=examples/inputs/encapsulatedWords.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedEncapsulatedWords.txt"), - ("MULTIPLE SPEECHES", "--file=examples/inputs/multipleSpeeches.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMultipleSpeeches.txt"), -] + # ("REAL","--file=examples/inputs/real.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedReal.txt"), + # ("MISSING TIMESTAMPS","--file=examples/inputs/missingTimestamps.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMissingTimestamps.txt"), + # ("ENCAPSULATED WORDS","--file=examples/inputs/encapsulatedWords.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedEncapsulatedWords.txt"), + # ("MULTIPLE SPEECHES", "--file=examples/inputs/multipleSpeeches.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedMultipleSpeeches.txt"), + ("SIMPLE TIMESTAMPS", "--file=examples/inputs/timestampsSimple.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsSimple.txt"), + ("MULTIPLE TIMELINES", "--file=examples/inputs/timestampsMultipleTimelines.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedTimestampsMultipleTimelines.txt"), + ("NAMES AND DATES", "--file=examples/inputs/timestampsNamesAndDates.xml", "--wd=../../MetadataExtraction", 
"examples/expected/expectedTimestampsNamesAndDates.txt"), + ] test_cases_persons = [ ("NO ISSUES", "--file=examples/inputs/noIssuesPerson.xml", "--wd=../../MetadataExtraction", "examples/expected/expectedNoIssuesPersons.txt"), @@ -29,7 +32,8 @@ def test(what, test_cases): process = subprocess.run([my_venv, what, test_file, test_wd], capture_output=True, text=True) actual_output = process.stdout.strip() - + error = process.stderr.strip() + print(error) with open(expected_output, 'r') as f: expected = f.read().strip()