Skip to content

Commit

Permalink
Add MT samples (#716).
Browse files Browse the repository at this point in the history
  • Loading branch information
TomazErjavec committed Jul 24, 2023
1 parent 6d2d038 commit 7d358e2
Show file tree
Hide file tree
Showing 326 changed files with 633,369 additions and 1 deletion.
201 changes: 201 additions & 0 deletions Data/ParlaMint-AT/ParlaMint-AT-en.ana.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
<?xml version="1.0" encoding="UTF-8"?>
<teiCorpus xmlns="http://www.tei-c.org/ns/1.0"
xml:lang="en"
xml:id="ParlaMint-AT-en.ana"
corresp="../ParlaMint-AT.TEI.ana/ParlaMint-AT.ana.xml">
<teiHeader>
<fileDesc>
<titleStmt>
<title xml:lang="de" type="main">Österreichisches Parlamentskorpus ParlaMint-AT-en [ParlaMint-en.ana SAMPLE]</title>
<title xml:lang="en" type="main">Austrian parliamentary corpus ParlaMint-AT-en [ParlaMint-en.ana SAMPLE]</title>
<title xml:lang="de" type="sub">Stenographische Protokolle der Plenarsitzungen des Österreichischen Nationalrats, XX. Gesetzgebungsperiode - XXVII. Gesetzgebungsperiode (1996 - 2022)</title>
<title xml:lang="en" type="sub">Shorthand records of the plenary sittings of the National Council of the Austrian parliament, terms 20 - 27 (1996 - 2022)</title>
<meeting n="27" corresp="#NR" ana="#parla.lower #parla.term #NR.XXVII"/>
<meeting n="26" corresp="#NR" ana="#parla.lower #parla.term #NR.XXVI"/>
<meeting n="25" corresp="#NR" ana="#parla.lower #parla.term #NR.XXV"/>
<meeting n="24" corresp="#NR" ana="#parla.lower #parla.term #NR.XXIV"/>
<meeting n="23" corresp="#NR" ana="#parla.lower #parla.term #NR.XXIII"/>
<meeting n="22" corresp="#NR" ana="#parla.lower #parla.term #NR.XXII"/>
<meeting n="21" corresp="#NR" ana="#parla.lower #parla.term #NR.XXI"/>
<meeting n="20" corresp="#NR" ana="#parla.lower #parla.term #NR.XX"/>
<respStmt>
<persName ref="https://orcid.org/0000-0002-8111-5584">Hannes Pirker</persName>
<persName ref="https://orcid.org/0000-0003-2436-0361">Daniel Schopper</persName>
<persName ref="https://orcid.org/0000-0002-1631-4560">Tanja Wissik</persName>
<resp xml:lang="de">Projektplanung und Methode</resp>
<resp xml:lang="en">Project set-up and methodology</resp>
</respStmt>
<respStmt>
<persName>Hannes Pirker</persName>
<resp xml:lang="de">Datenbeschaffung, Korpuskodierung in TEI und automatische linguistische Annotation</resp>
<resp xml:lang="en">Data retrieval, TEI corpus encoding and automatic linguistic annotation</resp>
</respStmt>
<respStmt>
<persName>Daniel Schopper</persName>
<resp xml:lang="de">XSLT Transformationen</resp>
<resp xml:lang="en">XSLT transformations</resp>
</respStmt>
<respStmt>
<persName>Martin Kirnbauer</persName>
<resp xml:lang="de">Einige der manuellen Korrekturen</resp>
<resp xml:lang="en">Some of the manual curation</resp>
</respStmt>
<respStmt>
<persName>Tanja Wissik</persName>
<resp xml:lang="de">Metadaten und Übersetzung</resp>
<resp xml:lang="en">Metadata and translation</resp>
</respStmt>
<respStmt>
<persName>Taja Kuzman</persName>
<persName>Nikola Ljubešić</persName>
<resp xml:lang="en">Machine translation to English and linguistic analysis of the translation</resp>
</respStmt>
<funder>
<orgName xml:lang="de">CLARIN-ERIC</orgName>
<orgName xml:lang="en">CLARIN-ERIC (Common Language Resources and Technology Infrastructure—European Research Infrastructure Consortium)</orgName>
<ref target="https://www.clarin.eu/">www.clarin.eu</ref>
</funder>
<funder>
<orgName xml:lang="de">ÖAW (Österreichische Akademie der Wissenschaften)</orgName>
<orgName xml:lang="en">ÖAW (Austrian Academy of Sciences)</orgName>
<ref target="https://www.oeaw.ac.at/">www.oeaw.ac.at</ref>
</funder>
</titleStmt>
<editionStmt>
<edition>3.0</edition>
</editionStmt>
<extent><!--These numbers do not reflect the size of the sample!-->
<measure unit="speeches" quantity="227991" xml:lang="en">227,991 speeches</measure>
<measure unit="words" quantity="63932213" xml:lang="en">63,932,213 words</measure>
</extent>
<publicationStmt>
<publisher>
<orgName xml:lang="de">Die CLARIN Forschungsinfrastruktur</orgName>
<orgName xml:lang="en">The CLARIN research infrastructure</orgName>
<ref target="https://www.clarin.eu/">www.clarin.eu</ref>
</publisher>
<idno type="URI" subtype="handle">http://hdl.handle.net/11356/1810</idno>
<availability status="free">
<licence>http://creativecommons.org/licenses/by/4.0/</licence>
<p xml:lang="de">Dieses Werk ist lizenziert unter der <ref target="http://creativecommons.org/licenses/by/4.0/">Creative Commons Namensnennung 4.0 International Lizenz (CC BY 4.0)</ref>.</p>
<p xml:lang="en">This work is licensed under the <ref target="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ref>.</p>
</availability>
<date when="2023-06-24">2023-06-24</date>
</publicationStmt>
<sourceDesc>
<bibl>
<title type="main" xml:lang="de">Stenographische Protokolle der Plenarsitzungen des Nationalrats der Republik Österreich</title>
<title type="main" xml:lang="en">Shorthand records of the plenary sittings of the National Council of the Austrian parliament</title>
<publisher>Parlamentsdirektion</publisher>
<idno type="URI" subtype="parliament">https://www.parlament.gv.at/PAKT/STPROT</idno>
<date from="1996-01-15" to="2022-05-19">15.01.1996 - 19.05.2022</date>
</bibl>
</sourceDesc>
</fileDesc>
<encodingDesc>
<projectDesc>
<p xml:lang="en">
<ref target="https://www.clarin.eu/content/parlamint">ParlaMint</ref> is a project that aims to (1) create a multilingual set of comparable corpora of parliamentary proceedings uniformly encoded according to the <ref target="https://clarin-eric.github.io/ParlaMint/">ParlaMint encoding guidelines</ref>, covering the period from 2015 to mid-2022; (2) add linguistic annotations to the corpora and machine-translate them to English; (3) make the corpora available through concordancers; and (4) build use cases in Political Sciences and Digital Humanities based on the corpus data.</p>
<p xml:lang="de">
<ref target="https://www.clarin.eu/content/parlamint">ParlaMint</ref>
</p>
</projectDesc>
<editorialDecl>
<correction>
<p>No correction of source texts was performed.</p>
</correction>
<normalization>
<p>Text has not been normalised, except for spacing. Printed matter quoted in the protocols was removed.</p>
</normalization>
<hyphenation>
<p>No end-of-line hyphens were present in the source.</p>
</hyphenation>
<quotation>
<p>Quotation marks have been left in the text and are not explicitly marked up.</p>
</quotation>
<segmentation>
<p>The texts are segmented into utterances (speeches) and segments (corresponding to paragraphs in the source transcription).</p>
</segmentation>
</editorialDecl>
<tagsDecl><!--These numbers do not reflect the size of the sample!-->
<namespace name="http://www.tei-c.org/ns/1.0">
<tagUsage gi="body" occurs="1197"/>
<tagUsage gi="desc" occurs="346176"/>
<tagUsage gi="div" occurs="1197"/>
<tagUsage gi="gap" occurs="14864"/>
<tagUsage gi="kinesic" occurs="248593"/>
<tagUsage gi="name" occurs="2100135"/>
<tagUsage gi="note" occurs="668625"/>
<tagUsage gi="pb" occurs="116531"/>
<tagUsage gi="pc" occurs="9280443"/>
<tagUsage gi="s" occurs="3919672"/>
<tagUsage gi="seg" occurs="662401"/>
<tagUsage gi="text" occurs="1197"/>
<tagUsage gi="u" occurs="227991"/>
<tagUsage gi="vocal" occurs="82719"/>
<tagUsage gi="w" occurs="63932213"/>
</namespace>
</tagsDecl>
<classDecl>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-taxonomy-parla.legislature.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-taxonomy-speaker_types.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-taxonomy-subcorpus.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-taxonomy-NER.ana.xml"/>
</classDecl>
<appInfo>
<application ident="EasyNMT" version="2.0">
<label>EasyNMT (OPUS-MT model)</label>
<desc>Translation to English done with EasyNMT (<ref target="https://github.com/UKPLab/EasyNMT">https://github.com/UKPLab/EasyNMT</ref>) with OPUS-MT model gmw (<ref target="https://github.com/Helsinki-NLP/Opus-MT">https://github.com/Helsinki-NLP/Opus-MT</ref>)</desc>
</application>
<application ident="Stanza" version="1.5">
<label>Stanza</label>
<desc>Tokenisation, PoS tagging, lemmatization, and NER annotation done with Stanza (<ref target="https://stanfordnlp.github.io/stanza/">https://stanfordnlp.github.io/stanza/</ref>) with the model for English. For NER the conll03 model with 4 NE classes was used.</desc>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<settingDesc>
<setting>
<name type="city" xml:lang="de">Wien</name>
<name type="city" xml:lang="en">Vienna</name>
<name type="country" xml:lang="de" key="AT">Österreich</name>
<name type="country" xml:lang="en" key="AT">Austria</name>
<date from="1996-01-15" to="2022-05-19"/>
</setting>
</settingDesc>
<textClass>
<catRef scheme="#ParlaMint-taxonomy-parla.legislature"
target="#parla.bi #parla.lower"/>
</textClass>
<particDesc>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="ParlaMint-AT-listOrg.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-AT-listPerson.xml"/>
</particDesc>
<langUsage>
<language ident="de" xml:lang="de">Deutsch</language>
<language ident="de" xml:lang="en">German</language>
<language ident="en" xml:lang="de">Englisch</language>
<language ident="en" xml:lang="en">English</language>
</langUsage>
</profileDesc>
<revisionDesc>
<change when="2023-06-24">
<name>Tomaž Erjavec</name>: Made sample.</change>
<change when="2023-06-24">parlamint2release script: Fix some identifiable errors for the release.</change>
<change when="2023-06-23">
<name>Tomaž Erjavec</name>: Generate TEI version of MTed corpus.</change>
<change when="2023-06-24">parlamint-add-common-content script: Adding common content.</change>
</revisionDesc>
</teiHeader>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-AT-en_2005-03-31-022-XXII-NRSITZ-00100.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-AT-en_2014-09-24-025-XXV-NRSITZ-00042.ana.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude"
href="ParlaMint-AT-en_2022-05-19-027-XXVII-NRSITZ-00159.ana.xml"/>
</teiCorpus>
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID Title Date Body Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_MP Speaker_Minister Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth
ParlaMint-AT_2005-03-31-022-XXII-NRSITZ-00100_d7e355 Sitting Number 100, Legislative period XXII, Thursday, 31. March 2005 2005-03-31 Lower house 22 100 Reference Chairperson MP - FPÖ Freiheitlicher Parlamentsklub Prinzhorn, Thomas M 1943
ParlaMint-AT_2005-03-31-022-XXII-NRSITZ-00100_d7e386 Sitting Number 100, Legislative period XXII, Thursday, 31. March 2005 2005-03-31 Lower house 22 100 Reference Chairperson MP - FPÖ Freiheitlicher Parlamentsklub Prinzhorn, Thomas M 1943
Loading

0 comments on commit 7d358e2

Please sign in to comment.