From 69aea502e0f360cca24808e3bc525175f3072d3c Mon Sep 17 00:00:00 2001 From: Colin Batchelor Date: Mon, 16 Dec 2024 22:40:06 +0000 Subject: [PATCH] validate_gd_extras.py: added check for a csubj relation where expected, for example to a head word like 'urrainn' or 'toil'. Updated dev and test to reflect this. More detail in issue #43. --- gd_arcosg-ud-dev.conllu | 49 ++++++++++++++-------------- gd_arcosg-ud-test.conllu | 48 ++++++++++++++------------- not-to-release/validate_gd_extras.py | 41 +++++++++++++++++++++-- 3 files changed, 88 insertions(+), 50 deletions(-) diff --git a/gd_arcosg-ud-dev.conllu b/gd_arcosg-ud-dev.conllu index 58dee33..c5c0c76 100644 --- a/gd_arcosg-ud-dev.conllu +++ b/gd_arcosg-ud-dev.conllu @@ -1118,7 +1118,7 @@ 3 iongantach iongantach ADJ Ap CleftType=Adj 0 root _ _ 4 mar mar SCONJ Cs _ 6 mark _ _ 5 a a PART Q-r PartType=Vb|PronType=Rel 6 mark:prt _ _ -6 chuala cluinn VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 3 csubj:cleft _ _ +6 chuala cluinn VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 3 csubj:cop _ _ # sent_id = c03_088 # speaker = [1] @@ -1602,7 +1602,7 @@ 14 gur is AUX Wpdia Polarity=Aff|Tense=Pres 15 cop _ _ 15 dòcha dòcha NOUN Uf _ 9 ccomp _ _ 16 gun gu PART Qa PartType=Cmpl 17 mark:prt _ _ -17 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 15 acl _ _ +17 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 15 csubj:cop _ _ 18 feadhainn feadhainn NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 17 nsubj _ _ 19 eile eile ADJ Aq-sfn Case=Nom|Gender=Fem|Number=Sing 18 amod _ _ 20-21 ann _ _ _ _ _ _ _ _ @@ -1612,7 +1612,7 @@ 23 b’ is AUX Ws Tense=Past 24 cop _ _ 24 fhearr math ADJ Apc CleftType=Adj|Degree=Cmp,Sup 18 acl:relcl _ _ 25 a a PART Ug PartType=Inf 26 mark:prt _ _ -26 chòrdadh còrd VERB V-h Mood=Ind|VerbForm=Fin 24 csubj:cleft _ _ +26 chòrdadh còrd VERB V-h Mood=Ind|VerbForm=Fin 24 csubj:cop _ _ 27 ri ri ADP Sp _ 28 case _ _ 28 gillean gille NOUN Ncpmd Case=Dat|Gender=Masc|Number=Plur 26 obl _ _ 29 ‘s 's CCONJ Cc _ 30 cc _ _ @@ -1773,7 +1773,7 @@ 7 trì trì NUM Mc NumForm=Word|NumType=Card 8 nummod _ _ 8 seachdainnean seachdainn NOUN Ncpfn Case=Nom|Gender=Fem|Number=Plur 4 ccomp _ _ 9 nach nach PART Qn PartType=Cmpl|Polarity=Neg 10 mark:prt _ _ -10 bi bi VERB V-f--d Mood=Ind|Tense=Fut|VerbForm=Fin 8 ccomp _ _ +10 bi bi VERB V-f--d Mood=Ind|Tense=Fut|VerbForm=Fin 8 csubj:cleft _ _ 11 an an DET Tds Definite=Def|Number=Sing|PronType=Art 12 det _ _ 12 date date NOUN Xfe Foreign=Yes 10 nsubj _ _ 13-14 orra _ _ _ _ _ _ _ _ @@ -1968,7 +1968,7 @@ 4 do do ADP Sp _ 5 case _ _ 5 sinn sinn PRON Pp1p Number=Plur|Person=1|PronType=Prs 3 nmod _ _ 6 a a DET Dp3sm Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 7 obj _ _ -7 fhàgail fàg NOUN Nv VerbForm=Inf 3 xcomp _ _ +7 fhàgail fàg NOUN Nv VerbForm=Inf 3 csubj:cop _ _ 8 dhan do ADP Sp _ 10 case _ _ 9 a' an DET Tds Definite=Def|Number=Sing|PronType=Art 10 det _ _ 10 weekend weekend NOUN Xfe Foreign=Yes 7 obl _ _ @@ -3172,7 +3172,7 @@ 12 mhòr mòr ADJ Ap _ 6 xcomp:pred _ _ 13 nach is AUX Wpdin Polarity=Neg|Tense=Pres 15 cop _ _ 14 - - PUNCT Fb _ 15 punct _ _ -15 h-ì ì PRON Pp3sf Gender=Fem|Number=Sing|Person=3|PronType=Prs 12 ccomp _ SpaceAfter=No +15 h-ì ì PRON Pp3sf Gender=Fem|Number=Sing|Person=3|PronType=Prs 12 csubj:cop _ SpaceAfter=No 16 ... ... PUNCT Fb _ 17 punct _ _ 17 òirleach òirleach NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 15 nsubj _ _ 18 eile eile ADJ Aq-sfn Case=Nom|Gender=Fem|Number=Sing 17 amod _ _ @@ -3542,7 +3542,7 @@ 9 as is AUX Wpr PronType=Rel|Tense=Pres 10 cop _ _ 10 coireach coireach ADJ Ap _ 8 acl:relcl _ _ 11 gun gu PART Qa PartType=Cmpl 12 mark:prt _ _ -12 thog tog VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 10 ccomp _ _ +12 thog tog VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 10 csubj:cop _ _ 13 Teàrlach Teàrlach PROPN Nn-mn Case=Nom|Gender=Masc 12 nsubj _ _ 14 is is CCONJ Cc _ 15 cc _ _ 15 Aonghas Aonghas PROPN Nn-mn Case=Nom|Gender=Masc 13 conj _ _ @@ -3903,7 +3903,7 @@ 24 Eilean eil NOUN Ncpmg Case=Gen|Gender=Masc|Number=Plur 22 nmod _ _ 25 - - PUNCT Fb _ 14 punct _ _ 26 as is AUX Wpr PronType=Rel|Tense=Pres 27 cop _ _ -27 fhaide fada ADJ Apc Degree=Cmp,Sup 14 csubj:cop _ _ +27 fhaide fada ADJ Apc Degree=Cmp,Sup 14 csubj:cleft _ _ 28-29 leam _ _ _ _ _ _ _ SpaceAfter=No 28 le le ADP Sp _ 29 case _ _ 29 mi mi PRON Pp1s Number=Sing|Person=1|PronType=Prs 27 obl _ _ @@ -4240,7 +4240,7 @@ 12 iomagain iomagain NOUN Ncsfd Case=Dat|Gender=Fem|Number=Sing 9 nmod _ _ 13 seo seo DET Dd PronType=Art 12 det _ _ 14 a a PART Q-r PartType=Vb|PronType=Rel 15 obl _ _ -15 bha bi VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 12 acl:relcl _ _ +15 bha bi VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 9 csubj:cleft _ _ 16 aobhar aobhar NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 15 nsubj _ _ 17 nan an DET Tdpmg Case=Gen|Definite=Def|Gender=Masc|Number=Plur|PronType=Art 18 det _ _ 18 Stiùbhartach stiùbhartach NOUN Ncpmg Case=Gen|Gender=Masc|Number=Plur 16 nmod _ _ @@ -4695,7 +4695,7 @@ 6-7 dhomh _ _ _ _ _ _ _ _ 6 do do ADP Sp _ 7 case _ _ 7 mi mi PRON Pp1s Number=Sing|Person=1|PronType=Prs 5 nmod _ _ -8 aideachadh aidich NOUN Nv VerbForm=Vnoun 5 xcomp _ _ +8 aideachadh aidich NOUN Nv VerbForm=Vnoun 5 csubj:cop _ _ 9 cho cho ADV Rg AdvType=Man 10 advmod _ _ 10 mór mór ADJ Ap _ 8 xcomp:pred _ _ 11 's 's CCONJ Cc _ 13 cc _ _ @@ -4937,7 +4937,7 @@ 23 a a PART Q-r PartType=Vb|PronType=Rel 24 nsubj _ _ 24 tha bi VERB V-p Mood=Ind|Tense=Pres|VerbForm=Fin 14 acl:relcl _ _ 25 's is AUX Wp-i Tense=Pres 26 cop _ _ -26 mathaid mathaid NOUN Uf _ 24 parataxis _ _ +26 mathaid mathaid NOUN Uf _ 24 advcl _ _ 27 air air PART Sa _ 28 case _ _ 28 traoghadh traogh NOUN Nv VerbForm=Vnoun 24 xcomp:pred _ _ 29 beagan beagan NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 28 obj _ _ @@ -5563,7 +5563,7 @@ 40 esan e PRON Pp3sm-e Form=Emp|Gender=Masc|Number=Sing|Person=3|PronType=Prs 39 nsubj _ SpaceAfter=No 41 , , PUNCT Fi _ 39 punct _ _ 42 “ “ PUNCT Fq _ 43 punct _ SpaceAfter=No -43 cadal caidil NOUN Nv VerbForm=Vnoun 34 xcomp _ _ +43 cadal caidil NOUN Nv VerbForm=Vnoun 34 csubj:cop _ _ 44 còmhla còmhla ADV Rg AdvType=Man 47 advmod _ _ 45 ris ri ADP Sp _ 47 case _ _ 46 a’ an DET Tdsf Definite=Def|Gender=Fem|Number=Sing|PronType=Art 47 det _ _ @@ -5601,7 +5601,7 @@ 7 fhèin fèin PRON Px PronType=Prs|Reflex=Yes 6 nmod _ _ 8 as is AUX Wpr PronType=Rel|Tense=Pres 9 cop _ _ 9 coltaiche coltach ADJ Apc Degree=Cmp,Sup 6 csubj:cop _ _ -10 cadal caidil NOUN Nv VerbForm=Vnoun 9 xcomp _ _ +10 cadal caidil NOUN Nv VerbForm=Vnoun 9 csubj:cop _ _ 11 còmhla còmhla ADV Rg AdvType=Man 14 advmod _ _ 12 ris ri ADP Sp _ 14 case _ _ 13 a’ an DET Tdsf Definite=Def|Gender=Fem|Number=Sing|PronType=Art 14 det _ _ @@ -7114,6 +7114,7 @@ 30 dhùnadh dùin NOUN Nv VerbForm=Inf 19 xcomp _ SpaceAfter=No 31 . . PUNCT Fe _ 1 punct _ _ +# comment = 2024-12-16: node 14 would normally be preceded by "do" # sent_id = ns06_013 # text = Thubhairt Annabel Goldie, as leth nan Toraidhean, gum bu choir a’ chùis a bhith air a dhol fa chomhair na Pàrlamaid Albannaich mus deach co-dhùnadh a dhèanamh. 1 Thubhairt tubhairt VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _ @@ -7129,9 +7130,9 @@ 11 bu is AUX Ws Tense=Past 12 cop _ _ 12 choir choir NOUN Uf _ 1 conj _ _ 13 a’ an DET Tdsf Definite=Def|Gender=Fem|Number=Sing|PronType=Art 14 det _ _ -14 chùis cùis NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 12 nsubj _ _ +14 chùis cùis NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 12 nmod _ _ 15 a a PART Ug PartType=Inf 16 mark:prt _ _ -16 bhith bi NOUN Nv VerbForm=Inf 12 xcomp _ _ +16 bhith bi NOUN Nv VerbForm=Inf 12 csubj:cop _ _ 17 air air PART Sa _ 19 case _ _ 18 a a DET Dp3sm Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 19 obj _ _ 19 dhol rach NOUN Nv VerbForm=Inf 16 xcomp:pred _ _ @@ -7800,7 +7801,7 @@ 28 b' is AUX Ws Tense=Past 29 cop _ _ 29 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 18 advcl _ _ 30 gu gu PART Qa PartType=Cmpl 31 mark:prt _ _ -31 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 29 ccomp _ _ +31 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 29 csubj:cop _ _ 32 clann clann NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 31 nsubj _ _ 33-34 aice _ _ _ _ _ _ _ SpaceAfter=No 33 aig aig ADP Sp _ 34 case _ _ @@ -8563,7 +8564,7 @@ 10 's is AUX Wp-i Tense=Pres 11 cop _ _ 11 dòcha dòcha NOUN Uf _ 7 parataxis _ _ 12 gu gu PART Qa PartType=Cmpl 13 mark:prt _ _ -13 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 11 ccomp _ _ +13 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 11 csubj:cop _ _ 14 iad iad PRON Pp3p Number=Plur|Person=3|PronType=Prs 13 nsubj _ _ 15 uair uair NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 13 xcomp:pred _ _ 16 no no CCONJ Cc _ 17 cc _ _ @@ -9281,7 +9282,7 @@ 13 's is AUX Wp-i Tense=Pres 14 cop _ _ 14 dòcha dòcha NOUN Uf _ 4 ccomp _ _ 15 gu gu PART Qa PartType=Cmpl 16 mark:prt _ _ -16 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 14 ccomp _ _ +16 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 14 csubj:cop _ _ 17 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 16 nsubj _ _ 18 a' ag PART Sa _ 19 case _ _ 19 toir toir NOUN Nv VerbForm=Vnoun 16 xcomp:pred _ _ @@ -9468,7 +9469,7 @@ 6 do do ADP Sp _ 7 case _ _ 7 iad iad PRON Pp3p Number=Plur|Person=3|PronType=Prs 5 nmod _ _ 8 ri ri ADP Sp _ 9 case _ _ -9 toir toir NOUN Nv VerbForm=Vnoun 5 xcomp _ _ +9 toir toir NOUN Nv VerbForm=Vnoun 5 csubj:cop _ _ 10 seachad seachad ADV Rg AdvType=Man 9 advmod _ _ 11 faisg faisg ADJ Ap _ 9 xcomp:pred _ _ 12 air air ADP Sp _ 14 case _ _ @@ -11415,13 +11416,13 @@ 21 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 28 advcl _ _ 22 gun gu PART Qa PartType=Cmpl 24 mark:prt _ _ 23 do do PART Q--s Tense=Past 24 mark:prt _ _ -24 dh’fhosgail fosgail VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 21 ccomp _ _ +24 dh’fhosgail fosgail VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 21 csubj:cop _ _ 25 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 24 nsubj _ SpaceAfter=No 26 , , PUNCT Fi _ 28 punct _ _ 27 ’s is AUX Wp-i Tense=Pres 28 cop _ _ 28 dòcha dòcha NOUN Uf _ 9 ccomp _ _ 29 nach nach PART Qn PartType=Cmpl|Polarity=Neg 30 mark:prt _ _ -30 biodh bi VERB V-h--d Mood=Ind|VerbForm=Fin 28 ccomp _ _ +30 biodh bi VERB V-h--d Mood=Ind|VerbForm=Fin 28 csubj:cop _ _ 31 duine duine NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 30 nsubj _ _ 32-33 san _ _ _ _ _ _ _ _ 32 anns an ADP Sp ExtPos=ADP 34 case _ _ @@ -11787,7 +11788,7 @@ 9 a-nis a-nis ADV Rt AdvType=Tim 5 advmod _ _ 10 sùil sùil NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 12 obj _ _ 11 a a PART Ug PartType=Inf 12 mark:prt _ _ -12 thoirt toir NOUN Nv VerbForm=Inf 5 xcomp _ _ +12 thoirt toir NOUN Nv VerbForm=Inf 5 csubj:cop _ _ 13 a-rithist a-rithist ADV Rt AdvType=Tim 12 advmod _ _ 14 air air ADP Sp _ 20 case _ _ 15 9 9 NUM Mn NumForm=Digit|NumType=Card 16 nummod _ _ @@ -13294,7 +13295,7 @@ 20 sin sin PRON Pd PronType=Dem 17 ccomp _ _ 21 a a PART Q-r PartType=Vb|PronType=Rel 23 nsubj _ _ 22 b' is AUX Ws Tense=Past 23 cop _ _ -23 aobhar aobhar NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 20 csubj:cop _ _ +23 aobhar aobhar NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 20 csubj:cleft _ _ 24 gun gu PART Qa PartType=Cmpl 25 mark:prt _ _ 25 deach rach VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 23 ccomp _ _ 26 i i PRON Pp3sf Gender=Fem|Number=Sing|Person=3|PronType=Prs 25 nsubj _ _ @@ -13355,7 +13356,7 @@ 5 cha is AUX Wp-in Polarity=Neg|Tense=Pres 6 cop _ _ 6 mhòr mòr ADJ Ap _ 4 ccomp _ _ 7 nach nach PART Qn PartType=Cmpl|Polarity=Neg 8 mark:prt _ _ -8 deach rach VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 6 ccomp _ _ +8 deach rach VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 6 csubj:cop _ _ 9 ainm ainm NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 8 nsubj _ _ 10 Aitken Aitken PROPN Nn _ 9 nmod _ _ 11 dhan do ADP Sp _ 13 case _ _ diff --git a/gd_arcosg-ud-test.conllu b/gd_arcosg-ud-test.conllu index b691272..b5e1952 100644 --- a/gd_arcosg-ud-test.conllu +++ b/gd_arcosg-ud-test.conllu @@ -1802,7 +1802,7 @@ 9-10 dhuinn _ _ _ _ _ _ _ _ 9 do do ADP Sp _ 10 case _ _ 10 sinn sinn PRON Pp1p Number=Plur|Person=1|PronType=Prs 8 nmod _ _ -11 feuchainn feuch NOUN Nv VerbForm=Vnoun 8 xcomp _ _ +11 feuchainn feuch NOUN Nv VerbForm=Vnoun 8 csubj:cop _ _ 12 suas suas ADV Rs AdvType=Loc 11 advmod _ _ 13 bheil bi VERB V-p--d Mood=Ind|Tense=Pres|VerbForm=Fin 2 parataxis _ _ 14 thu thu PRON Pp2s Number=Sing|Person=2|PronType=Prs 13 nsubj _ _ @@ -2055,7 +2055,7 @@ 18 de de ADP Sp _ 21 case _ _ 19 dh’astar astar NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 21 obj _ _ 20 a a PART Ug PartType=Inf 21 mark:prt _ _ -21 dhèanamh dèan NOUN Nv VerbForm=Inf 15 xcomp _ _ +21 dhèanamh dèan NOUN Nv VerbForm=Inf 15 csubj:cop _ _ 22 mun mun SCONJ Cs _ 23 mark _ _ 23 dorchnaicheadh dorchnaich VERB V-h--d Mood=Ind|VerbForm=Fin 15 advcl _ _ 24 i i PRON Pp3sf Gender=Fem|Number=Sing|Person=3|PronType=Prs 23 nsubj _ SpaceAfter=No @@ -2415,6 +2415,7 @@ 22 thu thu PRON Pp2s Number=Sing|Person=2|PronType=Prs 21 nsubj _ SpaceAfter=No 23 . . PUNCT Fe _ 1 punct _ _ +# comment = 2024-12-16: retagged fuireach to NOUN/Nv but need to check the rest of the expression # sent_id = f03_025 # text = B' fheàrr dhuit cus fuireach thall gu madainn na thu fhéin a chur an cunnart a' greasad dhachaidh. 1 B' is AUX Ws Tense=Past 2 cop _ _ @@ -2422,8 +2423,8 @@ 3-4 dhuit _ _ _ _ _ _ _ _ 3 do do ADP Sp _ 4 case _ _ 4 thu thu PRON Pp2s Number=Sing|Person=2|PronType=Prs 2 nmod _ _ -5 cus cus NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 2 nsubj _ _ -6 fuireach fuireach NOUN Ncsmg Case=Gen|Gender=Masc|Number=Sing 5 nmod _ _ +5 cus cus NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 6 nmod _ _ +6 fuireach fuireach NOUN Nv VerbForm=Vnoun 2 csubj:cop _ _ 7 thall thall ADV Rs AdvType=Loc 6 advmod _ _ 8 gu gu ADP Sp _ 9 case _ _ 9 madainn madainn NOUN Ncsfg Case=Gen|Gender=Fem|Number=Sing 5 nmod _ _ @@ -2431,7 +2432,7 @@ 11 thu thu PRON Pp2s Number=Sing|Person=2|PronType=Prs 14 obj _ _ 12 fhéin féin PRON Px PronType=Prs|Reflex=Yes 11 nmod _ _ 13 a a PART Ug PartType=Inf 14 mark:prt _ _ -14 chur cuir NOUN Nv VerbForm=Inf 2 xcomp _ _ +14 chur cuir NOUN Nv VerbForm=Inf 6 xcomp _ _ 15 an an ADP Sp _ 16 case _ _ 16 cunnart cunnart NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 14 obl _ _ 17 a' ag PART Sa _ 18 case _ _ @@ -2582,7 +2583,7 @@ 18 no no CCONJ Cc _ 19 cc _ _ 19 togail togail NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 17 conj _ _ 20 a a PART Ug PartType=Inf 21 mark:prt _ _ -21 dhèanamh dèan NOUN Nv VerbForm=Inf 14 xcomp _ SpaceAfter=No +21 dhèanamh dèan NOUN Nv VerbForm=Inf 14 csubj:cop _ SpaceAfter=No 22 ; ; PUNCT Fi _ 25 punct _ _ 23 agus agus CCONJ Cc _ 25 cc _ _ 24 cha cha PART Qn PartType=Vb|Polarity=Neg 25 mark:prt _ _ @@ -3582,7 +3583,7 @@ 10 chan is AUX Wp-in Polarity=Neg|Tense=Pres 11 cop _ _ 11 urrainn urrainn NOUN Uf _ 0 root _ _ 12 nach is AUX Wpdin Polarity=Neg|Tense=Pres 13 cop _ _ -13 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 11 ccomp _ _ +13 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 11 csubj:cop _ _ 14 seo seo DET Dd PronType=Art 13 det _ _ 15 an an DET Tdsm Definite=Def|Gender=Masc|Number=Sing|PronType=Art 16 det _ _ 16 t-òran òran NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 13 nsubj _ _ @@ -4071,7 +4072,7 @@ 18 do do ADP Sp _ 19 case _ _ 19 neach neach NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 17 nmod _ _ 20 a a PART Ug PartType=Inf 21 mark:prt _ _ -21 ràdh abair NOUN Nv VerbForm=Inf 17 xcomp _ _ +21 ràdh abair NOUN Nv VerbForm=Inf 17 csubj:cop _ _ 22 gun gu PART Qa PartType=Cmpl 23 mark:prt _ _ 23 robh bi VERB V-s--d Mood=Ind|Tense=Past|VerbForm=Fin 21 ccomp _ _ 24 Sir sir PROPN Xfe Foreign=Yes 23 nsubj _ _ @@ -4171,7 +4172,7 @@ 17 do do ADP Sp _ 18 case _ _ 18 Shir shir NOUN Xfe Foreign=Yes 16 nmod _ _ 19 lain Iain PROPN Nn-md Case=Dat|Gender=Masc|Typo=Yes 18 flat:name _ CorrectForm=Iain -20 tilleadh till NOUN Nv VerbForm=Vnoun 16 xcomp _ _ +20 tilleadh till NOUN Nv VerbForm=Vnoun 16 csubj:cop _ _ 21 a a ADP Sp _ 22 case _ _ 22 Mhuile Muile PROPN Nt NounType=Top 20 obl _ SpaceAfter=No 23 . . PUNCT Fe _ 2 punct _ _ @@ -4563,7 +4564,7 @@ 9 a a DET Dp3sf Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs 10 nmod:poss _ _ 10 bàrdachd bàrdachd NOUN Ncsfd Case=Dat|Gender=Fem|Number=Sing 7 nmod _ _ 11 a a PART Ug PartType=Inf 12 mark:prt _ _ -12 chur cuir NOUN Nv VerbForm=Inf 3 xcomp _ _ +12 chur cuir NOUN Nv VerbForm=Inf 3 csubj:cop _ _ 13-14 'na _ _ _ _ _ _ _ _ 13 an an ADP Sp _ 15 case _ _ 14 a a PRON Dp3sm Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 15 nmod:poss _ _ @@ -6109,7 +6110,7 @@ 28 ise i PRON Pp3sf-e Form=Emp|Gender=Fem|Number=Sing|Person=3|PronType=Prs 27 nsubj _ SpaceAfter=No 29 , , PUNCT Fi _ 27 punct _ _ 30 " " PUNCT Fq _ 31 punct _ SpaceAfter=No -31 sealltainn seall NOUN Nv VerbForm=Vnoun 23 xcomp _ _ +31 sealltainn seall NOUN Nv VerbForm=Vnoun 23 csubj:cop _ _ 32 suas suas ADV Rs AdvType=Loc 31 advmod _ SpaceAfter=No 33 " " PUNCT Fz _ 34 punct _ _ 34 ars' arsa VERB V-s Mood=Ind|Tense=Past|VerbForm=Fin 31 parataxis _ _ @@ -6199,7 +6200,7 @@ 27 phàrtaidh pàrtadh NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 24 nmod _ _ 28 Làbarach làbarach ADJ Aq-smd Case=Dat|Gender=Masc|Number=Sing 27 amod _ _ 29 a a PART Ug PartType=Inf 30 mark:prt _ _ -30 bhith bi NOUN Nv VerbForm=Inf 24 xcomp _ _ +30 bhith bi NOUN Nv VerbForm=Inf 24 csubj:cop _ _ 31 a' ag PART Sa _ 32 case _ _ 32 feuchainn feuch NOUN Nv VerbForm=Vnoun 30 xcomp:pred _ _ 33 ris ri ADP Sp _ 35 case _ _ @@ -6291,7 +6292,7 @@ 10 choir choir NOUN Uf _ 7 ccomp _ _ 11 cead cead NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 13 obj _ _ 12 a a PART Ug PartType=Inf 13 mark:prt _ _ -13 thoirt toir NOUN Nv VerbForm=Inf 10 xcomp _ _ +13 thoirt toir NOUN Nv VerbForm=Inf 10 csubj:cop _ _ 14 don don ADP Sp _ 16 case _ _ 15 a’ an DET Tdsm Definite=Def|Gender=Masc|Number=Sing|PronType=Art 16 det _ _ 16 Bhall-Pàrlamaid ball-pàrlamaid NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 13 obl _ _ @@ -6381,7 +6382,7 @@ 4 fireannach fireannach NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 7 obj _ _ 5 eile eile ADJ Aq-smn Case=Nom|Gender=Masc|Number=Sing 4 amod _ _ 6 a a PART Ug PartType=Inf 7 mark:prt _ _ -7 thogail tog NOUN Nv VerbForm=Inf 3 xcomp _ _ +7 thogail tog NOUN Nv VerbForm=Inf 3 csubj:cop _ _ 8 far far ADP Sp _ 9 case _ _ 9 beanntan beinn NOUN Ncpfn Case=Nom|Gender=Fem|Number=Plur 7 obl _ _ 10 Gleann Gleann PROPN Nt NounType=Top 9 nmod _ _ @@ -7069,7 +7070,7 @@ 17 cus cus NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 20 obj _ _ 18 iomagain iomagain NOUN Ncsfg Case=Gen|Gender=Fem|Number=Sing 17 nmod _ _ 19 a a PART Ug PartType=Inf 20 mark:prt _ _ -20 ghabhail gabh NOUN Nv VerbForm=Inf 14 xcomp _ _ +20 ghabhail gabh NOUN Nv VerbForm=Inf 14 csubj:cop _ _ 21 mu mu ADP Sp _ 23 case _ _ 22 cho cho ADV Rg AdvType=Man 23 advmod _ _ 23 beag beag ADJ Ap _ 20 obl _ _ @@ -8021,7 +8022,7 @@ 32 gur is AUX Wpdia Polarity=Aff|Tense=Pres 33 cop _ _ 33 dòcha dòcha NOUN Uf _ 18 conj _ _ 34 gun gu PART Qa PartType=Cmpl 35 mark:prt _ _ -35 leig leig VERB V-f--d Mood=Ind|Tense=Fut|VerbForm=Fin 33 ccomp _ _ +35 leig leig VERB V-f--d Mood=Ind|Tense=Fut|VerbForm=Fin 33 csubj:cop _ _ 36 iad iad PRON Pp3p Number=Plur|Person=3|PronType=Prs 35 nsubj _ _ 37-38 dhuibh _ _ _ _ _ _ _ _ 37 do do ADP Sp _ 38 case _ _ @@ -10233,7 +10234,7 @@ 4 do do ADP Sp _ 5 case _ _ 5 sinn sinn PRON Pp1p Number=Plur|Person=1|PronType=Prs 3 nmod _ _ 6 a a PART Ug PartType=Inf 7 mark:prt _ _ -7 bhith bi NOUN Nv VerbForm=Inf 3 xcomp _ _ +7 bhith bi NOUN Nv VerbForm=Inf 3 csubj:cop _ _ 8 air air ADP Sp _ 10 case _ _ 9 ar ar DET Dp1p Number=Plur|Person=1|Poss=Yes|PronType=Prs 10 obj _ _ 10 faiceall faiceall NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 7 xcomp:pred _ _ @@ -10317,7 +10318,7 @@ 5 thu thu PRON Pp2s Number=Sing|Person=2|PronType=Prs 3 nmod _ _ 6 fhèin fèin PRON Px PronType=Prs|Reflex=Yes 5 nmod _ _ 7 a a PART Ug PartType=Inf 8 mark:prt _ _ -8 bhith bi NOUN Nv VerbForm=Inf 3 xcomp _ _ +8 bhith bi NOUN Nv VerbForm=Inf 3 csubj:cop _ _ 9 ’na an ADP Sp _ 11 case _ _ 10 do do DET Dp2s Number=Sing|Person=2|Poss=Yes|PronType=Prs 11 nmod:poss _ _ 11 chorra-ghritheach corra-ghritheach NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 8 xcomp:pred _ _ @@ -10615,7 +10616,7 @@ 41 Riaghaltas riaghaltas NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 38 nmod _ _ 42 a-nis a-nis ADV Rt AdvType=Tim 38 advmod _ _ 43 oifigeir oifigeir NOUN Ncsmn Case=Nom|Gender=Masc|Number=Sing 44 obj _ _ -44 fhastadh fast NOUN Nv VerbForm=Vnoun 38 xcomp _ _ +44 fhastadh fast NOUN Nv VerbForm=Vnoun 38 csubj:cop _ _ 45-46 dhaib' _ _ _ _ _ _ _ _ 45 do do ADP Sp _ 46 case _ _ 46 iad iad PRON Pp3p Number=Plur|Person=3|PronType=Prs 44 obl _ _ @@ -11109,13 +11110,13 @@ 6 e e PRON Pp3sm Gender=Masc|Number=Sing|Person=3|PronType=Prs 2 xcomp:pred _ _ 7 gum gu PART Qa PartType=Cmpl 9 mark:prt _ _ 8 bu is AUX Ws Tense=Past 9 cop _ _ -9 chòir chòir NOUN Uf _ 2 ccomp _ _ +9 chòir chòir NOUN Uf _ 3 acl _ _ 10-11 dhan _ _ _ _ _ _ _ _ 10 do do ADP Sp _ 12 case _ _ 11 an an DET Tds Definite=Def|Number=Sing|PronType=Art 12 det _ _ 12 oifigeach oifigeach NOUN Ncsmd Case=Dat|Gender=Masc|Number=Sing 9 nmod _ _ 13 a a PART Ug PartType=Inf 14 mark:prt _ _ -14 bhith bi NOUN Nv VerbForm=Inf 9 xcomp _ _ +14 bhith bi NOUN Nv VerbForm=Inf 9 csubj:cop _ _ 15 ag ag PART Sa _ 16 case _ _ 16 obair obraich NOUN Nv VerbForm=Vnoun 14 xcomp:pred _ _ 17 gu gu PART Ua PartType=Ad 18 mark:prt _ _ @@ -11270,17 +11271,18 @@ 37 thàladh tàl NOUN Nv VerbForm=Inf 27 xcomp _ SpaceAfter=No 38 . . PUNCT Fe _ 21 punct _ _ +# comment = 2024-12-16: in the standard language there would be a "do" before "leisg" # sent_id = pw11_026 # text = “Cha bu chòir leisg sam bith a bhith oirnn a bhith ag iarraidh sin, oir aig a’ cheann thall se obair air leth cudromach a bhios an seo, a nì ceangal eadar a’ Phàrlamaid agus coimhearsnachd nan Gaidheal." 1 “ “ PUNCT Fq _ 4 punct _ SpaceAfter=No 2 Cha cha PART Qn PartType=Vb|Polarity=Neg 4 mark:prt _ _ 3 bu is AUX Ws Tense=Past 4 cop _ _ 4 chòir chòir NOUN Uf _ 0 root _ _ -5 leisg leisg NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 4 nsubj _ _ +5 leisg leisg NOUN Ncsfn Case=Nom|Gender=Fem|Number=Sing 4 nmod _ _ 6 sam sam ADJ Aq ExtPos=ADJ 5 amod _ _ 7 bith bi ADJ Aq _ 6 fixed _ _ 8 a a PART Ug PartType=Inf 9 mark:prt _ _ -9 bhith bi NOUN Nv VerbForm=Inf 4 xcomp _ _ +9 bhith bi NOUN Nv VerbForm=Inf 4 csubj:cop _ _ 10-11 oirnn _ _ _ _ _ _ _ _ 10 air air ADP Sp _ 11 case _ _ 11 sinn sinn PRON Pp1p Number=Plur|Person=1|PronType=Prs 9 xcomp:pred _ _ diff --git a/not-to-release/validate_gd_extras.py b/not-to-release/validate_gd_extras.py index af4a2da..da968ef 100644 --- a/not-to-release/validate_gd_extras.py +++ b/not-to-release/validate_gd_extras.py @@ -203,6 +203,13 @@ def check_heads_for_upos(sentence) -> int: return errors def check_reported_speech(sentence) -> int: + """ + See https://universaldependencies.org/u/dep/ccomp.html + + Reported speech is ccomp of the verb of saying except where that interrupts speech, in which + case parataxis is used. + In that case the speech verb attaches to the root of the reported speech. + """ errors = 0 return errors @@ -293,14 +300,41 @@ def suggest_relative_deprel(deprels) -> str: return "nsubj" return "obj" +def check_csubj(sentence) -> int: + """ + Checks that the heads of the cop relation do not have nodes linked to them that should be linked by csubj:cleft or csubj:cop. + Candidate relations are acl, ccomp and xcomp. + + Returns an integer with the count of errors. + """ + errors = 0 + ids = {} + deprels = {} + csubj_candidates = ["xcomp", "acl", "ccomp"] + cop_heads = [t.head for t, _ in ud_words(sentence, lambda t: t.deprel == "cop")] + allowed_deprels = ["csubj:cleft", "csubj:cop", "nsubj"] + for token, _ in ud_words(sentence, lambda t: t.head in cop_heads and t.deprel in csubj_candidates or t.deprel in allowed_deprels): + if token.head in ids: + ids[token.head].append(token.id) + deprels[token.head].append(token.deprel) + else: + ids[token.head] = [token.id] + deprels[token.head] = [token.deprel] + for key in deprels: + stub = f"E {sentence.id} {key}" + if "csubj:cop" not in deprels[key] and "csubj:cleft" not in deprels[key] and "nsubj" not in deprels[key]: + print(f"{stub} head of cop should have a csubj:* among {list(zip(ids[key], deprels[key]))}") + errors +=1 + return errors + def check_bi(sentence) -> int: """ - Checks that the verb _bi_ has a node linked to it by xcomp:pred if there are any suitable nodes. - These are obl, xcomp, obl:smod and advmod. + Checks that the verb _bi_ does not have a node linked to it that should be linked by xcomp:pred. + Candidate relations are obl, xcomp, obl:smod and advmod. Note that in the last case there are adverbs that won't be suitable if they are adverbs of time. We also use OblType in the MISC column for phrases like "mar eisimpleir" = 'for example'. - Returns an integer errors. + Returns an integer with the count of errors. """ errors = 0 ids = {} @@ -444,6 +478,7 @@ def validate_corpus(corpus): total_errors += check_target_deprels(tree) total_errors += check_target_upos(tree) total_errors += check_bi(tree) + total_errors += check_csubj(tree) total_errors += check_reported_speech(tree) total_errors += check_passive(tree) total_errors += check_relatives(tree)