From 32d5955aae298d2ca8eb557d5e4b7c6ca7f07504 Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Wed, 17 Aug 2022 13:21:26 +0200 Subject: [PATCH 1/8] Add Basque segmenter support --- MANIFEST.in | 2 + README.md | 2 +- cvutils/data/eu/abbr.tsv | 200 +++++++++++++++++++++++++++++++++++ cvutils/data/eu/punct.tsv | 3 + cvutils/data/eu/validate.tsv | 20 ++++ 5 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 cvutils/data/eu/abbr.tsv create mode 100644 cvutils/data/eu/punct.tsv diff --git a/MANIFEST.in b/MANIFEST.in index a3eecfa..e4c6ab2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -380,6 +380,8 @@ include cvutils/data/es/vocab.tsv include cvutils/data/eu include cvutils/data/eu/alphabet.txt include cvutils/data/eu/validate.tsv +include cvutils/data/eu/punct.tsv +include cvutils/data/eu/abbr.tsv include cvutils/data/eu/phon.tsv include cvutils/data/eu/vocab.tsv include cvutils/data/fr diff --git a/README.md b/README.md index d82562a..bb7fefb 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,7 @@ A-hend-all e vez gounezet arc'hant dre chaseal ha pesketa. | Spanish | Español |`spa` | `es` |`es`| ✔ | ✔ | ✔ | | | Erzya | Эрзянь кель |`myv` | `myv` |`myv`| | ✔ | ✔ | | | Estonian | Eesti |`est` | `et` |`et`| ✔ | ✔ | ✔ | | -| Basque | Euskara |`eus` | `eu` |`eu`| ✔ | ✔ | ✔ | | +| Basque | Euskara |`eus` | `eu` |`eu`| ✔ | ✔ | ✔ | ✔ | | Persian | فارسی |`pes` | `fa` |`fa`| — | ✔ | ✔ | | | Finnish | Suomi |`fin` | `fi` |`fi`| ✔ | ✔ | ✔ | | | French | Français |`fra` | `fr` |`fr`| — | ✔ | ✔ | | diff --git a/cvutils/data/eu/abbr.tsv b/cvutils/data/eu/abbr.tsv new file mode 100644 index 0000000..7903bd9 --- /dev/null +++ b/cvutils/data/eu/abbr.tsv @@ -0,0 +1,200 @@ +1 [0-9]+. +1 [MDCLXVI]+. +1 abe. +1 abu. +1 ad. +1 adb. +1 adib. +1 adj. +1 Aeron. +1 aip. +1 al. +1 Albait. +1 a.m. +1 Anat. +1 and. +1 año. +1 Antr. +1 Antz. +1 api. +1 aptu. +1 ar. +1 Ar. +1 arg. +1 Arkeol. +1 Arkit. +1 Arm. +1 arrt. +1 Art. +1 as. +1 Asron. +1 Astrol. +1 Astronaut. +1 Autom. 
+1 az. +1 aza. +1 banatz. +1 bas. +1 Beh. +1 bibliog. +1 Biokim. +1 Biol. +1 Biz. +1 Bot. +1 e.a. +1 e.b. +1 eka. +1 Ekol. +1 Ekon. +1 Elektr. +1 Elektron. +1 enp. +1 er. +1 Eraik. +1 erak. +1 Erl. +1 esk. +1 eskra. +1 esr. +1 esr.zah. +1 etab. +1 etc. +1 etorb. +1 etx. +1 ezk. +1 Fil. +1 Fin. +1 Fis. +1 fra. +1 g. +1 gald. +1 G.b. +1 G.B. +1 Geogr. +1 Geol. +1 g. +1 Gip. +1 Graf. +1 Gram. +1 h. +1 hed. +1 Heg. +1 herr. +1 Hezk. +1 Hirgz. +1 hirib. +1 Hist. +1 Hizkl. +1 ibde. +1 id. +1 ig. +1 ik. +1 Ik. +1 ind. +1 Inform. +1 interj. +1 Ipar. +1 ira. +1 Itsas. +1 iz. +1 izlag. +1 izond. +1 izord. +1 izpta. +1 jn. +1 junt. +1 k. +1 K.a. +1 Kim. +1 Kir. +1 k.k. +1 K.o. +1 Kont. +1 koop. +1 Koreogr. +1 lab. +1 Lap. +1 lgart. +1 Lit. +1 Log. +1 lok. +1 lr. +1 mai. +1 maiusk. +1 mar. +1 Mar. +1 Mat. +1 Meatz. +1 Med. +1 mend. +1 Metal. +1 Meteorol. +1 Metr. +1 Mikol. +1 Mikrob. +1 Mil. +1 Miner. +1 Mit. +1 Mus. +1 Naf. +1 N.B. +1 Nekaz. +1 neol. +1 O.E. +1 og. +1 ol. +1 onom. +1 or. +1 ord. +1 orok. +1 orr. +1 ots. +1 Paleont. +1 partik. +1 pas. +1 P.-B. +1 P.D. +1 Pedag. +1 Pint. +1 P.-K. +1 P.-Ku. +1 pl. +1 p.m. +1 pol. +1 Pol. +1 postp. +1 prob. +1 P.S. +1 Psikiatr. +1 Psikol. +1 pta. +1 pzta. +1 R.I.P. +1 sin. +1 sing. +1 sol. +1 Soziol. +1 sp. +1 stua. +1 subsp. +1 Teknol. +1 tel. +1 tf. +1 urr. +1 urt. +1 uzt. +1 zah. +1 zbko. +1 zehazt. +1 zehaztgb. +1 zehb. +1 zenb. +1 zenbtz. +1 z.g. +1 Zin. +1 zk. +1 Zool. +1 ztua. +1 Zub. +1 zum. +1 Zuz. diff --git a/cvutils/data/eu/punct.tsv b/cvutils/data/eu/punct.tsv new file mode 100644 index 0000000..7ca258e --- /dev/null +++ b/cvutils/data/eu/punct.tsv @@ -0,0 +1,3 @@ +EOS ! +EOS ? +EOS . diff --git a/cvutils/data/eu/validate.tsv b/cvutils/data/eu/validate.tsv index 22ca6e7..e76ebb8 100644 --- a/cvutils/data/eu/validate.tsv +++ b/cvutils/data/eu/validate.tsv @@ -9,7 +9,27 @@ REPL ; _ REPL ? 
_ REPL ‘ _ REPL ’ _ +NORM ´ ' +NORM ` ʼ +NORM ' ʼ +NORM ‘ ' +NORM ‘ ʼ +NORM ’ ' +NORM ’ ʼ +NORM “ " +NORM ” " +NORM « " +NORM » " +NORM ‐ - +NORM ‟ " +NORM ʼ ' +NORM á a +NORM é e NORM í i +NORM ó o +NORM ú u +NORM ü u +NORM ç c ALLOW a _ ALLOW b _ ALLOW c _ From e0f5de13d3cc381af2822f908336a509cb79b81a Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Wed, 17 Aug 2022 13:21:58 +0200 Subject: [PATCH 2/8] Add .gitignore including build directory --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d948958 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/build +/commonvoice_utils.egg-info From e3c1a940f3fa7dc0f170dfb0a2bf81961d08bf1b Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Fri, 19 Aug 2022 11:57:09 +0200 Subject: [PATCH 3/8] Normalize French diacritics in Basque --- cvutils/data/eu/validate.tsv | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cvutils/data/eu/validate.tsv b/cvutils/data/eu/validate.tsv index e76ebb8..6ea68ab 100644 --- a/cvutils/data/eu/validate.tsv +++ b/cvutils/data/eu/validate.tsv @@ -24,10 +24,21 @@ NORM ‐ - NORM ‟ " NORM ʼ ' NORM á a +NORM â a +NORM à a NORM é e +NORM ê e +NORM è e NORM í i +NORM î i +NORM ì i +NORM ï i NORM ó o +NORM ô o +NORM ò o NORM ú u +NORM û u +NORM ù u NORM ü u NORM ç c ALLOW a _ From 852bcc979e8d99ae5cc8c9d34f3f495b38ef5d62 Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Mon, 5 Sep 2022 21:50:57 +0200 Subject: [PATCH 4/8] Normalize some diacritics and Spanish consonants in Galician --- cvutils/data/gl/validate.tsv | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cvutils/data/gl/validate.tsv b/cvutils/data/gl/validate.tsv index 8e053ed..1f47376 100644 --- a/cvutils/data/gl/validate.tsv +++ b/cvutils/data/gl/validate.tsv @@ -56,3 +56,23 @@ ALLOW x _ 0078 _ ALLOW y _ 0079 _ ALLOW z _ 007a _ ALLOW _ _ 0020 _ +NORM â a +NORM à a +NORM ê e +NORM è e +NORM î i +NORM ì 
i +NORM ï i +NORM ô o +NORM ò o +NORM û u +NORM ù u +NORM ç c +NORM j i +NORM ka ca +NORM ko co +NORM ku cu +NORM ke que +NORM ki qui +NORM w u +NORM y i From 9df4743a1e9706631df1fe073a801868d6cf3b71 Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Mon, 5 Sep 2022 21:55:37 +0200 Subject: [PATCH 5/8] Add Galician support to the segmenter --- MANIFEST.in | 2 + README.md | 2 +- cvutils/data/gl/abbr.tsv | 371 ++++++++++++++++++++++++++++++++++++++ cvutils/data/gl/punct.tsv | 3 + 4 files changed, 377 insertions(+), 1 deletion(-) create mode 100644 cvutils/data/gl/abbr.tsv create mode 100644 cvutils/data/gl/punct.tsv diff --git a/MANIFEST.in b/MANIFEST.in index de9823f..4015918 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -27,6 +27,8 @@ include cvutils/data/ckt/phon.tsv include cvutils/data/gl include cvutils/data/gl/alphabet.txt include cvutils/data/gl/validate.tsv +include cvutils/data/gl/punct.tsv +include cvutils/data/gl/abbr.tsv include cvutils/data/gl/phon.tsv include cvutils/data/gl/vocab.tsv include cvutils/data/rm-vallader diff --git a/README.md b/README.md index 450ad20..7756cbc 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ A-hend-all e vez gounezet arc'hant dre chaseal ha pesketa. | Frisian | Frysk |`fry` | `fy-NL` |`fy`| | ✔ | ✔ | ✔ | | Igbo | Ásụ̀sụ́ Ìgbò |`ibo` | `ig` |`ig`| ✔ | ✔ | ✔ | | | Irish | Gaeilge |`gle` | `ga-IE` |`ga`| | ✔ | ✔ | | -| Galician | Galego |`glg` | `gl` |`gl`| ✔ | ✔ | ✔ | | +| Galician | Galego |`glg` | `gl` |`gl`| ✔ | ✔ | ✔ | ✔ | | Guaraní | Avañeʼẽ |`gug` | `gn` |`gn`| ✔ | ✔ | ✔ | | | Hindi | हिन्दी |`hin` | `hi` | `hi` | ✔ | ✔ | ✔ | | Hausa | Harshen Hausa |`hau` | `ha` |`ha` | ✔ | ✔ | ✔ | | diff --git a/cvutils/data/gl/abbr.tsv b/cvutils/data/gl/abbr.tsv new file mode 100644 index 0000000..c3ce096 --- /dev/null +++ b/cvutils/data/gl/abbr.tsv @@ -0,0 +1,371 @@ +1 a. +1 AA. +1 ab. +1 a.C. +1 acad. +1 acadca. +1 acadco. +1 acep. +1 adm. +1 admdor. +1 admdora. +1 admtva. +1 admtvo. +1 adv. +1 adx. +1 ag. +1 agr. 
+1 agrón. +1 alc. +1 alm. +1 alt. +1 a.m. +1 ampl. +1 and. +1 ant. +1 ap. +1 apdo. +1 aprox. +1 apto. +1 arq. +1 arquit. +1 art. +1 asdo. +1 asoc. +1 át. +1 aum. +1 aus. +1 aut. +1 aux. +1 avda. +1 axud. +1 bibl. +1 bibliog. +1 bl. +1 b.o. +1 bol. +1 c. +1 ca. +1 cant. +1 cap. +1 carr. +1 cast. +1 cat. +1 cát. +1 catedr. +1 célt. +1 cént. +1 cert. +1 ch. +1 cit. +1 cl. +1 clás. +1 cód. +1 coed. +1 col. +1 colab. +1 com. +1 comp. +1 conc. +1 constr. +1 cont. +1 convoc. +1 coord. +1 corp. +1 corrix. +1 cp. +1 cta. +1 cto. +1 d. +1 d.C. +1 dec. +1 del. +1 dem. +1 dep. +1 desp. +1 det. +1 dic. +1 dipl. +1 dir. +1 dir.ª +1 disp. +1 distr. +1 d.l. +1 doc. +1 dpto. +1 Dr. +1 Dra. +1 dta. +1 dto. +1 dupl. +1 d/v. +1 d.v. +1 d.x. +1 econ. +1 ed. +1 edit. +1 ef. +1 Em. +1 entr. +1 enx. +1 e.p.d. +1 epíl. +1 escr. +1 esp. +1 esq. +1 esqda. +1 esqdo. +1 est. +1 estat. +1 estr. +1 etc. +1 e.t.s. +1 e.u. +1 eusc. +1 éusc. +1 ex. +1 exc. +1 exped. +1 ext. +1 f. +1 fábr. +1 fac. +1 facs. +1 fact. +1 fasc. +1 feb. +1 fem. +1 fest. +1 fig. +1 fotogr. +1 fr. +1 fund. +1 fut. +1 gal. +1 gar. +1 gl. +1 gob. +1 gr. +1 gram. +1 h. +1 hab. +1 habit. +1 íb. +1 íd. +1 igr. +1 il. +1 ilustr. +1 imp. +1 imper. +1 imperf. +1 impers. +1 impr. +1 inc. +1 incl. +1 incompl. +1 ind. +1 índ. +1 indet. +1 inf. +1 infin. +1 info. +1 inform. +1 ing. +1 ins. +1 insep. +1 inst. +1 int. +1 inter. +1 interr. +1 interx. +1 intr. +1 introd. +1 invent. +1 irr. +1 it. +1 l. +1 lab. +1 lám. +1 lat. +1 lca. +1 lco. +1 ldo.lda. +1 lic. +1 licda. +1 licdo. +1 lit. +1 loc. +1 lonx. +1 ltda. +1 ltdo. +1 m. +1 maiúsc. +1 masc. +1 mat. +1 máx. +1 mc. +1 mecan. +1 med. +1 merc. +1 mercad. +1 min. +1 mín. +1 minist. +1 mod. +1 ms. +1 mt. +1 mun. +1 mús. +1 mz. +1 n. +1 nac. +1 n.do +1 n.doed. +1 neg. +1 nom. +1 not. +1 nov. +1 n.p. +1 ntva. +1 ntvo. +1 núm. +1 o. +1 obs. +1 of. +1 o.p. +1 op. +1 op.cit. +1 opús. +1 orix. +1 out. +1 p. +1 pal. +1 par. +1 parr. +1 part. +1 pat. +1 pav. +1 páx. +1 p.b. +1 P.D. +1 pdo. 
+1 pen. +1 per. +1 pers. +1 pl. +1 plu. +1 p.m. +1 p.m.a. +1 p.n. +1 pob. +1 pol. +1 port. +1 pos. +1 pr. +1 pral. +1 pref. +1 prelim. +1 prep. +1 pres. +1 prínc. +1 priv. +1 prnl. +1 proc. +1 prof. +1 pról. +1 pron. +1 prov. +1 próx. +1 P.S. +1 pta. +1 pte. +1 publ. +1 públ. +1 pza. +1 r. +1 rec. +1 red. +1 reed. +1 ref. +1 reg. +1 rel. +1 rev. +1 rex. +1 R.I.P. +1 r.p.m. +1 rte. +1 s. +1 S.A. +1 sáb. +1 s.d. +1 sec. +1 séc. +1 secr. +1 seg. +1 sent. +1 s.e.o.o. +1 serv. +1 set. +1 símb +1 símb. +1 sing. +1 s.l. +1 S.L. +1 s.l.s.a. +1 s.n. +1 sobr. +1 soc. +1 Sr. +1 Sra. +1 st. +1 Sta. +1 Sto. +1 subs. +1 subx. +1 sum. +1 sup. +1 supl. +1 suplem. +1 sus. +1 t. +1 téc. +1 tel. +1 teléf. +1 telegr. +1 test. +1 tfno. +1 tip. +1 tít. +1 tón. +1 trad. +1 trans. +1 trat. +1 trav. +1 trib. +1 tripl. +1 tv. +1 u. +1 ú. +1 últ. +1 univ. +1 urb. +1 v. +1 v. +1 Vde. +1 Vde/s. +1 ven. +1 venc. +1 vers. +1 v.gr. +1 vid. +1 vol. +1 VV. +1 x. +1 xan. +1 xer. +1 xll. +1 x.p. +1 xud. +1 xur. +1 xust. +1 xv. diff --git a/cvutils/data/gl/punct.tsv b/cvutils/data/gl/punct.tsv new file mode 100644 index 0000000..7ca258e --- /dev/null +++ b/cvutils/data/gl/punct.tsv @@ -0,0 +1,3 @@ +EOS ! +EOS ? +EOS . 
From dabdc24f3767c658f8639fda0ada796154c1b7dd Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Mon, 5 Sep 2022 22:07:24 +0200 Subject: [PATCH 6/8] =?UTF-8?q?Add=20a=20missing=20vowel=20in=20Galician:?= =?UTF-8?q?=20=C3=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cvutils/data/gl/alphabet.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvutils/data/gl/alphabet.txt b/cvutils/data/gl/alphabet.txt index 6c9656d..13142b9 100644 --- a/cvutils/data/gl/alphabet.txt +++ b/cvutils/data/gl/alphabet.txt @@ -1 +1 @@ -aábcdeéfghiílmnñoópqurstuúvxz +aábcdeéfghiílmnñoópquürstuúvxz From cd156af97b558d3066cf8eaf3fce7a71ca3f600a Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Mon, 5 Sep 2022 22:29:12 +0200 Subject: [PATCH 7/8] Complete Galician alphabet --- cvutils/data/gl/alphabet.txt | 2 +- cvutils/data/gl/validate.tsv | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/cvutils/data/gl/alphabet.txt b/cvutils/data/gl/alphabet.txt index 13142b9..60a38e3 100644 --- a/cvutils/data/gl/alphabet.txt +++ b/cvutils/data/gl/alphabet.txt @@ -1 +1 @@ -aábcdeéfghiílmnñoópquürstuúvxz +aábcdeéfghiíjklmnñoópqrstuúüvwxyz diff --git a/cvutils/data/gl/validate.tsv b/cvutils/data/gl/validate.tsv index 1f47376..29fd28f 100644 --- a/cvutils/data/gl/validate.tsv +++ b/cvutils/data/gl/validate.tsv @@ -68,11 +68,3 @@ NORM ò o NORM û u NORM ù u NORM ç c -NORM j i -NORM ka ca -NORM ko co -NORM ku cu -NORM ke que -NORM ki qui -NORM w u -NORM y i From 8f190f989feaf554ba09463f14d1c0dfad3b4f41 Mon Sep 17 00:00:00 2001 From: Xabier de Zuazo Date: Tue, 6 Sep 2022 09:59:40 +0200 Subject: [PATCH 8/8] Use Galician reduced alphabet --- cvutils/data/gl/alphabet.txt | 2 +- cvutils/data/gl/validate.tsv | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cvutils/data/gl/alphabet.txt b/cvutils/data/gl/alphabet.txt index 60a38e3..3414c39 100644 --- a/cvutils/data/gl/alphabet.txt +++ 
b/cvutils/data/gl/alphabet.txt @@ -1 +1 @@ -aábcdeéfghiíjklmnñoópqrstuúüvwxyz +aábcdeéfghiílmnñoópqrstuúüvxz diff --git a/cvutils/data/gl/validate.tsv b/cvutils/data/gl/validate.tsv index 29fd28f..f0ae18a 100644 --- a/cvutils/data/gl/validate.tsv +++ b/cvutils/data/gl/validate.tsv @@ -34,8 +34,6 @@ ALLOW g _ 0067 _ ALLOW h _ 0068 _ ALLOW i _ 0069 _ ALLOW í _ 00ed _ -ALLOW j _ 006a _ -ALLOW k _ 006b _ ALLOW l _ 006c _ ALLOW m _ 006d _ ALLOW n _ 006e _ @@ -51,9 +49,7 @@ ALLOW u _ 0075 _ ALLOW ú _ 00fa _ ALLOW ü _ 00fc _ ALLOW v _ 0076 _ -ALLOW w _ 0077 _ ALLOW x _ 0078 _ -ALLOW y _ 0079 _ ALLOW z _ 007a _ ALLOW _ _ 0020 _ NORM â a @@ -68,3 +64,11 @@ NORM ò o NORM û u NORM ù u NORM ç c +NORM j i +NORM ka ca +NORM ko co +NORM ku cu +NORM ke que +NORM ki qui +NORM w u +NORM y i