Skip to content

Commit

Permalink
new version for 2023
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgtied committed Sep 26, 2023
1 parent 6274508 commit 55a62fe
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 5 deletions.
58 changes: 56 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,10 @@ OPUSMT_HOMEDIR = ../Opus-MT-train

## current date as printed by `date +%F` (YYYY-MM-DD), evaluated once
TODAY := $(shell date +%F)

## version tag in use; previously used values are kept as comments
# VERSION = v2020-07-28
# VERSION = v2021-08-07
VERSION = v2023-09-26

## Tatoeba version tag in use; previously used values are kept as comments
TATOEBA_VERSION = v2023-04-12
# TATOEBA_VERSION = v2020-05-31
# TATOEBA_VERSION = v20190709


Expand Down Expand Up @@ -764,7 +766,59 @@ FIXLANGIDS = | sed 's/ze_zh/zh/g;s/_Hani//g;s/-han[st]//g;s/zht/zh_TW/g;s/zhs/zh
## create training data by concatenating all data sets
## using normalized language codes (macro-languages)


## Create the training data for one language pair.
##
## The pattern stem ($*) names the language pair: its first '-'-separated
## component is used as source language, its last one as target language.
## Both are expanded with find_opus_langs.pl against ${OPUS_LANGS}, and for
## every resulting combination (e,f) the OPUS API is asked for the latest
## moses-formatted corpora. Each corpus whose name does not match an entry of
## ${EXCLUDE_CORPORA} is downloaded and unpacked into train.d, both sides are
## pasted together, passed through ${BASIC_FILTERS} and bitext-match-lang.py,
## and appended to $@.tmp1 as five TAB-separated columns:
##
##   corpus-version  source-label  target-label  source-text  target-text
##
## (the two label columns come from `langscript`; presumably language/script
##  IDs - confirm against that tool)
##
## If $@.tmp1 is non-empty it is shuffled, filtered with exclude-devtest.pl
## against the test/dev files next to the target, and split into
##
##   train.src.gz   column 4
##   train.trg.gz   column 5
##   $@             columns 1-3
##
## If nothing was collected, none of these files is written.
##
## Notes:
##  - when e and f are identical the corpus files are looked up with the
##    extensions "<e>1" and "<f>2" instead of "<e>" and "<f>"
##  - the corpus label has to be joined with a TAB (not a space), otherwise
##    the cut -f4 / -f5 / -f1,2,3 calls below select the wrong columns
##  - string comparison uses the POSIX `=` so the recipe also works when
##    the recipe shell is not bash
##  - no comments inside the continued shell command: a `#` there would
##    swallow the rest of the joined line

${RELEASEDIR}/%/train.id.gz:
	@echo "make train data for $*"
	@rm -f $@.tmp1 $@.tmp2
	@mkdir -p ${dir $@}train.d
	@( s=${firstword ${subst -, ,$*}}; \
	   t=${lastword ${subst -, ,$*}}; \
	   tab=`printf '\t'`; \
	   E=`${SCRIPTDIR}/find_opus_langs.pl $$s ${OPUS_LANGS}`; \
	   F=`${SCRIPTDIR}/find_opus_langs.pl $$t ${OPUS_LANGS}`; \
	   for e in $$E; do \
	     for f in $$F; do \
	       if [ "$$e" = "$$f" ]; then a=$${e}1;b=$${f}2; \
	       else a=$${e};b=$${f}; fi; \
	       for z in `wget -O - -q "https://opus.nlpl.eu/opusapi/?source=$$e&target=$$f&preprocessing=moses&version=latest" | sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' | grep 'url:' | cut -f2- -d:`; do \
	         c=`echo "$$z" | cut -f4 -d/ | sed 's/^OPUS-//'`; \
	         v=`echo "$$z" | cut -f5 -d/`; \
	         if [ `echo '${EXCLUDE_CORPORA}' | tr ' ' "\n" | grep "$$c" | wc -l` -eq 0 ]; then \
	           echo "downloading $$c-$$v"; \
	           wget -q -O ${dir $@}train.d/moses.zip $$z; \
	           unzip -qq -n -d ${dir $@}train.d ${dir $@}train.d/moses.zip; \
	           rm -f ${dir $@}train.d/moses.zip; \
	           paste ${dir $@}train.d/*.$$a ${dir $@}train.d/*.$$b ${BASIC_FILTERS} |\
	           ${SCRIPTDIR}/bitext-match-lang.py -s $$e -t $$f > $@.tmp2; \
	           rm -f ${dir $@}train.d/*; \
	           if [ -e $@.tmp2 ]; then \
	             cut -f1 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$e -r -D > $@.tmp2srcid; \
	             cut -f2 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$f -r -D > $@.tmp2trgid; \
	             paste $@.tmp2srcid $@.tmp2trgid $@.tmp2 | sed "s/^/$$c-$$v$${tab}/" >> $@.tmp1; \
	             rm -f $@.tmp2 $@.tmp2srcid $@.tmp2trgid; \
	           fi \
	         else \
	           echo "exclude $$c-$$v"; \
	         fi \
	       done \
	     done \
	   done \
	)
	if [ -s $@.tmp1 ]; then \
	  ${SHUFFLE} < $@.tmp1 |\
	  scripts/exclude-devtest.pl -a -l \
		${dir $@}test.src ${dir $@}test.trg \
		${dir $@}dev.src  ${dir $@}dev.trg > $@.tmp2; \
	  cut -f4 $@.tmp2 | ${GZIP} -c > ${dir $@}train.src.gz; \
	  cut -f5 $@.tmp2 | ${GZIP} -c > ${dir $@}train.trg.gz; \
	  cut -f1,2,3 $@.tmp2 | ${GZIP} -c > $@; \
	fi
	rm -f $@.tmp1 $@.tmp2
	rmdir ${dir $@}train.d



DEPRECATED/${RELEASEDIR}/%/train.id.gz:
@echo "make train data for ${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}"
@rm -f $@.tmp1 $@.tmp2
@mkdir -p ${dir $@}train.d
Expand Down
2 changes: 1 addition & 1 deletion opus-langpairs.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion opus-langpairs3.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion opus-langs.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
aa ab ace ach acm acu ada ady ae aeb aed af afb afh agr aha aii ain ajg ajp ak aka ake akl aln alt alz am amh ami amu an ang anp aoc aoz apc ar ara arc arh arn arq ar_SY ar_TN ary arz as ase asf ast ati atj av avk awa ay ayl aym az azb az_Cyrl az_IR azz ba bal bam ban bar bas bbc bbj bci bcl be bem ber be_tarask bfi bfz bg bg_BG bh bho bhw bi bin bjn bm bn bn_bd bn_in bn_IN bnt bo bod bpy br brx bs bsn btg bts btx bua bug bum bvl bvy bxr byn byv bzj bzs bzt ca cab cac cak cat ca_valencia ca_x_vlc cay cb cbk cbk_zam cce cdo ce ceb ch chf chg chj chk chn cho chq chr chw chy cjk cjp cjy ckb ckt cku cmn cmn_Hans cmn_Hant cmo cn cnh cni co cop cpi cr crh crh_latn crk crp crs cs csb cse csf csg csl csn csr cto ctu cu cuk cv cx cy cyo da da_DK daf de de_AT de_CH de_DE dga dhv dik din diq dje djk dng dnj dop drt dsb dtp dty dua dv dws dyu dz ecs ee efi egl el eml emx en en_AU en_ca en_CA en_gb en_GB enm en_NZ en_US en_za en_ZA eo es es_ar es_AR es_cl es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_mx es_MX esn es_NI es_PA es_PE es_PR es_SV es_UY es_VE et eu eu_ES evn ewo ext fa fa_AF fa_IR fan fat fcs ff fi fil fj fkv fo fon foo fr fr_BE fr_ca fr_CA fr_FR frm fro frp frr fse fsl fuc ful fur fuv fy fy_nl ga gaa gag ga_ie gan gbi gbm gcf gcr gd gil gl glk gn gom gor gos got gr grc gsg gsm gss gsw gu guc gug gu_in gum gur guw gv gxx gym ha hai hak hau haw hax haz hb hbo hbs hch hdn hds he hi hif hi_in hi_IN hil him hmn hne hnj ho hoc hr hrx hsb hsh hsn ht hu hup hus hy hy_am hy_arevmda hye hye_x_hma hye_x_hms hyw hz ia iba ibg ibo ic id ie ig ii ik ike ilo inh inl ins io iro is ise ish iso it it_IT iu ixl izh ja jak jam jap jbo jdt jiv jmx jp jpa jsl jv ka kaa kab kac kam kar kau kbd kbh kbp kea kek kg kha ki kik kin kiu kj kjh kk kk_Arab kk_Cyrl kl klj km kmb kmr kmr_Cyrl kmr_latn kmr_x_rdu kn ko koi kok kon koo kpv kqn kr krc kri krl ks ksh kss ksw ku kum kv kvk kw kwn kwy kxi ky kzj la laa lad lam lb lbe ldn lez lfn lg li lij lin liv lkt lld lmo ln lo lou loz lrc lsp lt 
ltg lu lua lue lun luo lus lut luy lv lzh lzz mad mai mam map_bms mau max maz mco mcp mdf me meh men mfe mfs mg mgm mgr mh mhr mi mic min miq mix mk ml mlg mn mnc mni mnw mo moh mos mr mrj mrq ms ms_MY mt mus mvv mwl mww mxv my myv mzn mzy na nah nan nap naq_x_dmr nb nba nb_no nb_NO nch nci ncj ncs ncx nd ndc nds nds_nl ne ne_np new ng ngl ngt ngu nhg nhk nhn nia nij niu nl nl_BE nl_NL nlv nn nnh nn_no nn_NO no nog non no_nb nov npi nqo nr nrm ns nso nso_x_spl nst nus nv ny nya nyk nyn nys nyu nzi oar oc ofs oj ojb oke olo om ood or orm orv os osp osx os_x_dgr ota ote otk pa pag pai pa_in pal pam pan pap pap_x_paa pau pbb pcd pck pcm pdc pdt pes pfl phn pi pid pih pis pl plt pms pmy pnb pnt po pon pot ppk ppl prg prl prs ps pso psp psr pt pt_br pt_BR pt_PT pys qa qd qu quc que qug qus quw quy quz qvi qvz qxl qxq qya rap rar rcf rel rif rm rmc_sk rmn rmn_Cyrl rmn_x_rmg rms rm_sursilv rm_vallader rmy rmy_AR rmy_x_rmv rn rnd ro roa_tara rom rsl ru rue run rup rw ry ryu sa sah sat sbs sc scc scn sco scr sd sdh se seh ses sfs sfw sg sgn sgn_AO sgs sh shi shn shs shy si sid sjn sk skg_x_vz skr sl sm sma sml smn sn sna so som son sop sot sq sqk sr sr_Cyrl sr_Latn srm sr_ME srn srp ss ssp st stq su sux sv svk sv_se sw swa swc swg swh sxn syc syr sz szl ta ta_LK tc tcf tcy tdt tdx te tet tg tg_TJ th thv ti tig tir tiv tk tk_Cyrl tkl tl tlh tll tl_PH tly tmh tmp tmr tmw tn to tob tog toh toi toi_zw toj top tpi tpw tr trs tr_TR trv ts tsc tso_MZ tss tsz tt ttj tts tum tvl tw ty tyv tz tzh tzl tzo udm ug ug_Cyrl uk umb ur urh ur_PK usp uz uz_Cyrl uz_Latn ve vec vep vi vi_VN vls vmw vo vot vro vsl wa wae wal war wba wes wes_ng wls wlv wo wol wuu xal xcl xh xho xmf xnz xpe xqa yao yap yaq ybb yi yo yom_x_ibi yor yua yue yue_Hans yue_Hant za zab zai zam zdj zea ze_en ze_zh zgh zh zh_cn zh_CN zh_en zh_hk zh_HK zhs zht zh_tw zh_TW zh_yue zh_zh zib zlm zne zpa zpg zsl zsm zu zul zz zza
aa ab ace ach acm acu ady ae aeb af afb afh agr aii ain ajp ak aka ake akl aln alt am amh ami amu an ang anp aoz apc ar ara arc arn arq ar_SY ar_TN ary arz as ase ast atj av avk awa ay ayl aym az az_Arab azb az_IR azz ba bal bam ban bar bas bcl be bem ber be_tarask bfz bg bg_BG bh bho bi bjn bm bn bn_bd bn_in bn_IN bnt bo bod bom bpy br brx bs bsn bua bug bvy bxr byn bzt ca cak cat ca_valencia cay cb cbk cbk_zam cdo ce ceb ch chg chn cho chq chr chv chy cjk cjp cjy ckb ckt cku cmn cmo cn cnh cni co cop cpi cr crh crh_latn crk crp crs cs csb cu cv cx cy cycl cyo da da_DK dag de de_AT de_CH de_DE dik din diq dje djk dng dop drt dsb dtp dty dv dws dyu dz ee efi egl el eml emx en en_AU en_ca en_CA en_gb en_GB enm en_NZ en_US en_za en_ZA eo es es_ar es_AR es_cl es_CL es_CO es_CR es_DO es_EC es_ES es_GT es_HN es_mx es_MX es_NI es_PA es_PE es_PR es_SV es_UY es_VE et eu eu_ES evn ext fa fa_AF fa_IR ff fi fil fj fkv fo fon foo fr fr_BE fr_ca fr_CA fr_FR frm fro frp frr fuc ful fur fuv fy fy_nl ga gaa gag ga_ie gan gbi gbm gcf gcr gd gil gl glk gn gom gor gos got gr grc gsw gu guc gu_in guw gv ha hai hak hau haw hax haz hb hbo hbs hch hdn he hi hif hi_in hi_IN hil him hne hnj ho hoc hr hrx hsb hsn ht hu hup hus hy hy_am hye hyw hz ia iba ibo ic id ie ig igs ii ik ike ilo inh io iro is it it_IT iu ixl izh ja jak jam jap jbo jdt jiv jp jpa jv ka kaa kab kac kam kar kau kbd kbh kbp kcg kea kek kg kha ki kik kin kiu kj kjh kk kl klj km kmb kmr kn knc ko koi kok kon kpv kr kr_Arab krc krl kr_Latn ks ks_Arab ks_Deva ksh ku ku_Arab ku_Latn kum kv kw kxi ky kzj la laa lad lb lbe ldn lez lfn lg li lij lin liv lkt lld lmo ln lo lou lrc lt ltg lu lua luo lus lut luy lv lzh lzz mad mag mai mam map_bms max md mdf me meh mfa mfe mg mgm mh mhr mi mic mik min miq mix mk ml mlg mn mnc mni mnr mnw mo moh mos mr mrj ms ms_MY mt mus mvv mwl mww my myv mzn na nah nan nap nb nb_no nb_NO nch nci nd nds nds_nl ne ne_np new ngt ngu nhg nhn nia niu nl nl_BE nl_NL nlv nn nnb nn_no nn_NO no nog non 
no_nb nov npi nqo nr nrm ns nso nst nus nv ny nya nys oar oc ofs oj ojb olo om ood or orm orv os osp osx ota otk pa pag pai pa_in pal pam pan pap pau pcd pck pcm pdc pes pfl phn pi pih pl plt pms pmy pnb pnt po pot ppk ppl prg prs ps pt pt_br pt_BR pt_PT py qa qd qu quc que quw quy quz qvi qxq qya rap rel rhg rif rm rm_sursilv rm_vallader rmy rn ro roa_tara rom ru rue run rup rw ry ryu sa sah sat sc scc scn sco scr sd sdh se ses sg sgn sgs sh shi shn shs shy si sjn sk skr sl sm sma sml smn sn sna so som son sot sq sr sr_ME srn srp ss st stq su sux sv sv_se sw swa swc swg swh syc syl syr sz szl szy ta ta_LK taq tc tcy te tet tg tg_TJ th thv ti tig tir tk tkl tl tlh tl_PH tly tmh tmp tmr tmw tmx tn to toi tok toki tpi tpw tr trs tr_TR trv ts tsz tt tts tum tvl tw ty tyv tz tzl tzm udm ug uk umb ur urh ur_PK usp uz v1 ve vec vep vi vi_VN vls vo vot vro wa wae wal war wo wol wuu xal xcl xh xho xmf xnz xqa yaq yi yo yor yua yue za zam zea ze_en ze_zh zgh zh zh_cn zh_CN zh_en zh_hk zh_HK zhs zht zh_tw zh_TW zh_yue zh_zh zlm zsm zu zul zz zza

0 comments on commit 55a62fe

Please sign in to comment.