updated dev and test data (v2023-09-26)

Helsinki-NLP · Oct 9, 2023 · 8b14a10 · 8b14a10
1 parent 1d215d6
commit 8b14a10
Show file tree

Hide file tree

Showing 2,680 changed files with 798,588 additions and 799,272 deletions.
diff --git a/Makefile b/Makefile
@@ -51,6 +51,8 @@
 ## make update-git ............. update the git repository
 ##
 
+
+
 SHELL := bash
 CPU_MODULES := parallel
 
@@ -981,6 +983,42 @@ endif
 
 
 
+## one-off repair target for the Kurdish (kur) and Ladino (lad) bitexts:
+##
+## - 1st call: build train.srcid-backup.gz for every existing release
+##   directory kur-* and lad-* that has a train.id.gz
+## - 2nd call: build train.trgid-backup.gz for every existing release
+##   directory *-kur and *-lad that has a train.id.gz
+##
+## both kinds of targets are made by the pattern rules that fix the
+## langids on the source and on the target side of the training data
+##
+## the sub-makes are started via $(MAKE) (not a literal make) to pass on
+## command-line flags, the jobserver and dry-run mode (-n)
+## NOTE: if none of the wildcards matches, the sub-make is started without
+##       goals and therefore builds the default goal of this Makefile
+
+.PHONY: fix-ladino-and-kurdisch
+fix-ladino-and-kurdisch:
+	$(MAKE) $(patsubst %.id.gz,%.srcid-backup.gz,\
+		$(wildcard ${RELEASEDIR}/kur-*/train.id.gz) \
+		$(wildcard ${RELEASEDIR}/lad-*/train.id.gz))
+	$(MAKE) $(patsubst %.id.gz,%.trgid-backup.gz,\
+		$(wildcard ${RELEASEDIR}/*-kur/train.id.gz) \
+		$(wildcard ${RELEASEDIR}/*-lad/train.id.gz))
+
+## fix langids that are incorrect on the source side of the training data
+##
+## - run langscript on train.src.gz; the first language code of the
+##   directory name (e.g. kur for kur-eng) is passed via -l
+## - combine column 1 and column 3 of the existing train.id.gz with the
+##   new langscript output as the middle column
+## - the original train.id.gz is kept as train.srcid-backup.gz (the target)
+##
+## the new ID file is assembled completely (in $@.newid) before the
+## original file is moved to the backup name: the target does not exist
+## when make starts, so make may delete it when the recipe is interrupted,
+## and it must not hold the only copy of the IDs while langscript is running
+
+${RELEASEDIR}/%/train.srcid-backup.gz:
+	test -f ${@:srcid-backup.gz=id.gz}
+	${GZIP} -cd < ${@:srcid-backup.gz=src.gz} \
+	| langscript -3 -r -D -l $(firstword $(subst -, ,$(patsubst ${RELEASEDIR}/%/train.srcid-backup.gz,%,$@))) \
+	${FIXLANGIDS} > $@.srcid
+	${GZIP} -cd ${@:srcid-backup.gz=id.gz} | cut -f1 > $@.corpus
+	${GZIP} -cd ${@:srcid-backup.gz=id.gz} | cut -f3 > $@.trgid
+	paste $@.corpus $@.srcid $@.trgid | ${GZIP} -c > $@.newid
+	mv ${@:srcid-backup.gz=id.gz} $@
+	mv $@.newid ${@:srcid-backup.gz=id.gz}
+	rm -f $@.corpus $@.srcid $@.trgid
+	touch $@
+
+## fix langids that are incorrect on the target side of the training data
+##
+## - run langscript on train.trg.gz; the last language code of the
+##   directory name (e.g. kur for eng-kur) is passed via -l
+## - combine columns 1 and 2 of the existing train.id.gz with the new
+##   langscript output as the last column
+## - the original train.id.gz is kept as train.trgid-backup.gz (the target)
+##
+## the new ID file is assembled completely (in $@.newid) before the
+## original file is moved to the backup name: the target does not exist
+## when make starts, so make may delete it when the recipe is interrupted,
+## and it must not hold the only copy of the IDs while langscript is running
+
+${RELEASEDIR}/%/train.trgid-backup.gz:
+	test -f ${@:trgid-backup.gz=id.gz}
+	${GZIP} -cd < ${@:trgid-backup.gz=trg.gz} \
+	| langscript -3 -r -D -l $(lastword $(subst -, ,$(patsubst ${RELEASEDIR}/%/train.trgid-backup.gz,%,$@))) \
+	${FIXLANGIDS} > $@.trgid
+	${GZIP} -cd ${@:trgid-backup.gz=id.gz} | cut -f1,2 > $@.corpus
+	paste $@.corpus $@.trgid | ${GZIP} -c > $@.newid
+	mv ${@:trgid-backup.gz=id.gz} $@
+	mv $@.newid ${@:trgid-backup.gz=id.gz}
+	rm -f $@.corpus $@.trgid
+	touch $@
+
+
+
+
 
 DEPRECATED/${RELEASEDIR}/%/train.id.gz:
 	@echo "make train data for ${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}"
@@ -1105,10 +1143,17 @@ ${RELEASEDIR}/%/test.id:
 	mkdir -p ${dir $@}
 	cat ${patsubst ${RELEASEDIR}/%/test.id,${DEVTESTDIR}/%,$@}/test-*.txt |\
 	sed "s/ *\t/\t/g;s/ *$$//" | sort -u > $@.merged
-	cut -f1,2 $@.merged > $@
+	cut -f3 $@.merged | langscript -3 -r -D \
+		-l $(firstword $(subst -, , $(patsubst ${RELEASEDIR}/%/test.id,%,$@))) \
+		${FIXLANGIDS} > $@.srcid
+	cut -f4 $@.merged | langscript -3 -r -D \
+		-l $(lastword $(subst -, , $(patsubst ${RELEASEDIR}/%/test.id,%,$@))) \
+		${FIXLANGIDS} > $@.trgid
+	paste $@.srcid $@.trgid  > $@
+#	cut -f1,2 $@.merged > $@
 	cut -f3 $@.merged > ${dir $@}test.src
 	cut -f4 $@.merged > ${dir $@}test.trg
-	rm -f $@.merged
+	rm -f $@.merged $@.srcid $@.trgid
 
 ## dev data in the release: merge all cumulated dev data in data/devtest
 
@@ -1118,11 +1163,17 @@ ${RELEASEDIR}/%/dev.id:
 	-cat ${patsubst ${RELEASEDIR}/%/dev.id,${DEVTESTDIR}/%,$@}/dev-*.txt |\
 	sed "s/ *\t/\t/g;s/ *$$//" | sort -u > $@.merged
 	if [ -s $@.merged ]; then \
-	  cut -f1,2 $@.merged > $@; \
+	  cut -f3 $@.merged | langscript -3 -r -D \
+		-l $(firstword $(subst -, , $(patsubst ${RELEASEDIR}/%/dev.id,%,$@))) \
+		${FIXLANGIDS} > $@.srcid; \
+	  cut -f4 $@.merged | langscript -3 -r -D \
+		-l $(lastword $(subst -, , $(patsubst ${RELEASEDIR}/%/dev.id,%,$@))) \
+		${FIXLANGIDS} > $@.trgid; \
+	  paste $@.srcid $@.trgid  > $@; \
 	  cut -f3 $@.merged > ${dir $@}dev.src; \
 	  cut -f4 $@.merged > ${dir $@}dev.trg; \
 	fi
-	rm -f $@.merged
+	rm -f $@.merged $@.srcid $@.trgid
 
 
 ## add test and dev data from the Tatoeba release

diff --git a/README-v2023-09-26.md b/README-v2023-09-26.md
@@ -2,15 +2,16 @@
 
 # The Tatoeba Translation Challenge (v2023-09-26)
 
-This is a challenge set for machine translation that contains 33G translation units in 4,083 bitexts covering 488 languages. The package includes a release of 665 test sets derived from [Tatoeba.org](https://tatoeba.org) that cover 139 languages.
+This is a challenge set for machine translation that contains 32G translation units in 4,024 bitexts covering 487 languages. The package includes a release of 657 test sets derived from [Tatoeba.org](https://tatoeba.org) that cover 138 languages.
 
 * Benchmark for realistic low-resource scenarios
 * [Training](data/README.md), [development](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/dev.tar) and [test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar) 
 * [Baseline models](results/tatoeba-models-all.md) and [results](results/tatoeba-results-all.md) ([training procedures](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/doc/TatoebaChallenge.md))
 * [Ideal for multilingual models and transfer learning](results/tatoeba-results-langgroup.md)
+* New: [The OPUS-MT leaderboard](https://opus.nlpl.eu/dashboard/)
 * New: [The status of available NMT models on a map](https://opus.nlpl.eu/NMT-map/Tatoeba/all/src2trg/) (for release v2020-07-28)
 
-[![NMT map](images/NMT-map-small.png)](https://opus.nlpl.eu/NMT-map/Tatoeba/all/src2trg/)
+[![NMT map](images/NMT-map-small.png)](https://opus.nlpl.eu/NMT-map/Tatoeba-all/src2trg/)
 
 
 ## Tasks
@@ -22,14 +23,15 @@ This is a challenge set for machine translation that contains 33G translation un
 
 ## Downloads
 
-* [All test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar) ([individual files](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data/release/test))
-* [All development data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/dev.tar) ([individual files](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data/release/dev))
+* [All test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar) ([individual files](data/release/test))
+* [All development data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/dev.tar) ([individual files](data/release/dev))
 * [Bilingual training data](data/README-v2023-09-26.md), language-pair specific downloads
 * [Extra bilingual training data](data/subsets/NoTestData-v2023-09-26.md), language-pair specific downloads
 * [Monolingual data sets](data/MonolingualData.md), [with document boundaries](data/Wiki.md), [de-duplicated and shuffled](data/Wiki.md)
-* [Incrementally updated development and test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/devtest.tar), ([here for individual language pairs](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data/devtest))
+* [Incrementally updated development and test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/devtest.tar), ([here for individual language pairs](data/devtest))
 * [Release history](data/Releases.md)
 * NEW: [Automatically translated monolingual data](data/Backtranslations.md)
+* NEW: [Pre-trained sentence piece models](https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/tatoeba/SentencePieceModels.md)
 
 The latest release also includes some parallel data sets in the same language in order to test paraphrase models. Note, however, that the support for paraphrasing is really limited in our data sets.
 
@@ -60,7 +62,7 @@ Please, cite the following paper if you use data and models from this distributi
 
 ## Data releases
 
-The current release includes data for 4,083 language pairs covering 488 languages.
+The current release includes data for 4,024 language pairs covering 487 languages.
 The data sets are released per language pair with the following structure (using deu-eng as an example):
 
 ```
@@ -80,7 +82,7 @@ Files with the extension `.src` refer to sentences in the source language (`deu`
 
 Other notes about the compilation of the data sets can be found in [Development.md](doc/Development.md) and the complete list of language pairs is in [data/README.md](data/README.md).
 
-New releases are planned in the future and will be announced here. Development and test data will be updated regularly but the original test sets will stay in the release. Updates of the test data will be available through this [devtest release](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/devtest.tar) and will not include any examples available in development data. Those data sets are also available from this git repository in the sub directory [data/devtest/](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data/devtest).
+New releases are planned in the future and will be announced here. Development and test data will be updated regularly but the original test sets will stay in the release. Updates of the test data will be available through this [devtest release](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/devtest.tar) and will not include any examples available in development data. Those data sets are also available from this git repository in the sub directory [data/devtest/](data/devtest).
 
 
 ## The translation challenge
@@ -124,6 +126,15 @@ Challenge subset results (v2023-09-26):
 * results for the [higher resource language pairs](results/tatoeba-results-v2023-09-26-subset-higher.md)
 * results for the [highest resource language pairs](results/tatoeba-results-v2023-09-26-subset-highest.md)
 
+Challenge subset results (v2021-08-07):
+
+* results for the [zero-shot language pairs](results/tatoeba-results-v2021-08-07-subset-zero.md)
+* results for the [lowest resource language pairs](results/tatoeba-results-v2021-08-07-subset-lowest.md)
+* results for the [lower resource language pairs](results/tatoeba-results-v2021-08-07-subset-lower.md)
+* results for the [medium resource language pairs](results/tatoeba-results-v2021-08-07-subset-medium.md)
+* results for the [higher resource language pairs](results/tatoeba-results-v2021-08-07-subset-higher.md)
+* results for the [highest resource language pairs](results/tatoeba-results-v2021-08-07-subset-highest.md)
+
 Challenge subset results (v2020-07-28):
 
 * results for the [zero-shot language pairs](results/tatoeba-results-v2020-07-28-subset-zero.md)
@@ -133,7 +144,7 @@ Challenge subset results (v2020-07-28):
 * results for the [higher resource language pairs](results/tatoeba-results-v2020-07-28-subset-higher.md)
 * results for the [highest resource language pairs](results/tatoeba-results-v2020-07-28-subset-highest.md)
 
-We publish (reasonable) models to be re-used and deployed through [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT) and linked from the [model subdir in this github](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models). This includes multilingual models that cover several languages in source and target to enable transfer learning across languages.
+We publish (reasonable) models to be re-used and deployed through [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT) and linked from the [model subdir in this github](models). This includes multilingual models that cover several languages in source and target to enable transfer learning across languages.
 
 
 

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 # The Tatoeba Translation Challenge (v2023-09-26)
 
-This is a challenge set for machine translation that contains 33G translation units in 4,083 bitexts covering 488 languages. The package includes a release of 665 test sets derived from [Tatoeba.org](https://tatoeba.org) that cover 139 languages.
+This is a challenge set for machine translation that contains 32G translation units in 4,024 bitexts covering 487 languages. The package includes a release of 657 test sets derived from [Tatoeba.org](https://tatoeba.org) that cover 138 languages.
 
 * Benchmark for realistic low-resource scenarios
 * [Training](data/README.md), [development](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/dev.tar) and [test data](https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar) 
@@ -62,7 +62,7 @@ Please, cite the following paper if you use data and models from this distributi
 
 ## Data releases
 
-The current release includes data for 4,083 language pairs covering 488 languages.
+The current release includes data for 4,024 language pairs covering 487 languages.
 The data sets are released per language pair with the following structure (using deu-eng as an example):
 
 ```