
Commit

preparing new version of 2023
jorgtied committed Oct 2, 2023
1 parent 55a62fe commit ae3b62b
Showing 4 changed files with 274 additions and 11 deletions.
188 changes: 178 additions & 10 deletions Makefile
@@ -52,6 +52,7 @@
##

SHELL := bash
CPU_MODULES := parallel

## OPUS home directory and language code conversion tools
## OPUSMT_HOMEDIR: local copy of Opus-MT-train project
@@ -73,6 +74,11 @@ TATOEBA_VERSION = v2023-04-12
# TATOEBA_VERSION = v20190709


## previous release (used for merging with the new release)

PREVIOUS_VERSION = v2021-08-07


## maximum size of dev and test sets
## (currently MAX_TESTSIZE is not used in this makefile)

@@ -120,7 +126,7 @@ GET_ISO_CODE = ${ISO639} -m
OPUSREAD_ARGS =

THREADS ?= 4
SORT = sort -T ${TMPDIR} --parallel=${THREADS}
SORT = sort -T ${TMPDIR} -S50M --parallel=${THREADS}
SHUFFLE = ${shell which terashuf 2>/dev/null}
ifeq (${SHUFFLE},)
SHUFFLE = ${SORT} --random-sort
@@ -129,6 +135,10 @@ GZIP := ${shell which pigz 2>/dev/null}
GZIP ?= gzip
ZCAT := ${GZIP} -cd

PARALLEL_ARGS := --pipe --keep-order -q -L10000 --max-procs 25%
PARALLEL := ${shell if [ `which parallel 2>/dev/null | wc -l` -gt 0 ]; then echo 'parallel ${PARALLEL_ARGS}'; fi }
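
## Hedged sketch (not part of this commit): ${PARALLEL} wraps GNU parallel in
## --pipe mode, which feeds chunks of whole input lines to separate processes
## and (--keep-order) prints the results in input order; when parallel is not
## installed, ${PARALLEL} expands to the empty string and the same pipeline
## simply runs the filter as a single process. The target name and input file
## below are illustrative only.

langid-demo:
	${ZCAT} example.src.gz | ${PARALLEL} langscript -3 -l eng -r -D > example.src.langid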


## basic training data filtering pipeline

BASIC_FILTERS = | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
@@ -170,6 +180,10 @@ TESTRELEASEDIR = ${RELEASEHOME}/test/${VERSION}
DEVRELEASEDIR = ${RELEASEHOME}/dev/${VERSION}
INFODIR = ${RELEASEDIR}


PREVIOUS_RELEASEDIR = ${RELEASEHOME}/${PREVIOUS_VERSION}


## additional data directories for
## - incremental updates of dev/test data (DEVTESTDIR)
## - latest test and dev data sets
@@ -300,8 +314,12 @@ all: opus-langpairs3.txt
${MAKE} release-tag


## TODO: there is some kind of memory leak somewhere that causes jobs to crash
## some datasets seem to require a lot of memory (langid or shuffling?)

release-job:
${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit
${MAKE} HPC_MEM=128g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit
# ${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit
# ${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=14-00 HPC_QUEUE=longrun release.submit

release:
@@ -427,6 +445,46 @@ traindata: ${TRAIN_DATA}
testdata: ${TEST_DATA}
devdata: ${DEV_DATA}



## divide the creation of training data into several jobs,
## each covering 500 language pairs
## TODO: more generic sharding (a hedged sketch follows the shard targets below)
## TODO: separate big from small language pairs (difficult!)

TRAIN_DATA_1_500 := $(wordlist 1,500,${TRAIN_DATA})
TRAIN_DATA_501_1000 := $(wordlist 501,1000,${TRAIN_DATA})
TRAIN_DATA_1001_1500 := $(wordlist 1001,1500,${TRAIN_DATA})
TRAIN_DATA_1501_2000 := $(wordlist 1501,2000,${TRAIN_DATA})
TRAIN_DATA_2001_2500 := $(wordlist 2001,2500,${TRAIN_DATA})
TRAIN_DATA_2501_3000 := $(wordlist 2501,3000,${TRAIN_DATA})
TRAIN_DATA_3001_3500 := $(wordlist 3001,3500,${TRAIN_DATA})
TRAIN_DATA_3501_end := $(wordlist 3501,$(words ${TRAIN_DATA}),${TRAIN_DATA})

TRAIN_DATA_LAST500 = $(wordlist $(shell echo $$(( $(words ${TRAIN_DATA}) - 500 ))),\
$(words ${TRAIN_DATA}),\
${TRAIN_DATA})

traindata-jobs:
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1_500.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_501_1000.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1001_1500.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1501_2000.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_2001_2500.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_2501_3000.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_3001_3500.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_3501_end.submit

traindata_1_500: ${TRAIN_DATA_1_500}
traindata_501_1000: ${TRAIN_DATA_501_1000}
traindata_1001_1500: ${TRAIN_DATA_1001_1500}
traindata_1501_2000: ${TRAIN_DATA_1501_2000}
traindata_2001_2500: ${TRAIN_DATA_2001_2500}
traindata_2501_3000: ${TRAIN_DATA_2501_3000}
traindata_3001_3500: ${TRAIN_DATA_3001_3500}
traindata_3501_end: ${TRAIN_DATA_3501_end}
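
## A hedged sketch of the more generic sharding mentioned in the TODO above
## (not part of this commit; SHARD_SIZE, NR_SHARDS and the traindata-shard<N>
## targets are hypothetical names): $(call shard-rule,N) expands to a rule
## "traindata-shardN: <up to SHARD_SIZE prerequisites>" and $(eval) installs it.

SHARD_SIZE := 500
NR_SHARDS  := $(shell echo $$(( ($(words ${TRAIN_DATA}) + ${SHARD_SIZE} - 1) / ${SHARD_SIZE} )))

define shard-rule
traindata-shard$(1): $(wordlist $(shell echo $$(( ($(1) - 1) * ${SHARD_SIZE} + 1 ))),$(shell echo $$(( $(1) * ${SHARD_SIZE} ))),${TRAIN_DATA})
endef

$(foreach i,$(shell seq 1 ${NR_SHARDS}),$(eval $(call shard-rule,${i})))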


## this is for regular updates of testdata with new Tatoeba releases in OPUS
## call `make update-testdata`
update update-testdata: ${UPDATED_TEST_DATA}
@@ -765,22 +823,29 @@ FIXLANGIDS = | sed 's/ze_zh/zh/g;s/_Hani//g;s/-han[st]//g;s/zht/zh_TW/g;s/zhs/zh

## create training data by concatenating all data sets
## using normalized language codes (macro-languages)
## TODO: this assumes that we have plain text moses files for all corpora

# test-create-train: ${RELEASEDIR}/afr-heb/train.id.gz
test-create-train: ${RELEASEDIR}/nld-uzb/train.id.gz

OPUS_API := https://opus.nlpl.eu/opusapi/
OPUS_API_MOSES := ${OPUS_API}?preprocessing=moses&version=latest
UNICODE_CLEANUP := perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;s/\&\s*\#\s*160\s*\;/ /g;'
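
## UNICODE_CLEANUP keeps only code points that are valid in XML 1.0 and turns
## "&#160;" (non-breaking space) entities into plain spaces. OPUS_API_MOSES
## queries the OPUS API for the latest moses-style releases; a hedged sketch of
## the query used in the train.id.gz recipe below (eng-fin and the target name
## are illustrative only):

list-moses-urls-demo:
	wget -O - -q "${OPUS_API_MOSES}&source=eng&target=fin" \
	| sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' \
	| grep 'url:' | cut -f2- -d: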

${RELEASEDIR}/%/train.id.gz:
@echo "make train data for ${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}"
@rm -f $@.tmp1 $@.tmp2
@mkdir -p ${dir $@}train.d
@( l=${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}; \
s=${firstword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}};
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \
E=`${SCRIPTDIR}/find_opus_langs.pl $$s ${OPUS_LANGS}`; \
F=`${SCRIPTDIR}/find_opus_langs.pl $$t ${OPUS_LANGS}`; \
for e in $$E; do \
for f in $$F; do \
if [ $$e == $$f ]; then a=$${e}1;b=$${f}2; \
else a=$${e};b=$${f}; fi; \
for z in `wget -O - -q "https://opus.nlpl.eu/opusapi/?source=$$e&target=$$f&preprocessing=moses&version=latest" | sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' | grep 'url:' | cut -f2- -d:`; do \
for z in `wget -O - -q "${OPUS_API_MOSES}&source=$$e&target=$$f" | sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' | grep 'url:' | cut -f2- -d:`; do \
c=`echo "$$z" | cut -f4 -d/ | sed 's/^OPUS-//'`; \
v=`echo "$$z" | cut -f5 -d/`; \
if [ `echo '${EXCLUDE_CORPORA}' | tr ' ' "\n" | grep "$$c" | wc -l` -eq 0 ]; then \
@@ -792,8 +857,8 @@ ${RELEASEDIR}/%/train.id.gz:
${SCRIPTDIR}/bitext-match-lang.py -s $$e -t $$f > $@.tmp2; \
rm -f ${dir $@}train.d/*; \
if [ -e $@.tmp2 ]; then \
cut -f1 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$e -r -D > $@.tmp2srcid; \
cut -f2 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$f -r -D > $@.tmp2trgid; \
cut -f1 $@.tmp2 | ${PARALLEL} langscript -3 -l $$e -r -D ${FIXLANGIDS} > $@.tmp2srcid; \
cut -f2 $@.tmp2 | ${PARALLEL} langscript -3 -l $$f -r -D ${FIXLANGIDS} > $@.tmp2trgid; \
paste $@.tmp2srcid $@.tmp2trgid $@.tmp2 | sed "s/^/$$c-$$v /" >> $@.tmp1; \
rm -f $@.tmp2 $@.tmp2srcid $@.tmp2trgid; \
fi \
@@ -804,17 +869,118 @@ ${RELEASEDIR}/%/train.id.gz:
done \
done \
)
if [ -s $@.tmp1 ]; then \
${SHUFFLE} < $@.tmp1 |\
scripts/exclude-devtest.pl -a -l \

## merge with previous releases if PREVIOUS_VERSION is set

ifdef PREVIOUS_VERSION

-${MAKE} $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@)
@if [ -e $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@) ]; then \
echo "merge with previous version!"; \
l=${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}; \
s=${firstword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \
paste <(gzip -cd $(patsubst ${RELEASEDIR}/%.id.gz,${PREVIOUS_RELEASEDIR}/%.src.gz,$@)) \
<(gzip -cd $(patsubst ${RELEASEDIR}/%.id.gz,${PREVIOUS_RELEASEDIR}/%.trg.gz,$@)) > [email protected]; \
${GZIP} -cd $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@) | cut -f1 > [email protected]; \
if [ -e [email protected] ]; then \
cut -f1 [email protected] | ${PARALLEL} langscript -3 -l $$s -r -D ${FIXLANGIDS} > [email protected]; \
cut -f2 [email protected] | ${PARALLEL} langscript -3 -l $$t -r -D ${FIXLANGIDS} > [email protected]; \
paste [email protected] [email protected] [email protected] [email protected] >> [email protected]; \
rm -f [email protected] [email protected] [email protected] [email protected]; \
fi \
fi

endif

## sort, de-duplicate and shuffle the collected data
@if [ -s [email protected] ]; then \
cat [email protected] \
| ${UNICODE_CLEANUP} \
| scripts/exclude-identical.pl \
| ${SORT} -t ' ' -k4,5 -u | ${SHUFFLE} \
| scripts/exclude-devtest.pl -a -l \
${dir $@}test.src ${dir $@}test.trg \
${dir $@}dev.src ${dir $@}dev.trg > $@.tmp2; \
${dir $@}dev.src ${dir $@}dev.trg > [email protected] 2>$(@:.id.gz=.log); \
cut -f4 [email protected] | ${GZIP} -c > ${dir $@}train.src.gz; \
cut -f5 [email protected] | ${GZIP} -c > ${dir $@}train.trg.gz; \
cut -f1,2,3 [email protected] | ${GZIP} -c > $@; \
fi
rm -f [email protected] [email protected]
rmdir ${dir $@}train.d
touch $(@:.id.gz=.corrected)
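
## Hedged usage sketch (deu-eng is illustrative): building one pair produces
## train.src.gz, train.trg.gz and train.id.gz next to each other (plus a
## train.log written by exclude-devtest.pl), merging in the previous release
## when PREVIOUS_VERSION is set:
##
##   make ${RELEASEDIR}/deu-eng/train.id.gz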




## re-run the filter that excludes dev and test data
## (in case this was not done correctly before, e.g. because the dev/test files did not exist yet)

# TO_BE_CORRECTED = $(patsubst %.id.gz,%.corrected,\
# $(wildcard ${RELEASEDIR}/ara-*/train.id.gz) $(wildcard ${RELEASEDIR}/afr-*/train.id.gz))

TO_BE_CORRECTED = $(patsubst %.id.gz,%.corrected,$(wildcard ${RELEASEDIR}/*/train.id.gz))

%-missing-traindata:
${MAKE} ${RELEASEDIR}/$(@:-missing-traindata=)/train.id.gz

correct-files:
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=1-00 THREADS=8 fra-ita-missing-traindata.submit
${MAKE} HPC_MEM=64g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=1-12 THREADS=8 correct-train-files.submit
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=1-00 THREADS=8 fra-rus-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 deu-est-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 eng-est-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-fin-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-fra-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-rus-missing-traindata.submit
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-swe-missing-traindata.submit

correct-train-files: ${TO_BE_CORRECTED}

${RELEASEDIR}/%/train.corrected:
paste <(gzip -cd ${dir $@}train.id.gz) \
<(gzip -cd ${dir $@}train.src.gz) \
<(gzip -cd ${dir $@}train.trg.gz) \
| ${UNICODE_CLEANUP} \
| scripts/exclude-identical.pl \
| ${SORT} -t ' ' -k4,5 -u | ${SHUFFLE} \
| scripts/exclude-devtest.pl -a -l \
${dir $@}test.src ${dir $@}test.trg \
${dir $@}dev.src ${dir $@}dev.trg > $@.tmp2
cut -f4 $@.tmp2 | ${GZIP} -c > $@.src.gz
cut -f5 $@.tmp2 | ${GZIP} -c > $@.trg.gz
cut -f1,2,3 $@.tmp2 | ${GZIP} -c > $@.id.gz
rm -f $@.tmp2
if [ `zcat $@.id.gz | wc -l` -eq `zcat ${dir $@}train.id.gz | wc -l` ]; then \
echo "no change! delete corrected files!"; \
rm -f $@.src.gz $@.trg.gz $@.id.gz; \
else \
mv ${dir $@}train.id.gz ${dir $@}train.backup.id.gz; \
mv ${dir $@}train.src.gz ${dir $@}train.backup.src.gz; \
mv ${dir $@}train.trg.gz ${dir $@}train.backup.trg.gz; \
mv $@.id.gz ${dir $@}train.id.gz; \
mv $@.src.gz ${dir $@}train.src.gz; \
mv $@.trg.gz ${dir $@}train.trg.gz; \
fi
touch $@




## download the previous release

ifdef PREVIOUS_VERSION

PREVIOUS_DOWNLOAD_BASE_URL := https://object.pouta.csc.fi/Tatoeba-Challenge-${PREVIOUS_VERSION}

${PREVIOUS_RELEASEDIR}/%/train.id.gz:
@wget -q $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,${PREVIOUS_DOWNLOAD_BASE_URL}/%.tar,$@)
@if [ -e $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@) ]; then \
tar -xf $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@); \
rm -f $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@); \
fi

endif
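
## Hedged usage sketch (deu-eng is illustrative): the pattern rule above would
## fetch https://object.pouta.csc.fi/Tatoeba-Challenge-v2021-08-07/deu-eng.tar
## and unpack it, e.g. triggered via
##
##   make ${PREVIOUS_RELEASEDIR}/deu-eng/train.id.gz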



@@ -1904,7 +2070,9 @@ ifdef EMAIL
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
ifneq (${HPC_DISK},)
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
endif
endif
echo '#SBATCH -n ${HPC_CORES}' >> $@
echo '#SBATCH -N ${HPC_NODES}' >> $@
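
## Hedged sketch (values illustrative) of the generated batch-script header
## after this change, i.e. --gres=nvme is only added when HPC_DISK is set:
##
##   #SBATCH --account=project_123456
##   #SBATCH --gres=nvme:1000
##   #SBATCH -n 16
##   #SBATCH -N 1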
1 change: 0 additions & 1 deletion scripts/exclude-devtest.pl
@@ -90,7 +90,6 @@
printf STDERR "# skipped %d/%d (%5.2f\%) training examples from $langpair\n",$skipped,$count,$skipped/$count*100;
}


sub make_key{
my $key = $_[0].$_[1];
$key=~s/\P{IsAlpha}//gs if ($AlphaOnly);
48 changes: 48 additions & 0 deletions scripts/exclude-identical.pl
@@ -0,0 +1,48 @@
#!/usr/bin/env perl
#
# exclude-identical.pl: read tab-separated lines (corpus, srclang, trglang,
# src, trg) and print only those where the source and target sentences differ;
# -a/--alpha compares alphabetic characters only, -l/--lower-case lower-cases
# before comparing.
#

use strict;
use open qw/:std :utf8/;
use Getopt::Long;

my $AlphaOnly = 0;
my $LowerCase = 0;
my $verbose = 0;

GetOptions(
"alpha|a" => \$AlphaOnly,
"lower-case|l" => \$LowerCase,
"verbose|v" => \$verbose );


my $count = 0;
my $skipped = 0;
my $kept = 0;
while (<>){
$count++;
chomp;
my ($corpus,$lang1,$lang2,$src,$trg) = split(/\t/);

$src=~s/\P{IsAlpha}//gs if ($AlphaOnly);
$src = lc($src) if ($LowerCase);
$trg=~s/\P{IsAlpha}//gs if ($AlphaOnly);
$trg = lc($trg) if ($LowerCase);

if ($src eq $trg){
$skipped++;
}
else{
$kept++;
print "$_\n";
}

print STDERR '.' if (! ($count % 10000));
print STDERR " $count ($skipped)\n" if (! ($count % 500000));
}


if ($count){
printf STDERR "\n# kept pairs of non-identical sentences %d/%d (%5.2f\%)\n",$kept,$count,$kept/$count*100;
printf STDERR "# skipped pairs of identical sentences %d/%d (%5.2f\%)\n",$skipped,$count,$skipped/$count*100;
}
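
# A hedged usage sketch (corpus name and sentences illustrative); only the
# second pair below survives the filter, and the companion extract-identical.pl
# added next keeps exactly the pairs this script drops:
#
#   printf 'x\teng\tdeu\tHello\tHello\nx\teng\tdeu\tHello\tHallo\n' \
#     | scripts/exclude-identical.pl -a -l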

48 changes: 48 additions & 0 deletions scripts/extract-identical.pl
@@ -0,0 +1,48 @@
#!/usr/bin/env perl
#
# extract-identical.pl: the inverse of exclude-identical.pl; print only the
# sentence pairs whose source and target are identical (after the optional
# -a/--alpha and -l/--lower-case normalisation).
#

use strict;
use open qw/:std :utf8/;
use Getopt::Long;

my $AlphaOnly = 0;
my $LowerCase = 0;
my $verbose = 0;

GetOptions(
"alpha|a" => \$AlphaOnly,
"lower-case|l" => \$LowerCase,
"verbose|v" => \$verbose );


my $count = 0;
my $skipped = 0;
my $kept = 0;
while (<>){
$count++;
chomp;
my ($corpus,$lang1,$lang2,$src,$trg) = split(/\t/);

$src=~s/\P{IsAlpha}//gs if ($AlphaOnly);
$src = lc($src) if ($LowerCase);
$trg=~s/\P{IsAlpha}//gs if ($AlphaOnly);
$trg = lc($trg) if ($LowerCase);

if ($src eq $trg){
$kept++;
print "$_\n";
}
else{
$skipped++;
}

print STDERR '.' if (! ($count % 10000));
print STDERR " $count ($kept)\n" if (! ($count % 500000));
}


if ($count){
printf STDERR "\n# kept %d/%d (%5.2f\%)\n",$kept,$count,$kept/$count*100;
printf STDERR "# skipped %d/%d (%5.2f\%)\n",$skipped,$count,$skipped/$count*100;
}
