-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
274 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,7 @@ | |
## | ||
|
||
SHELL := bash | ||
CPU_MODULES := parallel | ||
|
||
## OPUS home directory and language code conversion tools | ||
## OPUSMT_HOMEDIR: local copy of Opus-MT-train project | ||
|
@@ -73,6 +74,11 @@ TATOEBA_VERSION = v2023-04-12 | |
# TATOEBA_VERSION = v20190709 | ||
|
||
|
||
## previous release (used for merging with new release) | ||
|
||
PREVIOUS_VERSION = v2021-08-07 | ||
|
||
|
||
## maximum size of dev and test sets | ||
## (currently MAX_TESTSIZE is not used in his makefile) | ||
|
||
|
@@ -120,7 +126,7 @@ GET_ISO_CODE = ${ISO639} -m | |
OPUSREAD_ARGS = | ||
|
||
THREADS ?= 4 | ||
SORT = sort -T ${TMPDIR} --parallel=${THREADS} | ||
SORT = sort -T ${TMPDIR} -S50M --parallel=${THREADS} | ||
SHUFFLE = ${shell which terashuf 2>/dev/null} | ||
ifeq (${SHUFFLE},) | ||
SHUFFLE = ${SORT} --random-sort | ||
|
@@ -129,6 +135,10 @@ GZIP := ${shell which pigz 2>/dev/null} | |
GZIP ?= gzip | ||
ZCAT := ${GZIP} -cd | ||
|
||
PARALLEL_ARGS := --pipe --keep-order -q -L10000 --max-procs 25% | ||
PARALLEL := ${shell if [ `which parallel 2>/dev/null | wc -l` -gt 0 ]; then echo 'parallel ${PARALLEL_ARGS}'; fi } | ||
|
||
|
||
## basic training data filtering pipeline | ||
|
||
BASIC_FILTERS = | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \ | ||
|
@@ -170,6 +180,10 @@ TESTRELEASEDIR = ${RELEASEHOME}/test/${VERSION} | |
DEVRELEASEDIR = ${RELEASEHOME}/dev/${VERSION} | ||
INFODIR = ${RELEASEDIR} | ||
|
||
|
||
PREVIOUS_RELEASEDIR = ${RELEASEHOME}/${PREVIOUS_VERSION} | ||
|
||
|
||
## additional data directories for | ||
## - incremental updates of dev/test data (DEVTESTDIR) | ||
## - latest test and dev data sets | ||
|
@@ -300,8 +314,12 @@ all: opus-langpairs3.txt | |
${MAKE} release-tag | ||
|
||
|
||
## TODO: there is some kind of memory-leak somewhere that causes jobs to crahs | ||
## some dataset seems to require a lot of memory (langid or shuffling?) | ||
|
||
release-job: | ||
${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit | ||
${MAKE} HPC_MEM=128g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit | ||
# ${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=3-00 release.submit | ||
# ${MAKE} HPC_MEM=32g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=14-00 HPC_QUEUE=longrun release.submit | ||
|
||
release: | ||
|
@@ -427,6 +445,46 @@ traindata: ${TRAIN_DATA} | |
testdata: ${TEST_DATA} | ||
devdata: ${DEV_DATA} | ||
|
||
|
||
|
||
## divide into several jobs for creating training data | ||
## each one for 500 language pairs | ||
## TODO: more generic sharding | ||
## TODO: separate big from smaller language pairs (difficult!) | ||
|
||
TRAIN_DATA_1_500 := $(wordlist 1,500,${TRAIN_DATA}) | ||
TRAIN_DATA_501_1000 := $(wordlist 501,1000,${TRAIN_DATA}) | ||
TRAIN_DATA_1001_1500 := $(wordlist 1001,1500,${TRAIN_DATA}) | ||
TRAIN_DATA_1501_2000 := $(wordlist 1501,2000,${TRAIN_DATA}) | ||
TRAIN_DATA_2001_2500 := $(wordlist 2001,2500,${TRAIN_DATA}) | ||
TRAIN_DATA_2501_3000 := $(wordlist 2501,3000,${TRAIN_DATA}) | ||
TRAIN_DATA_3001_3500 := $(wordlist 3001,3500,${TRAIN_DATA}) | ||
TRAIN_DATA_3501_end := $(wordlist 3501,$(words ${TRAIN_DATA}),${TRAIN_DATA}) | ||
|
||
TRAIN_DATA_LAST500 = $(wordlist $(shell echo $$(( $(words ${TRAIN_DATA}) - 500 ))),\ | ||
$(words ${TRAIN_DATA}),\ | ||
${TRAIN_DATA}) | ||
|
||
traindata-jobs: | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1_500.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_501_1000.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1001_1500.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_1501_2000.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_2001_2500.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_2501_3000.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_3001_3500.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=3-00 traindata_3501_end.submit | ||
|
||
traindata_1_500: ${TRAIN_DATA_1_500} | ||
traindata_501_1000: ${TRAIN_DATA_501_1000} | ||
traindata_1001_1500: ${TRAIN_DATA_1001_1500} | ||
traindata_1501_2000: ${TRAIN_DATA_1501_2000} | ||
traindata_2001_2500: ${TRAIN_DATA_2001_2500} | ||
traindata_2501_3000: ${TRAIN_DATA_2501_3000} | ||
traindata_3001_3500: ${TRAIN_DATA_3001_3500} | ||
traindata_3501_end: ${TRAIN_DATA_3501_end} | ||
|
||
|
||
## this is for regular updates of testdata with new Tatoeba releases in OPUS | ||
## call `make update-testdata` | ||
update update-testdata: ${UPDATED_TEST_DATA} | ||
|
@@ -765,22 +823,29 @@ FIXLANGIDS = | sed 's/ze_zh/zh/g;s/_Hani//g;s/-han[st]//g;s/zht/zh_TW/g;s/zhs/zh | |
|
||
## create training data by concatenating all data sets | ||
## using normalized language codes (macro-languages) | ||
## TODO: this assumes that we have plain text moses files for all corpora | ||
|
||
# test-create-train: ${RELEASEDIR}/afr-heb/train.id.gz | ||
test-create-train: ${RELEASEDIR}/nld-uzb/train.id.gz | ||
|
||
OPUS_API := https://opus.nlpl.eu/opusapi/ | ||
OPUS_API_MOSES := ${OPUS_API}?preprocessing=moses&version=latest | ||
UNICODE_CLEANUP := perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;s/\&\s*\#\s*160\s*\;/ /g;' | ||
|
||
${RELEASEDIR}/%/train.id.gz: | ||
@echo "make train data for ${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}" | ||
@rm -f $@.tmp1 $@.tmp2 | ||
@mkdir -p ${dir $@}train.d | ||
@( l=${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}; \ | ||
s=${firstword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \ | ||
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; | ||
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \ | ||
E=`${SCRIPTDIR}/find_opus_langs.pl $$s ${OPUS_LANGS}`; \ | ||
F=`${SCRIPTDIR}/find_opus_langs.pl $$t ${OPUS_LANGS}`; \ | ||
for e in $$E; do \ | ||
for f in $$F; do \ | ||
if [ $$e == $$f ]; then a=$${e}1;b=$${f}2; \ | ||
else a=$${e};b=$${f}; fi; \ | ||
for z in `wget -O - -q "https://opus.nlpl.eu/opusapi/?source=$$e&target=$$f&preprocessing=moses&version=latest" | sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' | grep 'url:' | cut -f2- -d:`; do \ | ||
for z in `wget -O - -q "${OPUS_API_MOSES}&source=$$e&target=$$f" | sed 's/^.*\[//;s/\].*$$//' | tr ',' "\n" | sed 's/"//g' | grep 'url:' | cut -f2- -d:`; do \ | ||
c=`echo "$$z" | cut -f4 -d/ | sed 's/^OPUS-//'`; \ | ||
v=`echo "$$z" | cut -f5 -d/`; \ | ||
if [ `echo '${EXCLUDE_CORPORA}' | tr ' ' "\n" | grep "$$c" | wc -l` -eq 0 ]; then \ | ||
|
@@ -792,8 +857,8 @@ ${RELEASEDIR}/%/train.id.gz: | |
${SCRIPTDIR}/bitext-match-lang.py -s $$e -t $$f > $@.tmp2; \ | ||
rm -f ${dir $@}train.d/*; \ | ||
if [ -e $@.tmp2 ]; then \ | ||
cut -f1 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$e -r -D > $@.tmp2srcid; \ | ||
cut -f2 $@.tmp2 ${FIXLANGIDS} | langscript -3 -l $$f -r -D > $@.tmp2trgid; \ | ||
cut -f1 $@.tmp2 | ${PARALLEL} langscript -3 -l $$e -r -D ${FIXLANGIDS} > $@.tmp2srcid; \ | ||
cut -f2 $@.tmp2 | ${PARALLEL} langscript -3 -l $$f -r -D ${FIXLANGIDS} > $@.tmp2trgid; \ | ||
paste $@.tmp2srcid $@.tmp2trgid $@.tmp2 | sed "s/^/$$c-$$v /" >> $@.tmp1; \ | ||
rm -f $@.tmp2 $@.tmp2srcid $@.tmp2trgid; \ | ||
fi \ | ||
|
@@ -804,17 +869,118 @@ ${RELEASEDIR}/%/train.id.gz: | |
done \ | ||
done \ | ||
) | ||
if [ -s $@.tmp1 ]; then \ | ||
${SHUFFLE} < $@.tmp1 |\ | ||
scripts/exclude-devtest.pl -a -l \ | ||
|
||
## merge with previous releases if PREVIOUS_VERSION is set | ||
|
||
ifdef PREVIOUS_VERSION | ||
|
||
-${MAKE} $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@) | ||
@if [ -e $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@) ]; then \ | ||
echo "merge with previous version!"; \ | ||
l=${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}; \ | ||
s=${firstword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \ | ||
t=${lastword ${subst -, ,${patsubst ${RELEASEDIR}/%/train.id.gz,%,$@}}}; \ | ||
paste <(gzip -cd $(patsubst ${RELEASEDIR}/%.id.gz,${PREVIOUS_RELEASEDIR}/%.src.gz,$@)) \ | ||
<(gzip -cd $(patsubst ${RELEASEDIR}/%.id.gz,${PREVIOUS_RELEASEDIR}/%.trg.gz,$@)) > [email protected]; \ | ||
${GZIP} -cd $(patsubst ${RELEASEDIR}/%,${PREVIOUS_RELEASEDIR}/%,$@) | cut -f1 > [email protected]; \ | ||
if [ -e [email protected] ]; then \ | ||
cut -f1 [email protected] | ${PARALLEL} langscript -3 -l $$s -r -D ${FIXLANGIDS} > [email protected]; \ | ||
cut -f2 [email protected] | ${PARALLEL} langscript -3 -l $$t -r -D ${FIXLANGIDS} > [email protected]; \ | ||
paste [email protected] [email protected] [email protected] [email protected] >> [email protected]; \ | ||
rm -f [email protected] [email protected] [email protected] [email protected]; \ | ||
fi \ | ||
fi | ||
|
||
endif | ||
|
||
## sort, de-duplicate and shuffle the collected data | ||
@if [ -s [email protected] ]; then \ | ||
cat [email protected] \ | ||
| ${UNICODE_CLEANUP} \ | ||
| scripts/exclude-identical.pl \ | ||
| ${SORT} -t ' ' -k4,5 -u | ${SHUFFLE} \ | ||
| scripts/exclude-devtest.pl -a -l \ | ||
${dir $@}test.src ${dir $@}test.trg \ | ||
${dir $@}dev.src ${dir $@}dev.trg > $@.tmp2; \ | ||
${dir $@}dev.src ${dir $@}dev.trg > [email protected] 2>$(@:.id.gz=.log); \ | ||
cut -f4 [email protected] | ${GZIP} -c > ${dir $@}train.src.gz; \ | ||
cut -f5 [email protected] | ${GZIP} -c > ${dir $@}train.trg.gz; \ | ||
cut -f1,2,3 [email protected] | ${GZIP} -c > $@; \ | ||
fi | ||
rm -f [email protected] [email protected] | ||
rmdir ${dir $@}train.d | ||
touch $(@:.id.gz=.corrected) | ||
|
||
|
||
|
||
|
||
## re-run the filter of excluding dev and test data | ||
## (in case this was not correctly done before, e.g. because the files were not there yet) | ||
|
||
# TO_BE_CORRECTED = $(patsubst %.id.gz,%.corrected,\ | ||
# $(wildcard ${RELEASEDIR}/ara-*/train.id.gz) $(wildcard ${RELEASEDIR}/afr-*/train.id.gz)) | ||
|
||
TO_BE_CORRECTED = $(patsubst %.id.gz,%.corrected,$(wildcard ${RELEASEDIR}/*/train.id.gz)) | ||
|
||
%-missing-traindata: | ||
${MAKE} ${RELEASEDIR}/$(@:-missing-traindata=)/train.id.gz | ||
|
||
correct-files: | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=1-00 THREADS=8 fra-ita-missing-traindata.submit | ||
${MAKE} HPC_MEM=64g HPC_CORES=16 HPC_DISK=1000 HPC_TIME=1-12 THREADS=8 correct-train-files.submit | ||
${MAKE} HPC_MEM=32g HPC_CORES=8 HPC_DISK=500 HPC_TIME=1-00 THREADS=8 fra-rus-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 deu-est-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 eng-est-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-fin-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-fra-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-rus-missing-traindata.submit | ||
${MAKE} HPC_MEM=16g HPC_CORES=4 HPC_DISK=500 HPC_TIME=1-00 THREADS=4 est-swe-missing-traindata.submit | ||
|
||
correct-train-files: ${TO_BE_CORRECTED} | ||
|
||
${RELEASEDIR}/%/train.corrected: | ||
paste <(gzip -cd ${dir $@}train.id.gz) \ | ||
<(gzip -cd ${dir $@}train.src.gz) \ | ||
<(gzip -cd ${dir $@}train.trg.gz) \ | ||
| ${UNICODE_CLEANUP} \ | ||
| scripts/exclude-identical.pl \ | ||
| ${SORT} -t ' ' -k4,5 -u | ${SHUFFLE} \ | ||
| scripts/exclude-devtest.pl -a -l \ | ||
${dir $@}test.src ${dir $@}test.trg \ | ||
${dir $@}dev.src ${dir $@}dev.trg > $@.tmp2 | ||
cut -f4 $@.tmp2 | ${GZIP} -c > $@.src.gz | ||
cut -f5 $@.tmp2 | ${GZIP} -c > $@.trg.gz | ||
cut -f1,2,3 $@.tmp2 | ${GZIP} -c > $@.id.gz | ||
rm -f $@.tmp2 | ||
if [ `zcat $@.id.gz | wc -l` -eq `zcat ${dir $@}train.id.gz | wc -l` ]; then \ | ||
echo "no change! delete corrected files!"; \ | ||
rm -f $@.src.gz $@.trg.gz $@.id.gz; \ | ||
else \ | ||
mv ${dir $@}train.id.gz ${dir $@}train.backup.id.gz; \ | ||
mv ${dir $@}train.src.gz ${dir $@}train.backup.src.gz; \ | ||
mv ${dir $@}train.trg.gz ${dir $@}train.backup.trg.gz; \ | ||
mv $@.id.gz ${dir $@}train.id.gz; \ | ||
mv $@.src.gz ${dir $@}train.src.gz; \ | ||
mv $@.trg.gz ${dir $@}train.trg.gz; \ | ||
fi | ||
touch $@ | ||
|
||
|
||
|
||
|
||
## download the previous release | ||
|
||
ifdef PREVIOUS_VERSION | ||
|
||
PREVIOUS_DOWNLOAD_BASE_URL := https://object.pouta.csc.fi/Tatoeba-Challenge-${PREVIOUS_VERSION} | ||
|
||
${PREVIOUS_RELEASEDIR}/%/train.id.gz: | ||
@wget -q $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,${PREVIOUS_DOWNLOAD_BASE_URL}/%.tar,$@) | ||
@if [ -e $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@) ]; then \ | ||
tar -xf $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@); \ | ||
rm -f $(patsubst ${PREVIOUS_RELEASEDIR}/%/train.id.gz,%.tar,$@); \ | ||
fi | ||
|
||
endif | ||
|
||
|
||
|
||
|
@@ -1904,7 +2070,9 @@ ifdef EMAIL | |
endif | ||
ifeq (${shell hostname --domain 2>/dev/null},bullx) | ||
echo '#SBATCH --account=${CSCPROJECT}' >> $@ | ||
ifneq (${HPC_DISK},) | ||
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@ | ||
endif | ||
endif | ||
echo '#SBATCH -n ${HPC_CORES}' >> $@ | ||
echo '#SBATCH -N ${HPC_NODES}' >> $@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/usr/bin/env perl | ||
# | ||
|
||
use strict; | ||
use open qw/:std :utf8/; | ||
use Getopt::Long; | ||
|
||
my $AlphaOnly = 0; | ||
my $LowerCase = 0; | ||
my $verbose = 0; | ||
|
||
GetOptions( | ||
"alpha|a" => \$AlphaOnly, | ||
"lower-case|l" => \$LowerCase, | ||
"verbose|v" => \$verbose ); | ||
|
||
|
||
my $count = 0; | ||
my $skipped = 0; | ||
my $kept = 0; | ||
while (<>){ | ||
$count++; | ||
chomp; | ||
my ($corpus,$lang1,$lang2,$src,$trg) = split(/\t/); | ||
|
||
$src=~s/\P{IsAlpha}//gs if ($AlphaOnly); | ||
$src = lc($src) if ($LowerCase); | ||
$trg=~s/\P{IsAlpha}//gs if ($AlphaOnly); | ||
$trg = lc($trg) if ($LowerCase); | ||
|
||
if ($src eq $trg){ | ||
$skipped++; | ||
} | ||
else{ | ||
$kept++; | ||
print "$_\n"; | ||
} | ||
|
||
print STDERR '.' if (! ($count % 10000)); | ||
print STDERR " $count ($skipped)\n" if (! ($count % 500000)); | ||
} | ||
|
||
|
||
if ($count){ | ||
printf STDERR "\n# kept pairs of non-identical sentences %d/%d (%5.2f\%)\n",$kept,$count,$kept/$count*100; | ||
printf STDERR "# skipped pairs of identical sentences %d/%d (%5.2f\%)\n",$skipped,$count,$skipped/$count*100; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/usr/bin/env perl | ||
# | ||
|
||
use strict; | ||
use open qw/:std :utf8/; | ||
use Getopt::Long; | ||
|
||
my $AlphaOnly = 0; | ||
my $LowerCase = 0; | ||
my $verbose = 0; | ||
|
||
GetOptions( | ||
"alpha|a" => \$AlphaOnly, | ||
"lower-case|l" => \$LowerCase, | ||
"verbose|v" => \$verbose ); | ||
|
||
|
||
my $count = 0; | ||
my $skipped = 0; | ||
my $kept = 0; | ||
while (<>){ | ||
$count++; | ||
chomp; | ||
my ($corpus,$lang1,$lang2,$src,$trg) = split(/\t/); | ||
|
||
$src=~s/\P{IsAlpha}//gs if ($AlphaOnly); | ||
$src = lc($src) if ($LowerCase); | ||
$trg=~s/\P{IsAlpha}//gs if ($AlphaOnly); | ||
$trg = lc($trg) if ($LowerCase); | ||
|
||
if ($src eq $trg){ | ||
$kept++; | ||
print "$_\n"; | ||
} | ||
else{ | ||
$skipped++; | ||
} | ||
|
||
print STDERR '.' if (! ($count % 10000)); | ||
print STDERR " $count ($kept)\n" if (! ($count % 500000)); | ||
} | ||
|
||
|
||
if ($count){ | ||
printf STDERR "\n# kept %d/%d (%5.2f\%)\n",$kept,$count,$kept/$count*100; | ||
printf STDERR "# skipped %d/%d (%5.2f\%)\n",$skipped,$count,$skipped/$count*100; | ||
} | ||
|