Skip to content

Commit

Permalink
Now OPUS dataset versioning is marked with `__` instead of `/` (e.g. `opus_ELRC_2922__v1`)
Browse files Browse the repository at this point in the history
  • Loading branch information
onadegibert committed Sep 2, 2024
1 parent 8732d19 commit e5595ba
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion configs/config.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ marian-args:

datasets:
train:
- opus_ELRC_2922/v1
- opus_ELRC_2922__v1
devtest:
- flores_dev
test:
Expand Down
12 changes: 6 additions & 6 deletions pipeline/data/importers/corpus/opus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,25 @@ dataset=$4
COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}"
ARTIFACT_EXT="${ARTIFACT_EXT:-gz}"

name=${dataset%%/*}
name=${dataset%%__*}
name_and_version="${dataset//[^A-Za-z0-9_- ]/_}"
version=${dataset##*/}
version=${dataset##*__}

tmp="$(dirname "${output_prefix}")/opus/${name_and_version}"
mkdir -p "${tmp}"

archive_path="${tmp}/${name}.txt.zip"

wget -q "https://object.pouta.csc.fi/OPUS-${dataset}/${version}/moses/${src}-${trg}.txt.zip"
wget -q "https://object.pouta.csc.fi/OPUS-${name}/${version}/moses/${src}-${trg}.txt.zip"
wget_output_1=$?

wget -q "https://object.pouta.csc.fi/OPUS-${dataset}/${version}/moses/${trg}-${src}.txt.zip"
wget -q "https://object.pouta.csc.fi/OPUS-${name}/${version}/moses/${trg}-${src}.txt.zip"
wget_output_2=$?

# Attempt to download the file using the first URL
if [ $wget_output_1 -eq 0 ] || [ $wget_output_2 -eq 0 ]; then
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${src}-${trg}.txt.zip" ||
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${trg}-${src}.txt.zip"
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${name}/${version}/moses/${src}-${trg}.txt.zip" ||
wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${name}/${version}/moses/${trg}-${src}.txt.zip"

unzip -o "${archive_path}" -d "${tmp}"

Expand Down

0 comments on commit e5595ba

Please sign in to comment.