Skip to content

Commit

Permalink
fixed paths for add_lang_tag_corpus_src_for_student
Browse files Browse the repository at this point in the history
  • Loading branch information
onadegibert committed Sep 23, 2024
1 parent 4b9ef99 commit 023a2b8
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
14 changes: 9 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -934,12 +934,16 @@ rule add_lang_tag_corpus_src_for_student:
threads: workflow.cores
input: expand(f"{train_student_dir}/corpus.{{lang}}.gz", langpair=langpairs, lang=['source', 'target'])
output: f"{filtered}/{{langpair}}/corpus.source.langtagged.gz",f"{filtered}/{{langpair}}/corpus.target.gz"
params: prefix=f"{filtered}/{{langpair}}/corpus",
params: prefix=f"{train_student_dir}/corpus",
trg_three_letter=lambda wildcards: Language.get(wildcards.langpair.split('-')[1]).to_alpha3(),
suffix="source"
shell: '''bash pipeline/clean/add-lang-tag.sh "{params.trg_three_letter}" "{params.prefix}" "{o2m_student}" "{params.suffix}" "" >> {log} 2>&1
if [ ! -f "{filtered}/{{langpair}}/corpus.target.gz" ]; then cp "{train_student_dir}/{{langpair}}/corpus.target.gz" "{filtered}/{{langpair}}/corpus.target.gz"; fi
'''
suffix="source",
train_dir_langpair=lambda wildcards: f"{train_student_dir}".replace("{langpair}", wildcards.langpair)
shell: '''
bash pipeline/clean/add-lang-tag.sh "{params.trg_three_letter}" "{params.prefix}" "{o2m_student}" "{params.suffix}" "" >> {log} 2>&1
if [ ! -f "{filtered}/{wildcards.langpair}/corpus.target.gz" ]; then
cp "{params.train_dir_langpair}/corpus.target.gz" "{filtered}/{wildcards.langpair}/corpus.target.gz";
fi
'''

rule add_lang_tag_devset_for_student:
message: "Adding language tag id for devset for student training"
Expand Down
8 changes: 5 additions & 3 deletions pipeline/clean/add-lang-tag.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ echo "###### Adding language tag"

target_lang_token=$1
file=$2
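# When Cross-Entropy filtering is skipped, the input prefix points into the merged folder,
# but the tagged output must be written to the filtered one: derive the output prefix by
# replacing the first occurrence of "merged" in the path with "filtered" (paths without
# "merged" are left unchanged, so outfile == file in that case).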
outfile="${file/merged/filtered}"
o2m=$3
suffix=$4
model_dir=$5
Expand All @@ -26,13 +28,13 @@ if [ $o2m == "True" ]; then
target_lang_token=">>${target_lang_token}<< "
# Check if there is already a language tag token
if zgrep -q "${target_lang_token}" $file.$lang.gz; then
ln -s $file.$lang.gz $file.$suffix.langtagged.gz
ln -s $file.$lang.gz $outfile.$suffix.langtagged.gz
echo "The file already contains language tags, we create a dummy file"
else
zcat $file.$lang.gz | sed "s/^/${target_lang_token}/" | pigz > $file.$suffix.langtagged.gz
zcat $file.$lang.gz | sed "s/^/${target_lang_token}/" | pigz > $outfile.$suffix.langtagged.gz
echo "###### Done: Adding language tag"
fi
else
ln -s $file.$lang.gz $file.$suffix.langtagged.gz
ln -s $file.$lang.gz $outfile.$suffix.langtagged.gz
echo "The model doesn't have multiple targets, so there is no need to add a language tag, we create a dummy file"
fi

0 comments on commit 023a2b8

Please sign in to comment.