diff --git a/Snakefile b/Snakefile
index 07999e855..40f25824d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -934,12 +934,18 @@ rule add_lang_tag_corpus_src_for_student:
     threads: workflow.cores
     input: expand(f"{train_student_dir}/corpus.{{lang}}.gz", langpair=langpairs, lang=['source', 'target'])
     output: f"{filtered}/{{langpair}}/corpus.source.langtagged.gz",f"{filtered}/{{langpair}}/corpus.target.gz"
-    params: prefix=f"{filtered}/{{langpair}}/corpus",
+    params: prefix=f"{train_student_dir}/corpus",
             trg_three_letter=lambda wildcards: Language.get(wildcards.langpair.split('-')[1]).to_alpha3(),
-            suffix="source"
-    shell: '''bash pipeline/clean/add-lang-tag.sh "{params.trg_three_letter}" "{params.prefix}" "{o2m_student}" "{params.suffix}" "" >> {log} 2>&1
-        if [ ! -f "{filtered}/{{langpair}}/corpus.target.gz" ]; then cp "{train_student_dir}/{{langpair}}/corpus.target.gz" "{filtered}/{{langpair}}/corpus.target.gz"; fi
-        '''
+            suffix="source",
+            train_dir_langpair=lambda wildcards: f"{train_student_dir}".replace("{langpair}", wildcards.langpair)
+    # add-lang-tag.sh takes the corpus prefix under train_student_dir and writes the language-tagged source
+    # side under the filtered folder (assumes train_student_dir sits below a merged/ folder - confirm);
+    # the target side is then copied there unchanged if it is not already present.
+    shell: '''
+        bash pipeline/clean/add-lang-tag.sh "{params.trg_three_letter}" "{params.prefix}" "{o2m_student}" "{params.suffix}" "" >> {log} 2>&1
+        if [ ! -f "{filtered}/{wildcards.langpair}/corpus.target.gz" ]; then
+            cp "{params.train_dir_langpair}/corpus.target.gz" "{filtered}/{wildcards.langpair}/corpus.target.gz"
+        fi
+        '''
 
 rule add_lang_tag_devset_for_student:
     message: "Adding language tag id for devset for student training"
diff --git a/pipeline/clean/add-lang-tag.sh b/pipeline/clean/add-lang-tag.sh
index 84cb793c6..8d9822196 100644
--- a/pipeline/clean/add-lang-tag.sh
+++ b/pipeline/clean/add-lang-tag.sh
@@ -10,6 +10,14 @@ echo "###### Adding language tag"
 
 target_lang_token=$1
 file=$2
+# When Cross-Entropy filtering is skipped, the tagged output has to be written to the
+# "filtered" folder instead of next to its input in the "merged" folder.
+# Only the last "/merged/" path component is rewritten, so a parent directory whose name
+# merely contains "merged" is left untouched; any other prefix is used unchanged.
+outfile="$file"
+if [[ "$file" == */merged/* ]]; then
+    outfile="${file%/merged/*}/filtered/${file##*/merged/}"
+fi
 o2m=$3
 suffix=$4
 model_dir=$5
@@ -26,13 +34,13 @@ if [ $o2m == "True" ]; then
     target_lang_token=">>${target_lang_token}<< "
     # Check if there is already a language tag token
     if zgrep -q "${target_lang_token}" $file.$lang.gz; then
-        ln -s $file.$lang.gz $file.$suffix.langtagged.gz
+        ln -s "$file.$lang.gz" "$outfile.$suffix.langtagged.gz"
         echo "The file already contains language tags, we create a dummy file"
     else
-        zcat $file.$lang.gz | sed "s/^/${target_lang_token}/" | pigz > $file.$suffix.langtagged.gz
+        zcat "$file.$lang.gz" | sed "s/^/${target_lang_token}/" | pigz > "$outfile.$suffix.langtagged.gz"
         echo "###### Done: Adding language tag"
     fi
 else
-    ln -s $file.$lang.gz $file.$suffix.langtagged.gz
+    ln -s "$file.$lang.gz" "$outfile.$suffix.langtagged.gz"
     echo "The model doesn't have multiple targets, so there is no need to add a language tag, we create a dummy file"
 fi
\ No newline at end of file