Skip to content

Commit

Permalink
more nodalida work with rat and domain evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
Tommi Nieminen committed Oct 8, 2024
1 parent 975aa0e commit 8f403db
Show file tree
Hide file tree
Showing 18 changed files with 511 additions and 53 deletions.
5 changes: 5 additions & 0 deletions configs/rat/config.eng-fin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ marian-args:
# these configs override pipeline/train/configs
finetune-teacher-with-terms:
after: 1e
training-teacher:
after: 120e
early-stopping: 40
no-restore-corpus: ""
valid-reset-stalled: ""


datasets:
Expand Down
15 changes: 11 additions & 4 deletions data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ checkpoint extract_tc_scored:
subcorpora=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subcorpora")
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"

output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"
shell:
'''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --source_lang {wildcards.src} --target_lang {wildcards.trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} >> {log} 2>&1 && \
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''
Expand Down Expand Up @@ -98,6 +97,9 @@ rule subset_corpus:
input:
train_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{trg}.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz"
output:
train_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/train.{trg}.gz",
Expand All @@ -106,7 +108,10 @@ rule subset_corpus:
eval_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{trg}.gz",
all_filtered_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{src}.gz",
all_filtered_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{trg}.gz"
all_filtered_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{trg}.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.ids.gz"
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/"
Expand All @@ -115,11 +120,13 @@ rule subset_corpus:
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1 && \
ln {input.train_source} {output.all_filtered_source} >> {log} 2>&1 && \
ln {input.train_target} {output.all_filtered_target} >> {log} 2>&1 && \
ln {input.domeval_src} {output.domeval_src} >> {log} 2>&1 && \
ln {input.domeval_trg} {output.domeval_trg} >> {log} 2>&1 && \
ln {input.domeval_ids} {output.domeval_ids} >> {log} 2>&1 && \
{{ pigz -dc {input.train_source} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_source} ; }} 2>> {log} && \
{{ pigz -dc {input.train_target} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_target} ; }} 2>> {log}
"""


rule use_custom_corpus:
message: "Using custom corpus"
log: "{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/custom_corpus_{dataset}.log"
Expand Down
48 changes: 41 additions & 7 deletions eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ gpus_num=config["gpus-num"]

def find_domain_sets(wildcards, checkpoint):
    """Return the domain names produced by the extract_tc_scored checkpoint.

    Resolves the checkpoint's "subcorpora" output directory for the given
    wildcards and globs it for per-domain source files named
    <domain>.<src>.gz, returning the list of <domain> values.
    """
    # BUG FIX: removed leftover debug print(checkpoint_output), which polluted
    # Snakemake's console/log output on every DAG re-evaluation.
    checkpoint_output = checkpoint.get(
        src=wildcards.src,
        trg=wildcards.trg,
        project_name=wildcards.project_name,
        download_tc_dir=wildcards.download_tc_dir,
        min_score=wildcards.min_score,
    ).output["subcorpora"]
    return glob_wildcards(os.path.join(checkpoint_output, f"{{domain,.*}}.{wildcards.src}.gz")).domain

def find_translate_sets(wildcards, checkpoint):
    """Return the domain names present in the translate_domeval checkpoint's output dir."""
    out_dir = checkpoint.get(**wildcards).output["output_dir"]
    pattern = os.path.join(out_dir, f"{{domain,.*}}.{wildcards.src}.gz")
    return glob_wildcards(pattern).domain

#TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
Expand Down Expand Up @@ -39,20 +42,50 @@ rule evaluate_opus_model:
shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" {wildcards.src} {wildcards.trg} {params.decoder} "{params.decoder_config}" >> {log} 2>&1'''


#TODO: this needs to output a single report on the domain evaluations. input should be a directory containing the domain indices and the domain src, trg and id files. Translate the domain source with all domain indices, then separate the output according to ids for evaluation. Skip crawled data sets.
rule merge_domain_evaluation:
message: "Merging domain evaluation results"
#TODO: for domeval, only the fuzzy sentences need to be translated for each index. The non-fuzzies can be reused from a common non-fuzzy translation file (generate this separately). Otherwise translation takes ages.

# This translates the domeval sets with various indexes in an economical fashion, i.e. only translating fuzzies
# TODO: this could also be done with single-file rules, if the non-fuzzy file were to be translated
# first. Would make the process cleaner, with no need for the output dir
checkpoint translate_domeval:
message: "Translating domain evaluation data"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/translate_domeval.log"
conda: None
container: None
envmodules:
"LUMI/22.08",
"partition/G",
"rocm/5.3.3"
threads: 1
priority: 50
wildcard_constraints:
min_score="0\.\d+",
model="[\w-]+"
input:
decoder=ancient(config["marian-decoder"]),
domain_index_src=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{domain}-domeval.{{src}}.gz", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored)),
train_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/train-domeval.{src}.gz",
all_filtered_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/all_filtered-domeval.{src}.gz"
output:
output_dir=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval")
params:
domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}",
decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
shell: '''pipeline/eval/eval-domains.sh {params.domain_index_src_dir} {output.output_dir} {src} {trg} {input.decoder} params.decoder_config --mini-batch 128 --workspace 20000 >> {log} 2>&1'''

# This evaluates the translations generated with translate_domeval
rule eval_domeval:
message: "Evaluating domain translation quality"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/evaluate_domains.log"
conda: None
container: None
threads: 7
resources: gpu=1
threads: 1
priority: 50
wildcard_constraints:
min_score="0\.\d+",
model="[\w-]+"
input:
lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/{domain}-domeval.metrics", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored))
domain_index_trg=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/domeval/{domain}-domeval.{{trg}}.gz", domain=find_translate_sets(wildcards, checkpoints.translate_domeval))
output:
report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
Expand All @@ -65,6 +98,7 @@ rule evaluate:
threads: 7
resources: gpu=1
priority: 50
group: "evaluate"
wildcard_constraints:
model="[\w-]+"
input:
Expand Down
35 changes: 28 additions & 7 deletions pipeline/data/filter-tc-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,14 @@ def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, m
gzip.open(domaineval_id_path, 'wt', encoding='utf-8') as eval_ids:

eval_counts = defaultdict(int)

domain_counts = defaultdict(int)

crawl_prefixes = ["CCMatrix","NLLB","ParaCrawl","HPLT","CCAligned","XLEnt"]


for src_line, trg_line, id_line, score_line in zip(src, trg, ids, scores):
if len(src_line) > 1000 or len(trg_line) > 1000:
continue
score = float(score_line.strip().split("\t")[-1])
if score < min_score:
continue
Expand All @@ -46,23 +52,38 @@ def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, m
if corpus_name not in domain_files:
domain_src_path = f"{output_dir}/subcorpora/{corpus_name}.{src_lang}.gz"
domain_trg_path = f"{output_dir}/subcorpora/{corpus_name}.{trg_lang}.gz"
domain_files[corpus_name] = (
gzip.open(domain_src_path, 'wt', encoding='utf-8'),
gzip.open(domain_trg_path, 'wt', encoding='utf-8')
)
if any(corpus_name.startswith(prefix) for prefix in crawl_prefixes):
domain_files[corpus_name] = None
else:
domain_files[corpus_name] = (
gzip.open(domain_src_path, 'wt', encoding='utf-8'),
gzip.open(domain_trg_path, 'wt', encoding='utf-8')
)

if domain_eval_lines > 0 and eval_counts[corpus_name] < domain_eval_lines:
eval_src.write(src_line)
eval_trg.write(trg_line)
eval_ids.write(id_line)
eval_counts[corpus_name] += 1
else:
domain_files[corpus_name][0].write(src_line)
domain_files[corpus_name][1].write(trg_line)
if domain_files[corpus_name]:
domain_files[corpus_name][0].write(src_line)
domain_files[corpus_name][1].write(trg_line)
domain_counts[corpus_name] += 1
train_src.write(src_line)
train_trg.write(trg_line)
train_ids.write(id_line)

for domain_name, (src_file, trg_file) in [x for x in domain_files.items() if x[1] is not None]:
src_file.close()
trg_file.close()
# remove the domain files that do not have enough lines for domain tms
if domain_counts[domain_name] < 1000:
os.remove(src_file.name)
os.remove(trg_file.name)



def main():
parser = argparse.ArgumentParser(description='Process and filter corpus data based on score.')
parser.add_argument('--source_corpus', required=True, help='Path to the source corpus file (gzipped)')
Expand Down
95 changes: 95 additions & 0 deletions pipeline/eval/eval-domains.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

##
# Evaluate a model with domain data.
#
# Usage: eval-domains.sh <data_dir> <result_dir> <src> <trg> <decoder> <decoder_config> [extra decoder args...]
#

set -x
set -euo pipefail

echo "###### Evaluation of a model"

data_directory=$1      # directory containing the *-domeval.<src>.gz index files
result_directory=$2    # directory to write translations into
src=$3                 # source language code
trg=$4                 # target language code
marian_decoder=$5      # path to the marian decoder binary
decoder_config=$6      # decoder .yml config; the model dir is derived from it
model_dir=$(dirname "${decoder_config}")
model_step=$(basename "${model_dir}")
args=( "${@:7}" )      # extra args passed through to the decoder

# BUG FIX: was `mkdir -p "$(basename "${result_directory}")"`, which created
# only the last path component inside the current working directory instead of
# the actual result directory.
mkdir -p "${result_directory}"

translate() {
    # Translate $1 into $2 with the configured marian decoder.
    # OPUS models need SentencePiece encode/decode around the decoder call
    # (detected via "opus" in the model step name).
    local source_file=$1
    local output_file=$2
    if [[ "${model_step}" == *opus* ]]; then
        source_spm_path="${model_dir}/source.spm"
        target_spm_path="${model_dir}/target.spm"
        # BUG FIX: was "${source_file}.sp}" — the stray closing brace became
        # part of the file name, so the .sp suffix convention was broken.
        sp_source_file="${source_file}.sp"
        "${MARIAN}/spm_encode" --model "${source_spm_path}" < "${source_file}" > "${sp_source_file}"
        source_file=$sp_source_file
    fi
    echo "Translating $source_file to $output_file..."
    "${marian_decoder}" \
        -c "${decoder_config}" \
        --input "${source_file}" \
        --quiet \
        --quiet-translation \
        --log "${output_file}.log" \
        "${args[@]}" > "${output_file}"

    if [[ "${model_step}" == *opus* ]]; then
        # Decode the SentencePiece output back to plain text in place.
        sp_output_file="${output_file}.sp"
        mv "${output_file}" "${sp_output_file}"
        "${MARIAN}/spm_decode" --model "${target_spm_path}" < "${sp_output_file}" > "${output_file}"
    fi
}

domeval_dir="$result_directory/domeval"

# Create the domeval subdirectory in the output directory
mkdir -p "$domeval_dir"

# Find all domain-indexed domeval source files
files=$(find "$data_directory" -type f -name "*-domeval.${src}.gz")

# All index variants share the same underlying sentences, so strip the
# FUZZY_BREAK prefixes from the first file once and translate that shared
# non-fuzzy version a single time.
first_file=$(echo "$files" | head -n 1)
first_file_basename=$(basename "${first_file}" ".${src}.gz")
gunzip -c "$first_file" | sed 's/.*FUZZY_BREAK //' > "$domeval_dir/nofuzzies.${src}"

translate "$domeval_dir/nofuzzies.${src}" "$domeval_dir/nofuzzies.${trg}"

# Create the reference file from the target side of the first domeval set
ref_file="${domeval_dir}/domeval.${trg}.ref"
zcat "${data_directory}/${first_file_basename}.${trg}.gz" > "${ref_file}"

# Iterate over each index variant; the train and all_filtered indexes match
# the same *-domeval glob, so they are covered by this loop too.
for file in $files; do
    basename=$(basename "$file" ".${src}.gz")
    fuzzies_file="$domeval_dir/${basename}.fuzzies"
    line_numbers_file="$domeval_dir/${basename}.linenum"
    translated_fuzzies_file="$domeval_dir/${basename}.translated_fuzzies"

    # Record fuzzy lines with their line numbers as "N:line".
    # ROBUSTNESS: grep exits 1 on zero matches, which would kill the whole
    # script under `set -euo pipefail`; a file with no fuzzies is valid.
    gunzip -c "$file" | { grep -n 'FUZZY_BREAK' || true; } > "$line_numbers_file"

    # Strip the "N:" prefix to recover the fuzzy source lines
    cut -d: -f2- "$line_numbers_file" > "$fuzzies_file"

    # Translate only the fuzzy lines; non-fuzzy translations are reused
    translate "$fuzzies_file" "$translated_fuzzies_file"

    output_file="$domeval_dir/${basename}.${trg}"

    # Merge: start from the shared non-fuzzy translation and overwrite the
    # fuzzy positions with this index's translations.
    # BUG FIX: removed stray `$` before the first quoted argument ($"..." is
    # bash locale-translation quoting, not an intended variable reference).
    python pipeline/eval/merge_domain_translations.py "$domeval_dir/nofuzzies.${trg}" "$translated_fuzzies_file" "${line_numbers_file}" "${output_file}"

    echo "Created merged output for $file as $output_file"
done
43 changes: 43 additions & 0 deletions pipeline/eval/merge_domain_translations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse

def replace_fuzzy_lines(non_fuzzy_file, fuzzy_file, fuzzy_line_number_file, output_path):
    """Merge fuzzy translations into a non-fuzzy translation file.

    Args:
        non_fuzzy_file: path to the full translation produced without fuzzies
            (one line per sentence).
        fuzzy_file: path to the translations of only the fuzzy-annotated lines,
            in the same order as the line numbers below.
        fuzzy_line_number_file: path to a file of `grep -n`-style lines
            ("N:match"); only the 1-based line number before the first ':'
            is used.
        output_path: path to write the merged translation to.

    Line numbers outside the range of the non-fuzzy file are silently skipped.
    """
    # Read lines from the non-fuzzy translation file
    with open(non_fuzzy_file, 'r', encoding='utf-8') as nf:
        non_fuzzy_lines = nf.readlines()

    # Read lines from the fuzzy translation file
    with open(fuzzy_file, 'r', encoding='utf-8') as f:
        fuzzy_lines = f.readlines()

    # Read the fuzzy line numbers (1-based index)
    with open(fuzzy_line_number_file, 'r', encoding='utf-8') as fln:
        fuzzy_line_numbers = [int(line.strip().split(":")[0]) for line in fln]

    # BUG FIX: removed leftover debug print(line_number), which emitted every
    # merged line number to stdout on each run.
    for line_number_index, line_number in enumerate(fuzzy_line_numbers):
        # Check if the line number is within range; replace in place (1-based -> 0-based)
        if 1 <= line_number <= len(non_fuzzy_lines):
            non_fuzzy_lines[line_number - 1] = fuzzy_lines[line_number_index]

    # Write the modified lines to the output file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(non_fuzzy_lines)

def main():
    """CLI entry point: merge fuzzy translations back into a non-fuzzy file."""
    arg_parser = argparse.ArgumentParser(
        description='Replace lines in a non-fuzzy translation file with lines '
                    'from a fuzzy translation file based on provided line numbers.')
    positional = (
        ('non_fuzzy_file', 'Path to the non-fuzzy translation file.'),
        ('fuzzy_file', 'Path to the fuzzy translation file.'),
        ('fuzzy_line_number_file', 'Path to the file containing fuzzy line numbers.'),
        ('output_path', 'Path to the output file where modified content will be saved.'),
    )
    for name, help_text in positional:
        arg_parser.add_argument(name, type=str, help=help_text)
    opts = arg_parser.parse_args()

    # Delegate the actual merge to replace_fuzzy_lines
    replace_fuzzy_lines(opts.non_fuzzy_file, opts.fuzzy_file,
                        opts.fuzzy_line_number_file, opts.output_path)

if __name__ == '__main__':
    main()

Loading

0 comments on commit 8f403db

Please sign in to comment.