Skip to content

Commit

Permalink
more nodalida work with rat and domain evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
Tommi Nieminen committed Oct 8, 2024
1 parent 975aa0e commit 8f403db
Show file tree
Hide file tree
Showing 18 changed files with 511 additions and 53 deletions.
5 changes: 5 additions & 0 deletions configs/rat/config.eng-fin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ marian-args:
# these configs override pipeline/train/configs
finetune-teacher-with-terms:
after: 1e
training-teacher:
after: 120e
early-stopping: 40
no-restore-corpus: ""
valid-reset-stalled: ""


datasets:
Expand Down
15 changes: 11 additions & 4 deletions data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ checkpoint extract_tc_scored:
subcorpora=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subcorpora")
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"

output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"
shell:
'''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --source_lang {wildcards.src} --target_lang {wildcards.trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} >> {log} 2>&1 && \
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''
Expand Down Expand Up @@ -98,6 +97,9 @@ rule subset_corpus:
input:
train_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{trg}.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz"
output:
train_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/train.{trg}.gz",
Expand All @@ -106,7 +108,10 @@ rule subset_corpus:
eval_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{trg}.gz",
all_filtered_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{src}.gz",
all_filtered_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{trg}.gz"
all_filtered_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{trg}.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.ids.gz"
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/"
Expand All @@ -115,11 +120,13 @@ rule subset_corpus:
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1 && \
ln {input.train_source} {output.all_filtered_source} >> {log} 2>&1 && \
ln {input.train_target} {output.all_filtered_target} >> {log} 2>&1 && \
ln {input.domeval_src} {output.domeval_src} >> {log} 2>&1 && \
ln {input.domeval_trg} {output.domeval_trg} >> {log} 2>&1 && \
ln {input.domeval_ids} {output.domeval_ids} >> {log} 2>&1 && \
{{ pigz -dc {input.train_source} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_source} ; }} 2>> {log} && \
{{ pigz -dc {input.train_target} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_target} ; }} 2>> {log}
"""


rule use_custom_corpus:
message: "Using custom corpus"
log: "{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/custom_corpus_{dataset}.log"
Expand Down
48 changes: 41 additions & 7 deletions eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ gpus_num=config["gpus-num"]

def find_domain_sets(wildcards, checkpoint):
    """Return the domain names produced by the extract_tc_scored checkpoint.

    Resolves the checkpoint's "subcorpora" output directory for the given
    wildcards and globs it for per-domain source files named
    <domain>.<src>.gz, returning the list of <domain> values.
    """
    # BUG FIX: removed leftover debug print(checkpoint_output), which polluted
    # Snakemake's console/log output on every DAG re-evaluation.
    checkpoint_output = checkpoint.get(
        src=wildcards.src,
        trg=wildcards.trg,
        project_name=wildcards.project_name,
        download_tc_dir=wildcards.download_tc_dir,
        min_score=wildcards.min_score,
    ).output["subcorpora"]
    return glob_wildcards(os.path.join(checkpoint_output, f"{{domain,.*}}.{wildcards.src}.gz")).domain

def find_translate_sets(wildcards, checkpoint):
    """Return the domain names present in the translate_domeval checkpoint's output dir."""
    out_dir = checkpoint.get(**wildcards).output["output_dir"]
    pattern = os.path.join(out_dir, f"{{domain,.*}}.{wildcards.src}.gz")
    return glob_wildcards(pattern).domain

#TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
Expand Down Expand Up @@ -39,20 +42,50 @@ rule evaluate_opus_model:
shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" {wildcards.src} {wildcards.trg} {params.decoder} "{params.decoder_config}" >> {log} 2>&1'''


#TODO: this needs to output a single report on the domain evaluations. input should be a directory containing the domain indices and the domain src, trg and id files. Translate the domain source with all domain indices, then separate the output according to ids for evaluation. Skip crawled data sets.
rule merge_domain_evaluation:
message: "Merging domain evaluation results"
#TODO: for domeval, only the fuzzy sentences need to be translated for each index. The non-fuzzies can be reused from a common non-fuzzy translation file (generate this separately). Otherwise translation takes ages.

# This translates the domeval sets with various indexes in an economical fashion, i.e. only translating fuzzies
# TODO: this could also be done with single-file rules, if the non-fuzzy file were to be translated
# first. Would make the process cleaner, with no need for the output dir
checkpoint translate_domeval:
message: "Translating domain evaluation data"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/translate_domeval.log"
conda: None
container: None
envmodules:
"LUMI/22.08",
"partition/G",
"rocm/5.3.3"
threads: 1
priority: 50
wildcard_constraints:
min_score="0\.\d+",
model="[\w-]+"
input:
decoder=ancient(config["marian-decoder"]),
domain_index_src=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{domain}-domeval.{{src}}.gz", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored)),
train_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/train-domeval.{src}.gz",
all_filtered_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/all_filtered-domeval.{src}.gz"
output:
output_dir=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval")
params:
domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}",
decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
shell: '''pipeline/eval/eval-domains.sh {params.domain_index_src_dir} {output.output_dir} {src} {trg} {input.decoder} params.decoder_config --mini-batch 128 --workspace 20000 >> {log} 2>&1'''

# This evaluates the translations generated with translate_domeval
rule eval_domeval:
message: "Evaluating domain translation quality"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/evaluate_domains.log"
conda: None
container: None
threads: 7
resources: gpu=1
threads: 1
priority: 50
wildcard_constraints:
min_score="0\.\d+",
model="[\w-]+"
input:
lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/{domain}-domeval.metrics", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored))
domain_index_trg=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/domeval/{domain}-domeval.{{trg}}.gz", domain=find_translate_sets(wildcards, checkpoints.translate_domeval))
output:
report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
Expand All @@ -65,6 +98,7 @@ rule evaluate:
threads: 7
resources: gpu=1
priority: 50
group: "evaluate"
wildcard_constraints:
model="[\w-]+"
input:
Expand Down
35 changes: 28 additions & 7 deletions pipeline/data/filter-tc-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,14 @@ def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, m
gzip.open(domaineval_id_path, 'wt', encoding='utf-8') as eval_ids:

eval_counts = defaultdict(int)

domain_counts = defaultdict(int)

crawl_prefixes = ["CCMatrix","NLLB","ParaCrawl","HPLT","CCAligned","XLEnt"]


for src_line, trg_line, id_line, score_line in zip(src, trg, ids, scores):
if len(src_line) > 1000 or len(trg_line) > 1000:
continue
score = float(score_line.strip().split("\t")[-1])
if score < min_score:
continue
Expand All @@ -46,23 +52,38 @@ def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, m
if corpus_name not in domain_files:
domain_src_path = f"{output_dir}/subcorpora/{corpus_name}.{src_lang}.gz"
domain_trg_path = f"{output_dir}/subcorpora/{corpus_name}.{trg_lang}.gz"
domain_files[corpus_name] = (
gzip.open(domain_src_path, 'wt', encoding='utf-8'),
gzip.open(domain_trg_path, 'wt', encoding='utf-8')
)
if any(corpus_name.startswith(prefix) for prefix in crawl_prefixes):
domain_files[corpus_name] = None
else:
domain_files[corpus_name] = (
gzip.open(domain_src_path, 'wt', encoding='utf-8'),
gzip.open(domain_trg_path, 'wt', encoding='utf-8')
)

if domain_eval_lines > 0 and eval_counts[corpus_name] < domain_eval_lines:
eval_src.write(src_line)
eval_trg.write(trg_line)
eval_ids.write(id_line)
eval_counts[corpus_name] += 1
else:
domain_files[corpus_name][0].write(src_line)
domain_files[corpus_name][1].write(trg_line)
if domain_files[corpus_name]:
domain_files[corpus_name][0].write(src_line)
domain_files[corpus_name][1].write(trg_line)
domain_counts[corpus_name] += 1
train_src.write(src_line)
train_trg.write(trg_line)
train_ids.write(id_line)

for domain_name, (src_file, trg_file) in [x for x in domain_files.items() if x[1] is not None]:
src_file.close()
trg_file.close()
# remove the domain files that do not have enough lines for domain tms
if domain_counts[domain_name] < 1000:
os.remove(src_file.name)
os.remove(trg_file.name)



def main():
parser = argparse.ArgumentParser(description='Process and filter corpus data based on score.')
parser.add_argument('--source_corpus', required=True, help='Path to the source corpus file (gzipped)')
Expand Down
95 changes: 95 additions & 0 deletions pipeline/eval/eval-domains.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

##
# Evaluate a model with domain data.
#
# Usage: eval-domains.sh <data_dir> <result_dir> <src> <trg> <decoder> <decoder_config> [extra decoder args...]
#

set -x
set -euo pipefail

echo "###### Evaluation of a model"

data_directory=$1      # directory containing the *-domeval.<src>.gz index files
result_directory=$2    # directory to write translations into
src=$3                 # source language code
trg=$4                 # target language code
marian_decoder=$5      # path to the marian decoder binary
decoder_config=$6      # decoder .yml config; the model dir is derived from it
model_dir=$(dirname "${decoder_config}")
model_step=$(basename "${model_dir}")
args=( "${@:7}" )      # extra args passed through to the decoder

# BUG FIX: was `mkdir -p "$(basename "${result_directory}")"`, which created
# only the last path component inside the current working directory instead of
# the actual result directory.
mkdir -p "${result_directory}"

translate() {
    # Translate $1 into $2 with the configured marian decoder.
    # OPUS models need SentencePiece encode/decode around the decoder call
    # (detected via "opus" in the model step name).
    local source_file=$1
    local output_file=$2
    if [[ "${model_step}" == *opus* ]]; then
        source_spm_path="${model_dir}/source.spm"
        target_spm_path="${model_dir}/target.spm"
        # BUG FIX: was "${source_file}.sp}" — the stray closing brace became
        # part of the file name, so the .sp suffix convention was broken.
        sp_source_file="${source_file}.sp"
        "${MARIAN}/spm_encode" --model "${source_spm_path}" < "${source_file}" > "${sp_source_file}"
        source_file=$sp_source_file
    fi
    echo "Translating $source_file to $output_file..."
    "${marian_decoder}" \
        -c "${decoder_config}" \
        --input "${source_file}" \
        --quiet \
        --quiet-translation \
        --log "${output_file}.log" \
        "${args[@]}" > "${output_file}"

    if [[ "${model_step}" == *opus* ]]; then
        # Decode the SentencePiece output back to plain text in place.
        sp_output_file="${output_file}.sp"
        mv "${output_file}" "${sp_output_file}"
        "${MARIAN}/spm_decode" --model "${target_spm_path}" < "${sp_output_file}" > "${output_file}"
    fi
}

domeval_dir="$result_directory/domeval"

# Create the domeval subdirectory in the output directory
mkdir -p "$domeval_dir"

# Find all domain-indexed domeval source files
files=$(find "$data_directory" -type f -name "*-domeval.${src}.gz")

# All index variants share the same underlying sentences, so strip the
# FUZZY_BREAK prefixes from the first file once and translate that shared
# non-fuzzy version a single time.
first_file=$(echo "$files" | head -n 1)
first_file_basename=$(basename "${first_file}" ".${src}.gz")
gunzip -c "$first_file" | sed 's/.*FUZZY_BREAK //' > "$domeval_dir/nofuzzies.${src}"

translate "$domeval_dir/nofuzzies.${src}" "$domeval_dir/nofuzzies.${trg}"

# Create the reference file from the target side of the first domeval set
ref_file="${domeval_dir}/domeval.${trg}.ref"
zcat "${data_directory}/${first_file_basename}.${trg}.gz" > "${ref_file}"

# Iterate over each index variant; the train and all_filtered indexes match
# the same *-domeval glob, so they are covered by this loop too.
for file in $files; do
    basename=$(basename "$file" ".${src}.gz")
    fuzzies_file="$domeval_dir/${basename}.fuzzies"
    line_numbers_file="$domeval_dir/${basename}.linenum"
    translated_fuzzies_file="$domeval_dir/${basename}.translated_fuzzies"

    # Record fuzzy lines with their line numbers as "N:line".
    # ROBUSTNESS: grep exits 1 on zero matches, which would kill the whole
    # script under `set -euo pipefail`; a file with no fuzzies is valid.
    gunzip -c "$file" | { grep -n 'FUZZY_BREAK' || true; } > "$line_numbers_file"

    # Strip the "N:" prefix to recover the fuzzy source lines
    cut -d: -f2- "$line_numbers_file" > "$fuzzies_file"

    # Translate only the fuzzy lines; non-fuzzy translations are reused
    translate "$fuzzies_file" "$translated_fuzzies_file"

    output_file="$domeval_dir/${basename}.${trg}"

    # Merge: start from the shared non-fuzzy translation and overwrite the
    # fuzzy positions with this index's translations.
    # BUG FIX: removed stray `$` before the first quoted argument ($"..." is
    # bash locale-translation quoting, not an intended variable reference).
    python pipeline/eval/merge_domain_translations.py "$domeval_dir/nofuzzies.${trg}" "$translated_fuzzies_file" "${line_numbers_file}" "${output_file}"

    echo "Created merged output for $file as $output_file"
done
43 changes: 43 additions & 0 deletions pipeline/eval/merge_domain_translations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse

def replace_fuzzy_lines(non_fuzzy_file, fuzzy_file, fuzzy_line_number_file, output_path):
    """Merge fuzzy translations into a non-fuzzy translation file.

    Args:
        non_fuzzy_file: path to the full translation produced without fuzzies
            (one line per sentence).
        fuzzy_file: path to the translations of only the fuzzy-annotated lines,
            in the same order as the line numbers below.
        fuzzy_line_number_file: path to a file of `grep -n`-style lines
            ("N:match"); only the 1-based line number before the first ':'
            is used.
        output_path: path to write the merged translation to.

    Line numbers outside the range of the non-fuzzy file are silently skipped.
    """
    # Read lines from the non-fuzzy translation file
    with open(non_fuzzy_file, 'r', encoding='utf-8') as nf:
        non_fuzzy_lines = nf.readlines()

    # Read lines from the fuzzy translation file
    with open(fuzzy_file, 'r', encoding='utf-8') as f:
        fuzzy_lines = f.readlines()

    # Read the fuzzy line numbers (1-based index)
    with open(fuzzy_line_number_file, 'r', encoding='utf-8') as fln:
        fuzzy_line_numbers = [int(line.strip().split(":")[0]) for line in fln]

    # BUG FIX: removed leftover debug print(line_number), which emitted every
    # merged line number to stdout on each run.
    for line_number_index, line_number in enumerate(fuzzy_line_numbers):
        # Check if the line number is within range; replace in place (1-based -> 0-based)
        if 1 <= line_number <= len(non_fuzzy_lines):
            non_fuzzy_lines[line_number - 1] = fuzzy_lines[line_number_index]

    # Write the modified lines to the output file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(non_fuzzy_lines)

def main():
    """CLI entry point: merge fuzzy translations back into a non-fuzzy file."""
    arg_parser = argparse.ArgumentParser(
        description='Replace lines in a non-fuzzy translation file with lines '
                    'from a fuzzy translation file based on provided line numbers.')
    positional = (
        ('non_fuzzy_file', 'Path to the non-fuzzy translation file.'),
        ('fuzzy_file', 'Path to the fuzzy translation file.'),
        ('fuzzy_line_number_file', 'Path to the file containing fuzzy line numbers.'),
        ('output_path', 'Path to the output file where modified content will be saved.'),
    )
    for name, help_text in positional:
        arg_parser.add_argument(name, type=str, help=help_text)
    opts = arg_parser.parse_args()

    # Delegate the actual merge to replace_fuzzy_lines
    replace_fuzzy_lines(opts.non_fuzzy_file, opts.fuzzy_file,
                        opts.fuzzy_line_number_file, opts.output_path)

if __name__ == '__main__':
    main()

Loading

0 comments on commit 8f403db

Please sign in to comment.