Skip to content

Commit

Permalink
domain evaluation work
Browse files Browse the repository at this point in the history
  • Loading branch information
Tommi Nieminen committed Sep 26, 2024
1 parent 8d28343 commit 975aa0e
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 61 deletions.
4 changes: 2 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,10 @@ else:

shell.prefix(f"{envs} ")

results = expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"], dataset=eval_datasets)
#results = expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"], dataset=eval_datasets)

# For base model, only generate the metrics once
results.extend(expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/basemodel-eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"][0], dataset=eval_datasets))
#results.extend(expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/basemodel-eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"][0], dataset=eval_datasets))

#print(results)

Expand Down
18 changes: 13 additions & 5 deletions data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,34 @@ rule download_tatoeba_corpus:
shell: 'bash pipeline/data/download-tc-data.sh {wildcards.src} {wildcards.trg} {params.prefix} {params.version} inf >> {log} 2>&1'

#TODO: explicitly defined dev and eval linking, the glob might cause problems
rule extract_tc_scored:
checkpoint extract_tc_scored:
message: "Extracting corpora from scored tc training set"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/extract_tc_scored.log"
conda: "envs/base.yml"
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}.log"
conda: None
container: None
wildcard_constraints:
min_score="0\.\d+",
threads: 1
input: train_src="{project_name}/{src}-{trg}/{download_tc_dir}/train.{src}.gz", train_trg="{project_name}/{src}-{trg}/{download_tc_dir}/train.{trg}.gz", train_ids="{project_name}/{src}-{trg}/{download_tc_dir}/train.id.gz", scores="../data/scores/{src}-{trg}.scored.gz"
output:
src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{src}.gz",
trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{trg}.gz",
train_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.ids.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz",
dev_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{src}.gz",
dev_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{trg}.gz",
eval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{src}.gz",
eval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{trg}.gz"
eval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{trg}.gz",
subcorpora=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subcorpora")
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"

shell: '''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} && ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''
shell:
'''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --source_lang {wildcards.src} --target_lang {wildcards.trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} >> {log} 2>&1 && \
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''

rule baseline_preprocessing:
message: "Preprocessing data for baseline training"
Expand Down
24 changes: 24 additions & 0 deletions eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@ wildcard_constraints:
src="\w{2,3}",
trg="\w{2,3}",
train_vocab="train_joint_spm_vocab[^/]+",
extract_tc="extract_tc[^/]+",
learn_rate="\d+"

gpus_num=config["gpus-num"]

def find_domain_sets(wildcards, checkpoint):
    """Return the domain names produced by the extract_tc_scored checkpoint.

    The set of domain subcorpora is only known after the checkpoint has run:
    we resolve the checkpoint for the current wildcards, take its "subcorpora"
    output directory, and glob it for per-domain source files named
    <domain>.<src>.gz.

    Args:
        wildcards: snakemake wildcards object carrying src, trg, project_name,
            download_tc_dir and min_score.
        checkpoint: the snakemake checkpoint object (checkpoints.extract_tc_scored).

    Returns:
        List of domain name strings extracted from the subcorpora directory.
    """
    subcorpora_dir = checkpoint.get(
        src=wildcards.src,
        trg=wildcards.trg,
        project_name=wildcards.project_name,
        download_tc_dir=wildcards.download_tc_dir,
        min_score=wildcards.min_score,
    ).output["subcorpora"]
    # {domain,.*} is a snakemake wildcard with an explicit regex constraint,
    # not an f-string substitution (hence the doubled braces).
    return glob_wildcards(
        os.path.join(subcorpora_dir, f"{{domain,.*}}.{wildcards.src}.gz")
    ).domain

#TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
rule evaluate_opus_model:
message: "Evaluating an OPUS model"
Expand Down Expand Up @@ -33,6 +39,24 @@ rule evaluate_opus_model:
shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" {wildcards.src} {wildcards.trg} {params.decoder} "{params.decoder_config}" >> {log} 2>&1'''


#TODO: this need to output a single report on the domain evaluations. input should be a directory containing the domain indices and the domain src, trg and id files. Translate the domain source with all domain indices, then separate the output according to ids for evaluation. Skip crawled data sets.
# Aggregates the per-domain evaluation metrics (one .metrics file per domain
# subcorpus, discovered dynamically via the extract_tc_scored checkpoint) and
# writes a single completion marker for the snakemake report.
rule merge_domain_evaluation:
    message: "Merging domain evaluation results"
    log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/evaluate_domains.log"
    conda: None
    container: None
    threads: 7
    resources: gpu=1
    priority: 50
    wildcard_constraints:
        min_score="0\.\d+",
        model="[\w-]+"
    input:
        # Deferred via lambda: the list of domains only exists once the
        # extract_tc_scored checkpoint has produced the subcorpora directory.
        lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/{domain}-domeval.metrics", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored))
    output:
        # NOTE(review): subcategory references '{model}', but the output path
        # only contains '{train_model}' — confirm this wildcard resolves.
        report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
               category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
    # Currently only touches the marker file; the actual merging of the
    # per-domain metrics into a report is not implemented yet (see TODO above).
    shell: '''touch {output} >> {log} 2>&1'''

rule evaluate:
message: "Evaluating a model"
Expand Down
62 changes: 32 additions & 30 deletions pipeline/data/filter-tc-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,35 @@
import os
from collections import defaultdict

def process_files(src_file, trg_file, id_file, score_file, min_score, domain_eval_lines, output_dir):
def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, min_score, domain_eval_lines, output_dir):
print(f"Creating output dir in {output_dir}")
if not os.path.exists(output_dir):
os.makedirs(output_dir)

train_src_path = f"{output_dir}/{os.path.basename(src_file)}"
train_trg_path = f"{output_dir}/{os.path.basename(trg_file)}"
train_src_path = f"{output_dir}/train.{src_lang}.gz"
train_trg_path = f"{output_dir}/train.{trg_lang}.gz"
train_id_path = f"{output_dir}/train.ids.gz"
domaineval_src_path = f"{output_dir}/domeval.{src_lang}.gz"
domaineval_trg_path = f"{output_dir}/domeval.{trg_lang}.gz"
domaineval_id_path = f"{output_dir}/domeval.ids.gz"

eval_lines = defaultdict(lambda: ([], [])) # dictionary to store eval lines per corpus
domain_files = {} # dictionary to store open file handles for domains
domain_files = {} # dictionary to store open file handles for domains

with gzip.open(src_file, 'rt', encoding='utf-8') as src, \
gzip.open(trg_file, 'rt', encoding='utf-8') as trg, \
gzip.open(id_file, 'rt', encoding='utf-8') as ids, \
gzip.open(score_file, 'rt', encoding='utf-8') as scores, \
if not os.path.exists(os.path.join(output_dir,"subcorpora")):
os.makedirs(os.path.join(output_dir,"subcorpora"))


with gzip.open(src_path, 'rt', encoding='utf-8') as src, \
gzip.open(trg_path, 'rt', encoding='utf-8') as trg, \
gzip.open(id_path, 'rt', encoding='utf-8') as ids, \
gzip.open(score_path, 'rt', encoding='utf-8') as scores, \
gzip.open(train_src_path, 'wt', encoding='utf-8') as train_src, \
gzip.open(train_trg_path, 'wt', encoding='utf-8') as train_trg:
gzip.open(train_trg_path, 'wt', encoding='utf-8') as train_trg, \
gzip.open(train_id_path, 'wt', encoding='utf-8') as train_ids, \
gzip.open(domaineval_src_path, 'wt', encoding='utf-8') as eval_src, \
gzip.open(domaineval_trg_path, 'wt', encoding='utf-8') as eval_trg, \
gzip.open(domaineval_id_path, 'wt', encoding='utf-8') as eval_ids:

eval_counts = defaultdict(int)

Expand All @@ -30,45 +42,33 @@ def process_files(src_file, trg_file, id_file, score_file, min_score, domain_eva

corpus_name = id_line.split("\t")[0]

# Open domain-specific files if not already opened
# Open domain-specific files if not already opened
if corpus_name not in domain_files:
domain_src_path = f"{output_dir}/{corpus_name}.train.src.gz"
domain_trg_path = f"{output_dir}/{corpus_name}.train.trg.gz"
domain_src_path = f"{output_dir}/subcorpora/{corpus_name}.{src_lang}.gz"
domain_trg_path = f"{output_dir}/subcorpora/{corpus_name}.{trg_lang}.gz"
domain_files[corpus_name] = (
gzip.open(domain_src_path, 'wt', encoding='utf-8'),
gzip.open(domain_trg_path, 'wt', encoding='utf-8')
)


if domain_eval_lines > 0 and eval_counts[corpus_name] < domain_eval_lines:
eval_lines[corpus_name][0].append(src_line)
eval_lines[corpus_name][1].append(trg_line)
eval_src.write(src_line)
eval_trg.write(trg_line)
eval_ids.write(id_line)
eval_counts[corpus_name] += 1
else:
domain_files[corpus_name][0].write(src_line)
domain_files[corpus_name][1].write(trg_line)
train_src.write(src_line)
train_trg.write(trg_line)

# Write the eval data to respective files
for corpus_name, (src_lines, trg_lines) in eval_lines.items():
eval_src_file_path = f"{output_dir}/{corpus_name}.eval.src.gz"
eval_trg_file_path = f"{output_dir}/{corpus_name}.eval.trg.gz"
with gzip.open(eval_src_file_path, 'wt', encoding='utf-8') as eval_src_file, \
gzip.open(eval_trg_file_path, 'wt', encoding='utf-8') as eval_trg_file:
for src_line, trg_line in zip(src_lines, trg_lines):
eval_src_file.write(src_line)
eval_trg_file.write(trg_line)

# Close all domain-specific files
for src_file, trg_file in domain_files.values():
src_file.close()
trg_file.close()
train_ids.write(id_line)

def main():
parser = argparse.ArgumentParser(description='Process and filter corpus data based on score.')
parser.add_argument('--source_corpus', required=True, help='Path to the source corpus file (gzipped)')
parser.add_argument('--target_corpus', required=True, help='Path to the target corpus file (gzipped)')
parser.add_argument('--source_lang', required=True, help='Source language code')
parser.add_argument('--target_lang', required=True, help='Target language code')
parser.add_argument('--id_file', required=True, help='Path to the ID file')
parser.add_argument('--score_file', required=True, help='Path to the score file')
parser.add_argument('--min_score', type=float, required=True, help='Minimum score for filtering (0 to 1)')
Expand All @@ -82,6 +82,8 @@ def main():
process_files(
args.source_corpus,
args.target_corpus,
args.source_lang,
args.target_lang,
args.id_file,
args.score_file,
args.min_score,
Expand Down
1 change: 1 addition & 0 deletions pipeline/opusmt/finetune.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ echo "### Training ${model_dir}"
--devices ${GPUS} \
--beam-size 6 \
--sharding local \
--lr-report \
--learn-rate "${learn_rate}" \
--sync-sgd \
--valid-metrics "${best_model_metric}" ${all_model_metrics[@]/$best_model_metric} \
Expand Down
4 changes: 2 additions & 2 deletions pipeline/rat/build_index.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ index_file=$4
echo "##### Building a fuzzy match index"

# index building runs on single thread, --nthreads is only for matching
${fuzzy_match_cli} --action index --corpus ${src_corpus}
# ${fuzzy_match_cli} --action index --corpus ${src_corpus}

# The add-target flag adds the target sentence to the db; it previously appeared buggy, but is re-enabled below — verify results
#${fuzzy_match_cli} --action index --corpus ${src_corpus},${trg_corpus} --nthreads ${threads} --add-target
${fuzzy_match_cli} --action index --corpus ${src_corpus},${trg_corpus} --add-target

# index is saved as src_corpus.fmi, move it to the correct place
mv "${src_corpus}.fmi" "${index_file}"
2 changes: 1 addition & 1 deletion profiles/slurm-lumi-test/jobscript.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
log_file=$(echo '{properties}' | jq -r .log[0])
gpu=$(echo '{properties}' | jq -r .resources.gpu)

mkdir -p $(dirname $log_file)
#mkdir -p $(dirname $log_file)
mkdir /tmp/$USER

if [ $gpu != "null" ] && [ $gpu != "0" ]; then
Expand Down
5 changes: 3 additions & 2 deletions profiles/slurm-lumi/config.cluster.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# CSC LUMI
single-gpu-partition: small-g
multi-gpu-partition: standard-g
cpu-partition: small
partialnode-cpu-partition: small
fullnode-cpu-partition: standard
cpu-account: project_462000447
gpu-account: project_462000447
time-limit: "24:00:00"
time-limit: "48:00:00"
10 changes: 9 additions & 1 deletion profiles/slurm-lumi/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

options += ['--job-name', name]

partition = cluster_config['cpu-partition']
account = cluster_config['cpu-account']

if "resources" in job_properties:
Expand All @@ -43,6 +42,15 @@
partition = cluster_config['multi-gpu-partition']
rocm_dir = os.getenv("ROCM_PATH")
options += ['--export', f'ALL,SINGULARITY_BIND="{rocm_dir}"']
else:
#this is a LUMI-C job
if 'threads' in job_properties and int(job_properties['threads']) >= 128:
if 'mem_mb' in resources and int(resources['mem_mb'] < 256000):
partition = cluster_config['fullnode-cpu-partition']
else:
partition = cluster_config['partialnode-cpu-partition'] # The LUMI-C nodes with more memory than 256GB are in small partition
else:
partition = cluster_config['partialnode-cpu-partition']

# we don't need explicit memory limiting for now
if 'mem_mb' in resources:
Expand Down
Loading

0 comments on commit 975aa0e

Please sign in to comment.