Commit 82cd43a

more rat work for nodalida
Tommi Nieminen committed Oct 12, 2024
1 parent 8f403db commit 82cd43a
Showing 14 changed files with 522 additions and 125 deletions.
Snakefile (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@ use rule * from rat as *

vocab_config = {
"spm-train": f"{marian_dir}/spm_train",
"user-defined-symbols":"FUZZY_BREAK",
"user-defined-symbols": ",".join(["FUZZY_BREAK","SRC_FUZZY_BREAK"] + [f"FUZZY_BREAK_{bucket}" for bucket in range(0,10)]),
"spm-sample-size": 1000000,
"spm-character-coverage": 1.0
}
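For reference, the new "user-defined-symbols" entry is an ordinary Python expression inside vocab_config; a minimal standalone sketch of what it evaluates to (nothing here beyond what the changed line itself contains):

    symbols = ",".join(["FUZZY_BREAK", "SRC_FUZZY_BREAK"] + [f"FUZZY_BREAK_{bucket}" for bucket in range(0, 10)])
    # -> "FUZZY_BREAK,SRC_FUZZY_BREAK,FUZZY_BREAK_0,FUZZY_BREAK_1,FUZZY_BREAK_2,FUZZY_BREAK_3,FUZZY_BREAK_4,FUZZY_BREAK_5,FUZZY_BREAK_6,FUZZY_BREAK_7,FUZZY_BREAK_8,FUZZY_BREAK_9"
    print(symbols)

Presumably this comma-separated list is handed to spm_train as user-defined symbols, so that the fuzzy-match separator markers (including the new source-side and per-bucket variants) are kept as single pieces by the trained SentencePiece model.
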
data.smk (26 changes: 17 additions & 9 deletions)
@@ -65,23 +65,26 @@ rule baseline_preprocessing:
input:
train_source="{project_name}/{src}-{trg}/{preprocessing}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{preprocessing}/train.{trg}.gz",
- dev_source="{project_name}/{src}-{trg}/{preprocessing}/dev.{src}.gz",
- dev_target="{project_name}/{src}-{trg}/{preprocessing}/dev.{trg}.gz",
+ dev_source="{project_name}/{src}-{trg}/{preprocessing}/cleandev.{src}.gz",
+ dev_target="{project_name}/{src}-{trg}/{preprocessing}/cleandev.{trg}.gz",
eval_source="{project_name}/{src}-{trg}/{preprocessing}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{preprocessing}/eval.{trg}.gz"
output:
- train_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train.{src}.gz",
- train_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train.{trg}.gz",
- dev_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/dev.{src}.gz",
- dev_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/dev.{trg}.gz",
+ train_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-train.{src}.gz",
+ train_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-train.{trg}.gz",
+ dev_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-cleandev.{src}.gz",
+ dev_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-cleandev.{trg}.gz",
eval_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/eval.{trg}.gz"
params:
input_dir="{project_name}/{src}-{trg}/{preprocessing}/",
output_dir="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/"
shell:
"""
- ln {params.input_dir}/{{eval,train}}.*.gz {params.output_dir} >> {log} 2>&1 && \
+ ln {input.train_source} {output.train_source} >> {log} 2>&1 && \
+ ln {input.train_target} {output.train_target} >> {log} 2>&1 && \
+ ln {input.eval_source} {output.eval_source} >> {log} 2>&1 && \
+ ln {input.eval_target} {output.eval_target} >> {log} 2>&1 && \
{{ pigz -dc {input.dev_source} | head -n {wildcards.max_dev_sents} | pigz -c > {output.dev_source} ; }} 2>> {log} && \
{{ pigz -dc {input.dev_target} | head -n {wildcards.max_dev_sents} | pigz -c > {output.dev_target} ; }} 2>> {log}
"""
@@ -97,6 +100,8 @@ rule subset_corpus:
input:
train_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{trg}.gz",
+ dev_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{src}.gz",
+ dev_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{trg}.gz",
domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{src}.gz",
domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{trg}.gz",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz"
@@ -105,6 +110,8 @@
train_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/train.{trg}.gz",
dev_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/dev.{src}.gz",
dev_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/dev.{trg}.gz",
+ cleandev_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/cleandev.{src}.gz",
+ cleandev_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/cleandev.{trg}.gz",
eval_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/eval.{trg}.gz",
all_filtered_source="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/all_filtered.{src}.gz",
@@ -114,7 +121,7 @@
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/domeval.ids.gz"
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
- output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/"
+ output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_{max_train_sents}/",
shell:
"""
ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1 && \
@@ -124,7 +131,8 @@
ln {input.domeval_trg} {output.domeval_trg} >> {log} 2>&1 && \
ln {input.domeval_ids} {output.domeval_ids} >> {log} 2>&1 && \
{{ pigz -dc {input.train_source} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_source} ; }} 2>> {log} && \
- {{ pigz -dc {input.train_target} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_target} ; }} 2>> {log}
+ {{ pigz -dc {input.train_target} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_target} ; }} 2>> {log} && \
+ python pipeline/data/clean-tcdev.py --source {input.dev_source} --target {input.dev_target} --prefix {params.output_dir}clean 2>> {log}
"""

rule use_custom_corpus:
eval.smk (22 changes: 13 additions & 9 deletions)
@@ -52,26 +52,26 @@ checkpoint translate_domeval:
log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/translate_domeval.log"
conda: None
container: None
+ resources: gpu=gpus_num
envmodules:
"LUMI/22.08",
"partition/G",
"rocm/5.3.3"
threads: 1
priority: 50
wildcard_constraints:
min_score="0\.\d+",
model="[\w-]+"
min_score="0\.\d+"
input:
decoder=ancient(config["marian-decoder"]),
- domain_index_src=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{domain}-domeval.{{src}}.gz", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored)),
- train_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/train-domeval.{src}.gz",
- all_filtered_index_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/all_filtered-domeval.{src}.gz"
+ domain_src=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{domain}-domeval.{{src}}.gz", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored)),
+ train_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/train-domeval.{src}.gz",
+ all_filtered_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/all_filtered-domeval.{src}.gz",
+ decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
output:
output_dir=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval")
params:
- domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}",
- decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
- shell: '''pipeline/eval/eval-domains.sh {params.domain_index_src_dir} {output.output_dir} {src} {trg} {input.decoder} params.decoder_config --mini-batch 128 --workspace 20000 >> {log} 2>&1'''
+ domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}"
+ shell: '''pipeline/eval/translate-domeval.sh {params.domain_index_src_dir} {output.output_dir} {wildcards.src} {wildcards.trg} {input.decoder} {input.decoder_config} --mini-batch 128 --workspace 20000 >> {log} 2>&1'''

# This evaluates the translations generated with translate_domeval
rule eval_domeval:
@@ -89,7 +89,11 @@ rule eval_domeval:
output:
report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
- shell: '''touch {output} >> {log} 2>&1'''
+ params:
+ input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval",
+ domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz",
+ system_id="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}"
+ shell: '''python pipeline/eval/score-domeval.py --input_dir {params.input_dir} --report {output} --src_lang {wildcards.src} --trg_lang {wildcards.trg} --system_id {params.system_id} --domeval_ids {params.domeval_ids} >> {log} 2>&1'''

rule evaluate:
message: "Evaluating a model"
pipeline/data/clean-tcdev.py (79 changes: 79 additions & 0 deletions)
@@ -0,0 +1,79 @@
import argparse
import gzip
import random
import os
import re

# Define sentence-ending punctuation
SENTENCE_ENDINGS = re.compile(r'[.!?]')

def is_valid_line(source_line, target_line, seen_lines):
    """Check if the source line is valid based on conditions:
    - Source line must be longer than 5 words.
    - Source line must not have occurred before.
    - Source and target lines must not both contain more than one sentence-ending punctuation mark.
    """
    # Check if the source line is longer than 5 words
    if len(source_line.split()) <= 5:
        return False

    # Check if the source line has occurred before
    if source_line in seen_lines:
        return False

    # Check if there is more than one sentence-ending punctuation in both lines
    if len(SENTENCE_ENDINGS.findall(source_line)) > 1 and len(SENTENCE_ENDINGS.findall(target_line)) > 1:
        return False

    # Add the source line to the set of seen lines
    seen_lines.add(source_line)
    return True

def process_files(source_path, target_path, prefix):
    """Process the gzipped source and target files, shuffle them, filter, sort by length, and write results to gzipped files."""
    seen_lines = set()

    # Read source and target files into memory, aligned by line
    with gzip.open(source_path, 'rt', encoding='utf-8') as src_file, \
         gzip.open(target_path, 'rt', encoding='utf-8') as tgt_file:
        lines = [(src_line.strip(), tgt_line.strip()) for src_line, tgt_line in zip(src_file, tgt_file)]

    # Shuffle the lines
    random.shuffle(lines)

    # Filter the lines based on conditions and store valid ones
    filtered_lines = [
        (src_line, tgt_line) for src_line, tgt_line in lines
        if is_valid_line(src_line, tgt_line, seen_lines)
    ]

    # Sort the filtered lines by the length of the source line (longest first)
    sorted_lines = sorted(filtered_lines, key=lambda x: len(x[0]), reverse=True)

    # Generate output file paths by prefixing the file names
    source_output_path = prefix + os.path.basename(source_path)
    target_output_path = prefix + os.path.basename(target_path)

    # Write filtered and sorted lines to output files
    with gzip.open(source_output_path, 'wt', encoding='utf-8') as src_out_file, \
         gzip.open(target_output_path, 'wt', encoding='utf-8') as tgt_out_file:

        for src_line, tgt_line in sorted_lines:
            src_out_file.write(src_line + '\n')
            tgt_out_file.write(tgt_line + '\n')

def main():
    # Argument parsing
    parser = argparse.ArgumentParser(description='Process gzipped files, shuffle lines, filter, sort, and output them.')
    parser.add_argument('--source', required=True, help='Path to the gzipped source file')
    parser.add_argument('--target', required=True, help='Path to the gzipped target file')
    parser.add_argument('--prefix', required=True, help='Prefix to be added to output file names')

    args = parser.parse_args()

    # Process the input files and generate output
    process_files(args.source, args.target, args.prefix)

if __name__ == "__main__":
    main()
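A note on the naming, with a hypothetical example (the en-sv pair and directory below are made up for illustration, not taken from the pipeline): the script writes each output to prefix + basename(input), so the --prefix {params.output_dir}clean passed by subset_corpus turns dev.{src}.gz / dev.{trg}.gz into cleandev.{src}.gz / cleandev.{trg}.gz, which is what the rule's cleandev_source / cleandev_target outputs and the cleandev.* inputs of baseline_preprocessing expect.

    import os

    # Hypothetical paths, for illustration only; the real ones come from the Snakemake wildcards.
    prefix = "project/en-sv/subset_10000000/clean"
    source_path = "project/en-sv/subset_10000000/dev.en.gz"

    # Same naming rule as in process_files(): prefix + basename of the input file.
    print(prefix + os.path.basename(source_path))  # project/en-sv/subset_10000000/cleandev.en.gz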
