more rat work, started ngram augmentation implementation
Tommi Nieminen committed Oct 20, 2024
1 parent 21412a8 commit 4962d9e
Showing 4 changed files with 182 additions and 24 deletions.
48 changes: 33 additions & 15 deletions pipeline/eval/score-domeval.py
@@ -3,7 +3,6 @@
import sacrebleu
import csv
import gzip
import bsbleu
import re

def parse_args():
@@ -24,7 +23,7 @@ def read_file_lines(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]

#generate pseudorefs from fuzzy matches for copy rate calculation, use the bsbleu code with modifications (lists instead of files, and BP 0)
#generate pseudorefs from fuzzy matches for copy rate calculation, use modified sacrebleu
def split_on_fuzzy_break(source, target):
results = []

@@ -104,11 +103,12 @@ def evaluate_full_domeval(domain, input_dir, trg_lang, ref_lines, report_file, s
trg_file = os.path.join(input_dir, f"{domain}-domeval.{trg_lang}")
source_fuzzies_file = os.path.join(input_dir, f"{domain}-domeval.fuzzies")
translated_fuzzies_file = os.path.join(input_dir, f"{domain}-domeval.translated_fuzzies")
linenum_file = os.path.join(input_dir, f"{domain}-domeval.linenum")

#downgraded uses the same linenum file as normal
linenum_path = os.path.join(input_dir, f"{domain}-domeval.linenum".replace("downgraded_",""))

fuzzy_source_lines = read_file_lines(source_fuzzies_file)
fuzzy_lines = read_file_lines(translated_fuzzies_file)
linenum_lines = [int(line.split(":")[0].strip()) for line in read_file_lines(linenum_file)]
linenum_lines = [int(line.split(":")[0].strip()) for line in read_file_lines(linenum_path)]
fuzzy_ref_lines = [ref_lines[linenum-1] for linenum in linenum_lines]
nofuzzy_lines = [nofuzzies_trg_lines[linenum-1] for linenum in linenum_lines]
baseline_lines = [baseline_trg_lines[linenum-1] for linenum in linenum_lines]
@@ -157,12 +157,19 @@ def evaluate_domeval_domains(input_dir, trg_lang, tsv_file, all_ref_lines, nofuz
domain_specific_baselines = {}

# the crawled corpora are not included as indexes, but they exist in the data, so only add
# the tranlations if the translation file exists
index_domains = []
# the translations if the translation file exists (plus train and all_filtered indexes)
if "nobands" in system_id:
index_domains = ["nobands_train","nobands_all_filtered"]
else:
index_domains = ["train","all_filtered"]

for domain in domain_to_id_dict.keys():
domain_trg_file = os.path.join(input_dir, f"{domain}-domeval.{trg_lang}")
if os.path.exists(domain_trg_file):
index_domains.append(domain)
downgraded_domain_file = os.path.join(input_dir, f"downgraded_{domain}-domeval.{trg_lang}")
if os.path.exists(downgraded_domain_file):
index_domains.append(f"downgraded_{domain}")

for index_domain in index_domains:
domain_trg_file = os.path.join(input_dir, f"{index_domain}-domeval.{trg_lang}")
@@ -174,13 +181,18 @@ def evaluate_domeval_domains(input_dir, trg_lang, tsv_file, all_ref_lines, nofuz

for index_domain in index_domains:
domain_to_fuzzy_id[index_domain] = {}
with open(os.path.join(input_dir, f"{index_domain}-domeval.linenum"),'r') as linenum_file:
linenum_lines = {int(line.split(":")[0].strip())-1 for line in linenum_file.readlines()}

#both downgraded and normal fuzzies have the same linenum file
linenum_path = os.path.join(input_dir, f"{index_domain}-domeval.linenum".replace("downgraded_",""))
with open(linenum_path,'r') as linenum_file:
linenum_lines = [int(line.split(":")[0].strip())-1 for line in linenum_file.readlines()]
# get those ids from domain that have fuzzies with the given index domain
for domain in domains:
domain_to_fuzzy_id[index_domain][domain] = linenum_lines.intersection(domain_to_id_dict[domain])
domain_to_fuzzy_id[index_domain][domain] = sorted(set(linenum_lines).intersection(set(domain_to_id_dict[domain])))

with open(os.path.join(input_dir, f"{index_domain}-domeval.fuzzies"),'r') as fuzzy_file:
indexdomain_to_fuzzy_src[index_domain] = zip(linenum_lines, fuzzy_file.readlines())
fuzzy_lines = fuzzy_file.readlines()
indexdomain_to_fuzzy_src[index_domain] = list(zip(linenum_lines, fuzzy_lines))

for idx, domain in id_to_domain_dict.items():
if domain not in all_trg_lines.keys():
@@ -212,6 +224,7 @@ def evaluate_domeval_domains(input_dir, trg_lang, tsv_file, all_ref_lines, nofuz
# calculate baseline score for domain
baseline_metrics = calculate_metrics(domain_ref_lines, domain_baseline_lines)
report_file.write(f"{domain}\tbaseline\t{baseline_metrics.bleu}\t{baseline_metrics.chrf}\tN/A\tall\t0\t{system_id}\n")


for index_domain in index_domains:
domain_fuzzies = domain_to_fuzzy_id[index_domain][domain]
@@ -227,10 +240,16 @@ def evaluate_domeval_domains(input_dir, trg_lang, tsv_file, all_ref_lines, nofuz
fuzzy_ref_lines = [all_ref_lines[linenum] for linenum in domain_fuzzies]
fuzzy_trg_lines = [all_trg_lines[index_domain][linenum] for linenum in domain_fuzzies]

# TODO: the src is out of sync, FIX
domain_fuzzy_metrics = calculate_metrics(fuzzy_ref_lines, fuzzy_trg_lines, domain_fuzzy_src)
report_file.write(f"{domain}\t{index_domain}\t{domain_fuzzy_metrics.bleu}\t{domain_fuzzy_metrics.chrf}\t{domain_fuzzy_metrics.copyrate}\tonly_fuzzies\t{fuzzy_count}\t{system_id}\n")

#don't do this for downgraded
if not "downgraded_" in index_domain:
baseline_fuzzy_lines = [baseline_lines[linenum] for linenum in domain_fuzzies]

domain_baseline_fuzzy_metrics = calculate_metrics(fuzzy_ref_lines, baseline_fuzzy_lines)
report_file.write(f"{domain}\tbaseline_{index_domain}\t{domain_baseline_fuzzy_metrics.bleu}\t{domain_baseline_fuzzy_metrics.chrf}\tN/A\tonly_fuzzies\t{fuzzy_count}\t{system_id}\n")

baseline_mix_lines = combine_baseline_and_rat_domain(domain_fuzzies, baseline_lines, all_trg_lines[index_domain], domain_ids=domain_to_id_dict[domain])

domain_baseline_mix_metrics = calculate_metrics(domain_ref_lines, baseline_mix_lines)
@@ -256,14 +275,13 @@ def main():
nofuzzies_trg_lines = read_file_lines(nofuzzies_trg_path)

evaluate_domeval_domains(args.input_dir, args.trg_lang, args.domeval_ids, ref_lines, nofuzzies_trg_lines, baseline_lines, report_file, args.system_id)

for file_name in os.listdir(args.input_dir):
input_files = os.listdir(args.input_dir)
for file_name in input_files:
if file_name.endswith(f"-domeval.{args.trg_lang}"):
domain = file_name.replace(f"-domeval.{args.trg_lang}","")
evaluate_full_domeval(domain, args.input_dir, args.trg_lang, ref_lines, report_file, args.system_id, nofuzzies_trg_lines, baseline_lines)

evaluate_baseline(args.input_dir, args.trg_lang, ref_lines, report_file, args.baseline_translations, baseline_lines)


if __name__ == "__main__":
main()
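
Aside on the copy-rate metric in score-domeval.py: this commit drops the bsbleu import and computes the copy rate with a modified sacrebleu against pseudo-references built from the fuzzy matches, with the brevity penalty effectively disabled. Below is a minimal sketch of that idea using only stock sacrebleu; the helper name copy_rate and the precision-only combination are assumptions, not the repository's actual implementation.

import math
import sacrebleu

def copy_rate(hypotheses, pseudo_refs):
    # BLEU-style overlap with fuzzy-match pseudo-references: geometric mean
    # of n-gram precisions only, i.e. no brevity penalty (assumed behaviour).
    bleu = sacrebleu.corpus_bleu(hypotheses, [pseudo_refs])
    precisions = [p / 100 for p in bleu.precisions if p > 0]
    if not precisions:
        return 0.0
    return 100 * math.exp(sum(math.log(p) for p in precisions) / len(precisions))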
22 changes: 15 additions & 7 deletions pipeline/eval/translate-domeval.sh
@@ -56,7 +56,7 @@ mkdir -p "$domeval_dir"

# First find all files matching the pattern in the directory
if [ "$uses_bands" = true ] ; then
files=$(find "$data_directory" -type f -name "*-domeval.${src}.gz")
files=$(find "$data_directory" -type f -name "*-domeval.${src}.gz" ! -name "nobands*-domeval.${src}.gz")
else
files=$(find "$data_directory" -type f -name "nobands*-domeval.${src}.gz")
fi
@@ -90,16 +90,24 @@ for file in $files; do
# Run translate on the fuzzies file and generate the translated fuzzies file
translate "$fuzzies_file" "$translated_fuzzies_file"

# TODO: if mt system uses bands, downgrade fuzzies to lower band (except for lowest band the same)

if [ "$uses_bands" = true ] ; then

fi

# Create the output file for this input file
output_file="$domeval_dir/${basename}.${trg}"

python pipeline/eval/merge_domain_translations.py $"$domeval_dir/nofuzzies.${trg}" "$translated_fuzzies_file" "${line_numbers_file}" "${output_file}"

# if mt system uses bands, downgrade fuzzies to lower band (except for lowest band the same)
if [ "$uses_bands" = true ] ; then
downgraded_fuzzies_file="$domeval_dir/downgraded_${basename}.fuzzies"
translated_downgraded_file="$domeval_dir/downgraded_${basename}.translated_fuzzies"
python pipeline/eval/downgrade_fuzzies.py "$fuzzies_file" "$downgraded_fuzzies_file"
translate "$downgraded_fuzzies_file" "$translated_downgraded_file"

# Create the output file for this input file
downgraded_output_file="$domeval_dir/downgraded_${basename}.${trg}"

python pipeline/eval/merge_domain_translations.py $"$domeval_dir/nofuzzies.${trg}" "$translated_downgraded_file" "${line_numbers_file}" "${downgraded_output_file}"

fi

echo "Created merged output for $file as $output_file"
done
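
The banded branch above calls pipeline/eval/downgrade_fuzzies.py, which is not part of this commit. Purely as a hypothetical sketch of the band-downgrading step described in the comment (the FUZZY_BREAK_<band> marker format and the band values are assumptions, not the repository's convention):

# Hypothetical sketch; the real pipeline/eval/downgrade_fuzzies.py is not shown in this commit.
import re
import sys

BANDS = [50, 60, 70, 80, 90]  # assumed fuzzy-score bands, lowest first

def downgrade_line(line):
    # Rewrite each assumed FUZZY_BREAK_<band> marker to the next lower band;
    # the lowest band is left unchanged, mirroring the comment above.
    def repl(match):
        band = int(match.group(1))
        lower = max((b for b in BANDS if b < band), default=band)
        return f"FUZZY_BREAK_{lower}"
    return re.sub(r"FUZZY_BREAK_(\d+)", repl, line)

if __name__ == "__main__":
    in_path, out_path = sys.argv[1], sys.argv[2]
    with open(in_path, encoding="utf-8") as fin, open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            fout.write(downgrade_line(line))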
4 changes: 2 additions & 2 deletions profiles/slurm-lumi/config.yaml
@@ -24,7 +24,7 @@ config:
#- root=/pfs/lustrep1/scratch/project_462000088/members/niemine1/data
#- rocm=/opt/rocm
- workspace=40000
- numgpus=8
- numgpus=1
- mariancmake=""
- gpus="0 1 2 3 4 5 6 7"
- gpus="0"
- marianversion="lumi-marian"
132 changes: 132 additions & 0 deletions test_lcs.py
@@ -0,0 +1,132 @@
import argparse
import re
import gzip
import difflib
import sacremoses
import langcodes
import eflomal
import pathlib
import os
import json
from contextlib import nullcontext

def longest_common_token_sequence(tokens1, tokens2):
"""Find the single longest common sequence of tokens between two token lists using difflib."""
seq_matcher = difflib.SequenceMatcher(None, tokens1, tokens2)

# Find the longest matching block
match = max(seq_matcher.get_matching_blocks(), key=lambda m: m.size)

# Extract the longest sequence from tokens1 based on the match
return tokens1[match.a:match.a + match.size] if match.size > 0 else []

def process_match_line(line, source_sentence, target_sentence, targetsim, src_tokenizer, trg_tokenizer, normalizer):
"""Process a line in the match file and return the longest common sequence of tokens."""
# Split the line into items by tabs
items = re.split(r'\t+', line.strip())
i = 0
while i < len(items):
score = float(items[i]) # The score is the first tab-separated part
i += 1 # Move to the next part

# The next part will have the format [id]=[src] ||| [trg]
match_info = items[i]
match_id, match_text = match_info.split('=', 1)

# Split match_text by '|||' to get source and target matches
src_match, tgt_match = match_text.split('|||')

tokenized_src_match = src_tokenizer.tokenize(normalizer.normalize(src_match.strip()))
tokenized_trg_match = trg_tokenizer.tokenize(normalizer.normalize(tgt_match.strip()))

if targetsim:
# Tokenize target sentence and target match text
tokens1 = trg_tokenizer.tokenize(normalizer.normalize(target_sentence))
tokens2 = tokenized_trg_match
else:
# Tokenize source sentence and source match text
tokens1 = src_tokenizer.tokenize(normalizer.normalize(source_sentence))
tokens2 = tokenized_src_match

# Find the longest common sequence of tokens
lcs_tokens = longest_common_token_sequence(tokens1, tokens2)
yield (tokenized_src_match, tokenized_trg_match, lcs_tokens)
i += 1 # Move to the next match item


def main(source_file, target_file, match_file, priors_file, targetsim, src_lang, trg_lang):
src_tokenizer = sacremoses.MosesTokenizer(lang=langcodes.standardize_tag(src_lang))
trg_tokenizer = sacremoses.MosesTokenizer(lang=langcodes.standardize_tag(trg_lang))
src_detokenizer = sacremoses.MosesDetokenizer(lang=langcodes.standardize_tag(src_lang))
trg_detokenizer = sacremoses.MosesDetokenizer(lang=langcodes.standardize_tag(trg_lang))
normalizer = sacremoses.MosesPunctNormalizer()
if not targetsim:
aligner = eflomal.Aligner()
else:
aligner = None

# temp dir to store intermediate eflomal files
temp_dir = os.path.join(os.path.dirname(source_file),"eflomal_tmp")
if not targetsim and not os.path.exists(temp_dir):
os.makedirs(temp_dir)

ngram_file = source_file.replace(".gz",".ngrams.gz")
if ngram_file == source_file:
ngram_file = source_file + ".ngrams"

# Open all files in the main function using the with statement
with gzip.open(source_file, 'rt', encoding='utf-8') as source_f, \
gzip.open(target_file, 'rt', encoding='utf-8') as target_f, \
gzip.open(match_file, 'rt', encoding='utf-8') as match_f, \
gzip.open(ngram_file, 'wt', encoding='utf-8') as ngram_f, \
gzip.open(os.path.join(temp_dir,"source.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext() as eflomal_src, \
gzip.open(os.path.join(temp_dir,"target.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext() as eflomal_trg, \
gzip.open(os.path.join(temp_dir,"matches.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext() as eflomal_matches:
# Iterate over lines of source, target, and match files together
for index, (source_sentence, target_sentence, match_line) in enumerate(zip(source_f, target_f, match_f)):
if len(match_line.strip())==0:
continue
source_sentence = source_sentence.strip()
target_sentence = target_sentence.strip()
match_line = match_line.strip()

# Process each match line along with corresponding source/target sentences
match_items = process_match_line(match_line, source_sentence, target_sentence, targetsim, src_tokenizer, trg_tokenizer, normalizer)

ngrams = []
for (tokenized_src_match, tokenized_trg_match, lcs_tokens) in match_items:
if targetsim:
ngrams.append(trg_detokenizer.detokenize(lcs_tokens))
else:
eflomal_src.write(f"{tokenized_src_match}\n")
eflomal_trg.write(f"{tokenized_trg_match}\n")
eflomal_matches.write(json.dumps((index,lcs_tokens)))
if targetsim:
ngram_f.write("\t".join(ngrams)+"\n")

#TODO: align the files in eflomal_tmp, then look for corresponding target tokens for the source tokens in the lcs (dropping tokens as necessary, as long as the match remains fairly long)


""".generate_eflomal_priors.py.swp # align source to target
priors_path = pathlib.Path(priors_file)
align_matches = lambda x: aligner.align(
[y[1] for y in x], [y[2] for y in x],
links_filename_fwd=f"{priors_path.stem}.fwd", links_filename_rev=f"{priors_path.stem}.rev",
priors_input=priors_file)
align_matches(results)
"""

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Find longest common token sequence between source/target sentences and match file.")
parser.add_argument("--source_file", required=True, help="Path to the gzipped source sentence file")
parser.add_argument("--target_file", required=True, help="Path to the gzipped target sentence file")
parser.add_argument("--match_file", required=True, help="Path to the gzipped match file")
parser.add_argument("--src_lang", required=True, help="Source lang, three-letter code")
parser.add_argument("--trg_lang", required=True, help="Target lang, three-letter code")
parser.add_argument("--targetsim", action='store_true', help="If set, compares with target sentences; otherwise, compares with source")
parser.add_argument("--priors_file", required=True, help="Elfomal alignment priors")
args = parser.parse_args()

main(args.source_file, args.target_file, args.match_file, args.priors_file, args.targetsim, args.src_lang, args.trg_lang)
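
The heart of the new test_lcs.py is longest_common_token_sequence, which relies on difflib.SequenceMatcher to pull the single longest common run of tokens out of a sentence and its fuzzy match. A quick self-contained illustration of that step (the example tokens are made up):

import difflib

tokens1 = "the quick brown fox jumps over the fence".split()
tokens2 = "a quick brown fox runs along the fence".split()

# get_matching_blocks() returns every maximal matching run plus a zero-size sentinel;
# taking the largest block gives the longest common token sequence.
match = max(difflib.SequenceMatcher(None, tokens1, tokens2).get_matching_blocks(),
            key=lambda m: m.size)
print(tokens1[match.a:match.a + match.size])  # ['quick', 'brown', 'fox']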
