forked from mozilla/translations
more rat work, started ngram augmentation implementation
Tommi Nieminen committed Oct 20, 2024
1 parent 21412a8 commit 4962d9e
Showing 4 changed files with 182 additions and 24 deletions.
@@ -0,0 +1,132 @@
import argparse
import re
import gzip
import difflib
import sacremoses
import langcodes
import eflomal
import pathlib
import os
import json
from contextlib import nullcontext

def longest_common_token_sequence(tokens1, tokens2):
    """Find the single longest common sequence of tokens between two token lists using difflib."""
    seq_matcher = difflib.SequenceMatcher(None, tokens1, tokens2)

    # Find the longest matching block
    match = max(seq_matcher.get_matching_blocks(), key=lambda m: m.size)

    # Extract the longest sequence from tokens1 based on the match
    return tokens1[match.a:match.a + match.size] if match.size > 0 else []

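# Worked example of the helper above: for tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
# and tokens2 = ["a", "cat", "sat", "on", "a", "mat"], the longest matching block found by
# SequenceMatcher is ["cat", "sat", "on"], so that list is returned.
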
def process_match_line(line, source_sentence, target_sentence, targetsim, src_tokenizer, trg_tokenizer, normalizer):
    """Process a line in the match file and yield the longest common token sequence for each match."""
    # Split the line into items by tabs
    items = re.split(r'\t+', line.strip())
    i = 0
    while i < len(items):
        score = float(items[i])  # The score is the first tab-separated part
        i += 1  # Move to the next part

        # The next part will have the format [id]=[src] ||| [trg]
        match_info = items[i]
        match_id, match_text = match_info.split('=', 1)

        # Split match_text by '|||' to get source and target matches
        src_match, tgt_match = match_text.split('|||')

        tokenized_src_match = src_tokenizer.tokenize(normalizer.normalize(src_match.strip()))
        tokenized_trg_match = trg_tokenizer.tokenize(normalizer.normalize(tgt_match.strip()))

        if targetsim:
            # Tokenize target sentence and target match text
            tokens1 = trg_tokenizer.tokenize(normalizer.normalize(target_sentence))
            tokens2 = tokenized_trg_match
        else:
            # Tokenize source sentence and source match text
            tokens1 = src_tokenizer.tokenize(normalizer.normalize(source_sentence))
            tokens2 = tokenized_src_match

        # Find the longest common sequence of tokens
        lcs_tokens = longest_common_token_sequence(tokens1, tokens2)
        yield (tokenized_src_match, tokenized_trg_match, lcs_tokens)
        i += 1  # Move to the next match item

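# The parsing above assumes each match line is a sequence of tab-separated (score, match)
# pairs, i.e. something like this hypothetical line (values are illustrative only):
#   0.87<TAB>12=fuzzy source sentence ||| fuzzy target sentence<TAB>0.75<TAB>34=...
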
def main(source_file, target_file, match_file, priors_file, targetsim, src_lang, trg_lang):
    src_tokenizer = sacremoses.MosesTokenizer(lang=langcodes.standardize_tag(src_lang))
    trg_tokenizer = sacremoses.MosesTokenizer(lang=langcodes.standardize_tag(trg_lang))
    src_detokenizer = sacremoses.MosesDetokenizer(lang=langcodes.standardize_tag(src_lang))
    trg_detokenizer = sacremoses.MosesDetokenizer(lang=langcodes.standardize_tag(trg_lang))
    normalizer = sacremoses.MosesPunctNormalizer()
    if not targetsim:
        aligner = eflomal.Aligner()
    else:
        aligner = None

    # temp dir to store intermediate eflomal files
    temp_dir = os.path.join(os.path.dirname(source_file), "eflomal_tmp")
    if not targetsim and not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    ngram_file = source_file.replace(".gz", ".ngrams.gz")
    if ngram_file == source_file:
        ngram_file = source_file + ".ngrams"

    # Open all files in the main function using the with statement
    with gzip.open(source_file, 'rt', encoding='utf-8') as source_f, \
         gzip.open(target_file, 'rt', encoding='utf-8') as target_f, \
         gzip.open(match_file, 'rt', encoding='utf-8') as match_f, \
         gzip.open(ngram_file, 'wt', encoding='utf-8') as ngram_f, \
         (gzip.open(os.path.join(temp_dir, "source.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext()) as eflomal_src, \
         (gzip.open(os.path.join(temp_dir, "target.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext()) as eflomal_trg, \
         (gzip.open(os.path.join(temp_dir, "matches.gz"), 'wt', encoding='utf-8') if not targetsim else nullcontext()) as eflomal_matches:
        # Iterate over lines of source, target, and match files together
        for index, (source_sentence, target_sentence, match_line) in enumerate(zip(source_f, target_f, match_f)):
            if len(match_line.strip()) == 0:
                continue
            source_sentence = source_sentence.strip()
            target_sentence = target_sentence.strip()
            match_line = match_line.strip()

            # Process each match line along with corresponding source/target sentences
            match_items = process_match_line(match_line, source_sentence, target_sentence, targetsim, src_tokenizer, trg_tokenizer, normalizer)

            ngrams = []
            for (tokenized_src_match, tokenized_trg_match, lcs_tokens) in match_items:
                if targetsim:
                    ngrams.append(trg_detokenizer.detokenize(lcs_tokens))
                else:
                    # Write the match pair as whitespace-joined token lines for eflomal
                    eflomal_src.write(" ".join(tokenized_src_match) + "\n")
                    eflomal_trg.write(" ".join(tokenized_trg_match) + "\n")
                    eflomal_matches.write(json.dumps((index, lcs_tokens)) + "\n")
            if targetsim:
                ngram_f.write("\t".join(ngrams) + "\n")

    # TODO: align the files in eflomal_tmp, then look for corresponding target tokens
    # for the source tokens in the lcs (dropping tokens as necessary, as long as the
    # match remains fairly long)

""".generate_eflomal_priors.py.swp # align source to target | ||
priors_path = pathlib.Path(priors_file) | ||
align_matches = lambda x: aligner.align( | ||
[y[1] for y in x], [y[2] for y in x], | ||
links_filename_fwd=f"{priors_path.stem}.fwd", links_filename_rev=f"{priors_path.stem}.rev", | ||
priors_input=priors_file) | ||
align_matches(results) | ||
""" | ||
|
||
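    # A possible shape for the TODO above (a rough sketch, not implemented in this commit,
    # assuming the eflomal links files use the usual "srcIdx-trgIdx" word-alignment format):
    # run aligner.align() on the tokenized files in eflomal_tmp, then read the forward
    # links file together with matches.gz; for each (index, lcs_tokens) record, collect
    # the target tokens aligned to the source positions of the lcs tokens and detokenize
    # them with trg_detokenizer, dropping unaligned tokens as long as the remaining span
    # stays reasonably long.
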
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Find longest common token sequence between source/target sentences and match file.")
    parser.add_argument("--source_file", required=True, help="Path to the gzipped source sentence file")
    parser.add_argument("--target_file", required=True, help="Path to the gzipped target sentence file")
    parser.add_argument("--match_file", required=True, help="Path to the gzipped match file")
    parser.add_argument("--src_lang", required=True, help="Source lang, three-letter code")
    parser.add_argument("--trg_lang", required=True, help="Target lang, three-letter code")
    parser.add_argument("--targetsim", action='store_true', help="If set, compares with target sentences; otherwise, compares with source")
    parser.add_argument("--priors_file", required=True, help="Eflomal alignment priors")
    args = parser.parse_args()

    main(args.source_file, args.target_file, args.match_file, args.priors_file, args.targetsim, args.src_lang, args.trg_lang)
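A hypothetical invocation of the new script (the script name, file names and language codes below are placeholders, not taken from the commit):

    python find_ngrams.py --source_file corpus.src.gz --target_file corpus.trg.gz --match_file corpus.matches.gz --priors_file corpus.priors --src_lang fin --trg_lang eng --targetsim

With --targetsim set, the longest common token sequences are detokenized and written to a .ngrams.gz file next to the source file; without it, the script only dumps the tokenized match pairs into eflomal_tmp for the alignment step that is still marked as a TODO in this commit.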