forked from mozilla/translations
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
more nodalida work with rat and domain evaluation
- Loading branch information
Tommi Nieminen
committed
Oct 8, 2024
1 parent
975aa0e
commit 8f403db
Showing
18 changed files
with
511 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#!/bin/bash

##
# Evaluate a model with domain data.
#
# Usage:
#   <this script> data_directory result_directory src trg marian_decoder \
#       decoder_config [extra decoder arguments...]
#
# Arguments:
#   $1  data_directory    directory searched for *-domeval.<src>.gz files
#   $2  result_directory  directory under which the results are written
#   $3  src               source language code (used in file names)
#   $4  trg               target language code (used in file names)
#   $5  marian_decoder    decoder executable to run
#   $6  decoder_config    decoder config file; its directory is the model dir
#   $7+                   passed through to the decoder unchanged
#

set -x
set -euo pipefail

echo "###### Evaluation of a model"

# Fail with a readable message instead of an "unbound variable" error.
if (( $# < 6 )); then
  echo "Usage: $0 data_directory result_directory src trg marian_decoder decoder_config [decoder args...]" >&2
  exit 1
fi

data_directory=$1
result_directory=$2
src=$3
trg=$4
marian_decoder=$5
decoder_config=$6
# The model directory is wherever the decoder config lives; its last path
# component is used later to recognise OPUS-style models.
model_dir=$(dirname "${decoder_config}")
model_step=$(basename "${model_dir}")
args=( "${@:7}" )

# Create the result directory itself, not a directory named after its last
# path component relative to the current working directory.
mkdir -p "${result_directory}"
|
||
#######################################
# Translate one text file with the configured decoder.
#
# When ${model_step} contains "opus", the input is first run through
# "${MARIAN}/spm_encode" (written next to the input as "<input>.sp") and the
# raw decoder output is kept as "<output>.sp" while the spm_decode'd text is
# written to the output path.
#
# Globals (read):
#   model_step, model_dir, marian_decoder, decoder_config, args
#   MARIAN - directory holding spm_encode/spm_decode (opus models only)
# Arguments:
#   $1 - path of the source text file
#   $2 - path of the translation to write; the decoder log goes to "$2.log"
# Outputs:
#   A progress message on stdout.
#######################################
translate() {
  local source_file=$1
  local output_file=$2
  local is_opus=false
  local source_spm_path target_spm_path sp_source_file sp_output_file

  if [[ "${model_step}" == *opus* ]]; then
    is_opus=true
    source_spm_path="${model_dir}/source.spm"
    target_spm_path="${model_dir}/target.spm"
    sp_source_file="${source_file}.sp"
    "${MARIAN}/spm_encode" --model "${source_spm_path}" \
      < "${source_file}" > "${sp_source_file}"
    # From here on the decoder reads the encoded text.
    source_file=${sp_source_file}
  fi

  echo "Translating $source_file to $output_file..."
  # ${args[@]+...} keeps an empty args array safe under `set -u` on bash
  # versions older than 4.4.
  "${marian_decoder}" \
    -c "${decoder_config}" \
    --input "${source_file}" \
    --quiet \
    --quiet-translation \
    --log "${output_file}.log" \
    ${args[@]+"${args[@]}"} > "${output_file}"

  if [[ "${is_opus}" == true ]]; then
    sp_output_file="${output_file}.sp"
    mv "${output_file}" "${sp_output_file}"
    "${MARIAN}/spm_decode" --model "${target_spm_path}" \
      < "${sp_output_file}" > "${output_file}"
  fi
}
|
||
# Domain evaluation driver.
#
# For every *-domeval.<src>.gz file under the data directory:
#   1. the lines containing FUZZY_BREAK are extracted and translated,
#   2. those translations are merged (by line number) over a translation of
#      the same text with everything up to "FUZZY_BREAK " stripped off.
# Everything is written under <result_directory>/domeval.
domeval_dir="${result_directory}/domeval"

# Create the domeval subdirectory in the output directory
mkdir -p "${domeval_dir}"

# Find all files matching the pattern in the directory. The list is sorted so
# that the choice of "first file" below does not depend on directory order.
file_list=$(find "${data_directory}" -type f -name "*-domeval.${src}.gz" | LC_ALL=C sort)
if [[ -z "${file_list}" ]]; then
  echo "No *-domeval.${src}.gz files found in ${data_directory}" >&2
  exit 1
fi

# Strip everything up to and including "FUZZY_BREAK " from the first file,
# save the result as nofuzzies.<src> and translate it.
IFS= read -r first_file <<< "${file_list}"
first_file_basename=$(basename "${first_file}" ".${src}.gz")
gunzip -c "${first_file}" \
  | sed 's/.*FUZZY_BREAK //' > "${domeval_dir}/nofuzzies.${src}"

translate "${domeval_dir}/nofuzzies.${src}" "${domeval_dir}/nofuzzies.${trg}"

# Create the reference file from the target side of the first file.
# NOTE(review): the target file is looked up directly in data_directory even
# though find searches recursively - confirm the files are never nested.
ref_file="${domeval_dir}/domeval.${trg}.ref"
gunzip -c "${data_directory}/${first_file_basename}.${trg}.gz" > "${ref_file}"

# Translate domeval with non-domain specific train and all_filtered indexes

# Iterate over each file found in the directory. The list is read line by
# line on file descriptor 3, so paths containing spaces stay intact and
# commands inside the loop cannot consume the list from stdin.
while IFS= read -r -u 3 file; do
  file_stem=$(basename "${file}" ".${src}.gz")
  fuzzies_file="${domeval_dir}/${file_stem}.fuzzies"
  line_numbers_file="${domeval_dir}/${file_stem}.linenum"
  translated_fuzzies_file="${domeval_dir}/${file_stem}.translated_fuzzies"

  # Store the FUZZY_BREAK lines, prefixed with "<line number>:", in the
  # .linenum file. grep exits with 1 when nothing matches; that is not an
  # error here, so only exit codes above 1 are allowed to abort the script.
  gunzip -c "${file}" \
    | { grep -n 'FUZZY_BREAK' || [[ $? -eq 1 ]]; } > "${line_numbers_file}"

  # Drop the "<line number>:" prefix to get the plain .fuzzies file
  cut -d: -f2- "${line_numbers_file}" > "${fuzzies_file}"

  # Run translate on the fuzzies file and generate the translated fuzzies file
  translate "${fuzzies_file}" "${translated_fuzzies_file}"

  # Create the output file for this input file
  output_file="${domeval_dir}/${file_stem}.${trg}"

  # The script path is relative, so this must run from the repository root.
  python pipeline/eval/merge_domain_translations.py \
    "${domeval_dir}/nofuzzies.${trg}" \
    "${translated_fuzzies_file}" \
    "${line_numbers_file}" \
    "${output_file}"

  echo "Created merged output for $file as $output_file"
done 3<<< "${file_list}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import argparse | ||
|
||
def replace_fuzzy_lines(non_fuzzy_file, fuzzy_file, fuzzy_line_number_file, output_path):
    """Overlay fuzzy translations onto the non-fuzzy translations.

    The i-th line of ``fuzzy_file`` replaces the line of ``non_fuzzy_file``
    whose 1-based number is given by the i-th entry of
    ``fuzzy_line_number_file``. The merged text is written to ``output_path``.

    Args:
        non_fuzzy_file: Path to the non-fuzzy translation file.
        fuzzy_file: Path to the fuzzy translation file.
        fuzzy_line_number_file: Path to a file with one entry per line; the
            text before the first ``:`` is the 1-based line number, anything
            after it is ignored. Blank lines are skipped.
        output_path: Path the merged translations are written to.

    Raises:
        ValueError: If a line-number entry does not start with an integer.
        IndexError: If an in-range line number has no corresponding line in
            ``fuzzy_file``.
    """
    # Read lines from the non-fuzzy translation file
    with open(non_fuzzy_file, 'r', encoding='utf-8') as nf:
        non_fuzzy_lines = nf.readlines()

    # Read lines from the fuzzy translation file
    with open(fuzzy_file, 'r', encoding='utf-8') as f:
        fuzzy_lines = f.readlines()

    # Read the fuzzy line numbers (1-based index); skip blank lines so that a
    # stray empty line does not crash int().
    with open(fuzzy_line_number_file, 'r', encoding='utf-8') as fln:
        fuzzy_line_numbers = [
            int(line.strip().split(":")[0]) for line in fln if line.strip()
        ]

    # Replace lines in non-fuzzy lines with those from fuzzy lines
    for line_number_index, line_number in enumerate(fuzzy_line_numbers):
        # Line numbers outside the non-fuzzy file are silently ignored
        if 1 <= line_number <= len(non_fuzzy_lines):
            replacement = fuzzy_lines[line_number_index]
            # The last line of the fuzzy file may lack a newline; without one
            # the replacement would run into the following output line.
            if not replacement.endswith('\n'):
                replacement += '\n'
            non_fuzzy_lines[line_number - 1] = replacement

    # Write the modified lines to the output file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(non_fuzzy_lines)
|
||
def main():
    """Command-line entry point.

    Reads four positional paths from the command line (non-fuzzy translations,
    fuzzy translations, fuzzy line numbers, output) and hands them to
    ``replace_fuzzy_lines`` in that order.
    """
    # Positional arguments in the order they are expected on the command line
    positional_arguments = (
        ('non_fuzzy_file', 'Path to the non-fuzzy translation file.'),
        ('fuzzy_file', 'Path to the fuzzy translation file.'),
        ('fuzzy_line_number_file', 'Path to the file containing fuzzy line numbers.'),
        ('output_path', 'Path to the output file where modified content will be saved.'),
    )

    cli = argparse.ArgumentParser(
        description='Replace lines in a non-fuzzy translation file with lines from a fuzzy translation file based on provided line numbers.'
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)

    parsed = cli.parse_args()

    # Perform the merge with the parsed paths
    replace_fuzzy_lines(
        parsed.non_fuzzy_file,
        parsed.fuzzy_file,
        parsed.fuzzy_line_number_file,
        parsed.output_path,
    )


if __name__ == '__main__':
    main()
|
Oops, something went wrong.