From 57a297ecc2c461b8e35a23765172b854ef3846ef Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Mon, 25 Nov 2024 16:24:07 +0100 Subject: [PATCH 1/4] New `lt-merge` command to merge LU's from BEG to END tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit $ echo '^ikke/ikke$ ^«/«$^så/så$ ^veldig/v$^»/»$ ^bra/bra$' | lt-merge ^ikke/ikke$ ^«så veldig»/«så veldig»$ ^bra/bra$ Mostly simple, but escaping can look a bit messy. If any of the LU's have word-bound blanks, the [] need escaping: $ echo '^«/«$[[tf:i:a]]^veldig/veldig$[[/]]^»/»$' | lttoolbox/lt-merge ^«\[\[tf:i:a\]\]veldig\[\[\/\]\]»/«\[\[tf:i:a\]\]veldig\[\[\/\]\]»$ to ensure we have legal stream format. If any of the forms contain already escaped chars, these now need double-escaping. Why? Because we need to run an "unmerge" step towards the end of the pipeline, while still outputting Apertium Stream Format, and need to know the difference between a \[ meaning word-blank or \\[ meaning literal [. $ echo '^ikke/ikke$ ^«/«$^til/til$ ^x\@y.com/x\@y.com$^»/»$ ^da/da$' | lttoolbox/lt-merge ^ikke/ikke$ ^«til x\\\@y.com»/«til x\\\@y.com»$ ^da/da$ If we run lt-merge between analysis and wblank-attach, then after the `lt-proc -b generator.bin` step we should have e.g. ^ikkje/ikkje$ ^«til x\\\@y.com»/«til x\\\@y.com»$ ^då/då$ which after `cg-proc -1 -n -g genprefs.bin` would turn into ikkje «til x\@y.com» då Note how \\\@ turned into \@ – we removed one layer of quoting, but this is still in the apertium stream so special chars stay quoted until the final tf-inject. TODO: * We need to be able to pass MERGED stuff unchanged through biltrans and generator, would like to `` but `ANY_CHAR` isn't supported yet in `lt-proc -b`. 
* We need an `lt-merge --unmerge` to undo the merge: $ echo '^ikkje/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$' | lt-merge --unmerge ^ikkje/ikkje$ «[[tf:i:a]]s\^å[[/]]» which then becomes $ echo '^ikkje/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin ikkje «[[tf:i:a]]s\^å[[/]]» which `tf-inject` is happy to handle. --- .gitignore | 1 + lttoolbox/CMakeLists.txt | 7 +++- lttoolbox/fst_processor.cc | 77 ++++++++++++++++++++++++++++++++++++++ lttoolbox/fst_processor.h | 10 +++++ lttoolbox/lt-merge.1 | 40 ++++++++++++++++++++ lttoolbox/lt_merge.cc | 48 ++++++++++++++++++++++++ tests/lt_merge/__init__.py | 42 +++++++++++++++++++++ tests/run_tests.py | 4 +- 8 files changed, 226 insertions(+), 3 deletions(-) create mode 100644 lttoolbox/lt-merge.1 create mode 100644 lttoolbox/lt_merge.cc create mode 100644 tests/lt_merge/__init__.py diff --git a/.gitignore b/.gitignore index e3cbc8f..d60b4ca 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ lttoolbox/liblttoolbox.so.* /lttoolbox/lt-comp /lttoolbox/lt-compose /lttoolbox/lt-proc +/lttoolbox/lt-merge /lttoolbox/lt-trim /lttoolbox/Makefile /lttoolbox/Makefile.in diff --git a/lttoolbox/CMakeLists.txt b/lttoolbox/CMakeLists.txt index 20b0729..b9b79ec 100644 --- a/lttoolbox/CMakeLists.txt +++ b/lttoolbox/CMakeLists.txt @@ -103,6 +103,9 @@ target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB}) add_executable(lt-proc lt_proc.cc) target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB}) +add_executable(lt-merge lt_merge.cc) +target_link_libraries(lt-merge lttoolbox ${GETOPT_LIB}) + add_executable(lt-expand lt_expand.cc) target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB}) @@ -144,11 +147,11 @@ install(TARGETS lttoolbox ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES ${LIBLTTOOLBOX_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox) -install(TARGETS lt-append 
lt-print lt-trim lt-compose lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx +install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-merge lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd DESTINATION ${CMAKE_INSTALL_DATADIR}/lttoolbox) -install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1 +install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-merge.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index a908ea2..81446b9 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -1931,6 +1931,83 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) return compose(result, ""_u, with_delim, mark); } +void +FSTProcessor::quoteMerge(InputFile& input, UFILE *output) +{ + StreamReader reader(&input); + reader.alpha = &alphabet; + reader.add_unknowns = true; + + bool merging = false; + UString surface; + while (!reader.at_eof) { + reader.next(); + + bool end_merging = false; + + for (StreamReader::Reading &it : reader.readings) { + // TODO: look up in it.symbols instead (but need to make an alphabet then) + if(it.content.find(u"") != std::string::npos) { + merging = true; + } + if(it.content.find(u"") != std::string::npos) { + end_merging = true; + } + } + if(merging) { + if(surface.size() > 0) { + surface += reader.blank; + appendEscaped(surface, reader.wblank); + } + else { + // The initial blank should just be output before the merged LU: + write(reader.blank, output); + write(reader.wblank, output); + } + if(reader.readings.size() > 0) { + // Drop possible unknown marks. 
+ // Double-escape the form since we'll unescape during lt-unmerge: + appendEscaped(surface, reader.readings[0].content); + } + } + else { + write(reader.blank, output); + write(reader.wblank, output); + if(reader.readings.size() > 0) { + // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0. + // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$) + u_fputc('^', output); + bool seen_reading = false; + for (StreamReader::Reading &it : reader.readings) { + if (seen_reading) { + u_fputc('/', output); + } + if(it.mark != '\0') { u_fputc(it.mark, output); } + write(it.content, output); + seen_reading = true; + } + u_fputc('$', output); + } + } + if(end_merging || reader.at_null) { + if (merging) { + u_fputc('^', output); + write(surface, output); + u_fputc('/', output); + write(surface, output); + write("$"_u, output); + merging = false; + } + end_merging = false; + surface.clear(); + if(reader.at_null) { + u_fputc('\0', output); + u_fflush(output); + } + } + } +} + bool FSTProcessor::valid() const diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index a5641ee..c825594 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -462,6 +462,15 @@ class FSTProcessor } } + void appendEscaped(UString& to, const UString& from) { + for(auto &c : from) { + if (escaped_chars.find(c) != escaped_chars.end()) { + to += u'\\'; + } + to += c; + } + } + public: /* @@ -496,6 +505,7 @@ class FSTProcessor UString biltrans(UStringView input_word, bool with_delim = true); UString biltransfull(UStringView input_word, bool with_delim = true); void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + void quoteMerge(InputFile& input, UFILE *output); std::pair biltransWithQueue(UStringView input_word, bool with_delim = true); UString biltransWithoutQueue(UStringView input_word, bool with_delim = true); void SAO(InputFile& input, UFILE *output); diff 
--git a/lttoolbox/lt-merge.1 b/lttoolbox/lt-merge.1 new file mode 100644 index 0000000..1ae5fb1 --- /dev/null +++ b/lttoolbox/lt-merge.1 @@ -0,0 +1,40 @@ +.Dd December 10, 2024 +.Dt LT-MERGE 1 +.Os Apertium +.Sh NAME +.Nm lt-merge +.Nd lexical merger for Apertium +.Sh SYNOPSIS +.Nm lt-merge +.Op Fl u +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +.Nm lt-merge +is the application responsible for merging and unmerging +lexical units. +.Pp +It joins the lexical units from the one tagged BEG through the one tagged END into a single lexical unit. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl u , Fl Fl unmerge +Run in reverse; this splits previously merged words. +.It Fl v , Fl Fl version +Display the version number. +.It Fl h , Fl Fl help +Display this help. +.El +\" .Sh FILES +\" .Bl -tag -width Ds +\" .It Ar input_file +\" The input compiled dictionary. +\" .El +.Sh SEE ALSO +.Xr apertium 1 , +.Xr lt-proc 1 +.Sh COPYRIGHT +Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante. +This is free software. +You may redistribute copies of it under the terms of +.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . +.Sh BUGS +Many... lurking in the dark and waiting for you! diff --git a/lttoolbox/lt_merge.cc b/lttoolbox/lt_merge.cc new file mode 100644 index 0000000..9198f73 --- /dev/null +++ b/lttoolbox/lt_merge.cc @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2024 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include + + +int main(int argc, char *argv[]) // entry point: parse the CLI, open input/output, then run the merge +{ + LtLocale::tryToSetLocale(); + CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION); + cli.add_file_arg("input_file"); + cli.add_file_arg("output_file"); + cli.add_bool_arg('u', "unmerge", "Undo the merge"); + cli.add_bool_arg('z', "null-flush", "flush output on the null character"); + cli.parse_args(argc, argv); + + auto strs = cli.get_strs(); + bool unmerge = cli.get_bools()["unmerge"]; + InputFile input; + if (!cli.get_files()[0].empty()) { // test the slot we actually open: [0] is input_file, [1] is output_file + input.open_or_exit(cli.get_files()[0].c_str()); + } + UFILE* output = openOutTextFile(cli.get_files()[1]); + + FSTProcessor fstp; + fstp.setNullFlush(true); // cf. description of cli["null-flush"] + fstp.initBiltrans(); + fstp.quoteMerge(input, output); + + return 0; +} diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py new file mode 100644 index 0000000..a2c94ff --- /dev/null +++ b/tests/lt_merge/__init__.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +import unittest +from basictest import ProcTest +import unittest + +class MergeTest(unittest.TestCase, ProcTest): + inputs = ['^nochange$'] + expectedOutputs = ['^nochange$'] + procflags = [] + + def compileTest(self, tmpd): + return True # "pass" + + def openProc(self, tmpd): + return self.openPipe('lt-merge', self.procflags+[]) + + +class SimpleTest(MergeTest): + inputs = ['^ikke/ikke$ ^«/«$^så/så$ ^veldig/v$^»/»$ ^bra/bra$' ] + expectedOutputs = ['^ikke/ikke$ ^«så veldig»/«så veldig»$ ^bra/bra$'] + + +class SingleTest(MergeTest): + inputs = ['^not/very$' ] + expectedOutputs = ['^not/not$'] + + +class UnknownTest(MergeTest): + inputs = ['^foo/*foo$' ] + expectedOutputs = ['^foo/*foo$'] + + +class EscapeTest(MergeTest): + # Using r'' to avoid doubling escapes even more: + inputs = [r'^ikke/ikke$ ^«/«$^så/så$
^ve\[dig/v$^»/»$ ^bra/bra$'] + expectedOutputs = [r'^ikke/ikke$ ^«så ve\\\[dig»/«så ve\\\[dig»$ ^bra/bra$'] + + +class WordblankTest(MergeTest): + # Using r'' to avoid doubling escapes even more: + inputs = [r'^«/«$[[tf:i:a]]^ve\/ldig/v$[[/]]^»/»$'] + expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»$'] diff --git a/tests/run_tests.py b/tests/run_tests.py index e222bbb..799fa06 100755 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -11,7 +11,9 @@ modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append', 'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose', - 'lt_tmxproc'] + 'lt_tmxproc', 'lt_merge'] + +# modules = ['lt_merge'] if __name__ == "__main__": From 1925a7e2c26f42efa90309abc2d1cc7a2eae68aa Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Wed, 18 Dec 2024 23:53:28 +0100 Subject: [PATCH 2/4] Implement `lt-merge --unmerge` cf. HEAD^ --- lttoolbox/fst_processor.cc | 62 ++++++++++++++++++++++++++++++++++++++ lttoolbox/fst_processor.h | 1 + lttoolbox/lt_merge.cc | 7 ++++- tests/lt_merge/__init__.py | 14 +++++++++ tests/run_tests.py | 2 -- 5 files changed, 83 insertions(+), 3 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 81446b9..0e5330b 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -2009,6 +2009,68 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output) } +void +FSTProcessor::quoteUnmerge(InputFile &input, UFILE *output) +{ + StreamReader reader(&input); + reader.alpha = &alphabet; + reader.add_unknowns = true; + + UString surface; + while (!reader.at_eof) { + reader.next(); + bool unmerging = false; + for (StreamReader::Reading &it : reader.readings) { + // TODO: look up in it.symbols instead (but need to make an alphabet then) + if(it.content.find(u"") != std::string::npos) { + unmerging = true; + } + } + write(reader.blank, output); + write(reader.wblank, output); + if(unmerging) { + // Just output the last 
reading (surface form), removing one level of escaping + StreamReader::Reading &lastReading = reader.readings.back(); // (we know there's at least one because of the above loop) + UString surface; + bool escaping = false; + for(auto &c : lastReading.content) { + if(escaping) { + surface += c; + escaping = false; + } + else if(c == u'\\') { + escaping = true; + } + else { + surface += c; + } + } + write(surface, output); + } + else { + if(reader.readings.size() > 0) { + // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0. + // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$) + u_fputc('^', output); + bool seen_reading = false; + for (StreamReader::Reading &it : reader.readings) { + if (seen_reading) { + u_fputc('/', output); + } + if(it.mark != '\0') { u_fputc(it.mark, output); } + write(it.content, output); + seen_reading = true; + } + u_fputc('$', output); + } + } + if(reader.at_null) { + u_fputc('\0', output); + u_fflush(output); + } + } +} + bool FSTProcessor::valid() const { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index c825594..d53556f 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -506,6 +506,7 @@ class FSTProcessor UString biltransfull(UStringView input_word, bool with_delim = true); void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); void quoteMerge(InputFile& input, UFILE *output); + void quoteUnmerge(InputFile& input, UFILE *output); std::pair biltransWithQueue(UStringView input_word, bool with_delim = true); UString biltransWithoutQueue(UStringView input_word, bool with_delim = true); void SAO(InputFile& input, UFILE *output); diff --git a/lttoolbox/lt_merge.cc b/lttoolbox/lt_merge.cc index 9198f73..bcf6a43 100644 --- a/lttoolbox/lt_merge.cc +++ b/lttoolbox/lt_merge.cc @@ -42,7 +42,12 @@ int main(int argc, char *argv[]) FSTProcessor fstp; fstp.setNullFlush(true); // cf. 
description of cli["null-flush"] fstp.initBiltrans(); - fstp.quoteMerge(input, output); + if(unmerge) { + fstp.quoteUnmerge(input, output); + } + else { + fstp.quoteMerge(input, output); + } return 0; } diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py index a2c94ff..38f1cbb 100644 --- a/tests/lt_merge/__init__.py +++ b/tests/lt_merge/__init__.py @@ -40,3 +40,17 @@ class WordblankTest(MergeTest): # Using r'' to avoid doubling escapes even more: inputs = [r'^«/«$[[tf:i:a]]^ve\/ldig/v$[[/]]^»/»$'] expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»$'] + + +class SimpleUnmergeTest(MergeTest): + procflags = ['--unmerge'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^ikkje/ikkje$ ^«Se og Hør»/«Se og Hør»$ ^då/då$'] + expectedOutputs = [r'^ikkje/ikkje$ «Se og Hør» ^då/då$'] + + +class EscapedUnmergeTest(MergeTest): + procflags = ['--unmerge'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^ikkje/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] + expectedOutputs = [r'^ikkje/ikkje$ «[[tf:i:a]]s\^å[[/]]»'] diff --git a/tests/run_tests.py b/tests/run_tests.py index 799fa06..d36cb73 100755 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -13,8 +13,6 @@ 'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose', 'lt_tmxproc', 'lt_merge'] -# modules = ['lt_merge'] - if __name__ == "__main__": os.chdir(os.path.dirname(__file__)) From e7c83795044a075a75cb8c9fc62c80ea35420ad4 Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Thu, 19 Dec 2024 15:54:47 +0100 Subject: [PATCH 3/4] Let lt-proc -b handle special ANY_CHAR tag ( from lsx) --- lttoolbox/fst_processor.cc | 11 ++++++++++- lttoolbox/fst_processor.h | 5 +++++ lttoolbox/state.h | 6 +++--- tests/data/pass-through.lsx | 20 ++++++++++++++++++++ tests/lt_proc/__init__.py | 21 +++++++++++++++++++++ 5 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 tests/data/pass-through.lsx 
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 0e5330b..28842ce 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -627,6 +627,8 @@ void FSTProcessor::load(FILE *input) { readTransducerSet(input, alphabetic_chars, alphabet, transducers); + alphabet.includeSymbol(""_u); + any_char = alphabet(""_u); } void @@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) if (reader.readings[index].mark == '#') current_state.step('#'); for (size_t i = 0; i < symbols.size(); i++) { seenTags = seenTags || alphabet.isTag(symbols[i]); - current_state.step_case(symbols[i], beCaseSensitive(current_state)); + UString source; + alphabet.getSymbol(source, symbols[i]); + if(beCaseSensitive(current_state)) { // allow any_char + current_state.step_override(symbols[i], any_char, symbols[i]); + } + else { // include lower alt + current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]); + } if (current_state.isFinal(all_finals)) { queue_start = i; current_state.filterFinalsArray(result, diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index d53556f..a1a1cd7 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -252,6 +252,11 @@ class FSTProcessor */ int maxWeightClasses = INT_MAX; + /** + * The alphabet index of the tag + */ + int any_char; + /** * Prints an error of input stream and exits */ diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 56a7d34..31d2032 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -113,9 +113,9 @@ class State /** * Make a transition, but overriding the output symbol - * @param input symbol - * @param output symbol we expect to appear - * @param output symbol we want to appear + * @param input symbol read from infile + * @param output symbol from the FST + * @param output symbol we want to appear in outfile */ void apply_override(int const input, int const old_sym, int const new_sym); diff 
--git a/tests/data/pass-through.lsx b/tests/data/pass-through.lsx new file mode 100644 index 0000000..ba2c875 --- /dev/null +++ b/tests/data/pass-through.lsx @@ -0,0 +1,20 @@ + + + + + + + + + + foo + + + +
+ + + + +
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index cae6568..d816f06 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -479,6 +479,11 @@ class BiltransGarbage(ProcTest): inputs = ['^$'] expectedOutputs = ['^$'] +class BiltransSimple(ProcTest): + procflags = ['-b', '-z'] + inputs = ['^abc$'] + expectedOutputs = ['^abc/ab$'] + class SlashesInTags(ProcTest): procdix = 'data/slash-tags.dix' procflags = ['-b', '-z'] @@ -496,5 +501,21 @@ class SlashesInTags(ProcTest): '^\\*lobwana1.1<1/2>/*lopwana1.1<1/2>$', '^\\*lobwana1.1<3/4>/@\\*lobwana1.1<3/4>$'] +class BiltransAnyChar(ProcTest): + procdix = 'data/pass-through.lsx' + procflags = ['-b', '-z'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^simple$'] + expectedOutputs = [r'^simple/simple$'] + + +class BiltransAnyCharEscapes(ProcTest): + procdix = 'data/pass-through.lsx' + procflags = ['-b', '-z'] + # Using r'' to avoid doubling escapes even more: + inputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] + expectedOutputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] + + # These fail on some systems: #from null_flush_invalid_stream_format import * From 648471e4dc2a35fb46167dac57073c665a42cc14 Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Thu, 19 Dec 2024 23:58:48 +0100 Subject: [PATCH 4/4] Put an empty ^$ after wblank MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I thought we were good because ```sh $ echo '^ikkje/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin ikkje «[[tf:i:a]]s\^å[[/]]» ``` worked, but If there's an analysis after the unmerged word blank, cg-proc errors out: ```sh $ echo '^ikkje/ikkje$ «[[tf:i:a]]s\^å» ^./.$' |cg-proc -1ng nob-nno.genprefs.rlx.bin Error: Word-bound blank was not immediately prior to token on line 0 ``` Fair enough, so lt-merge must put an empty ^$ after word blanks to appease cg-proc. 
Seems like tf-inject finds the right point at which to end it anyway? --- lttoolbox/fst_processor.cc | 1 + tests/lt_merge/__init__.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 28842ce..d4ff07d 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -1967,6 +1967,7 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output) if(surface.size() > 0) { surface += reader.blank; appendEscaped(surface, reader.wblank); + if(!reader.wblank.empty()) { appendEscaped(surface, "^$"_u); } // otherwise cg-proc will Error wordblank not prior to token } else { // The initial blank should just be output before the merged LU: diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py index 38f1cbb..6d956d4 100644 --- a/tests/lt_merge/__init__.py +++ b/tests/lt_merge/__init__.py @@ -38,8 +38,8 @@ class EscapeTest(MergeTest): class WordblankTest(MergeTest): # Using r'' to avoid doubling escapes even more: - inputs = [r'^«/«$[[tf:i:a]]^ve\/ldig/v$[[/]]^»/»$'] - expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»$'] + inputs = [r'^«/«$[[tf:i:a]]^ve\/ldig/v$^»/»$'] + expectedOutputs = [r'^«\[\[tf:i:a\]\]\^\$ve\\\/ldig»/«\[\[tf:i:a\]\]\^\$ve\\\/ldig»$'] class SimpleUnmergeTest(MergeTest): @@ -52,5 +52,5 @@ class SimpleUnmergeTest(MergeTest): class EscapedUnmergeTest(MergeTest): procflags = ['--unmerge'] # Using r'' to avoid doubling escapes even more: - inputs = [r'^ikkje/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$'] - expectedOutputs = [r'^ikkje/ikkje$ «[[tf:i:a]]s\^å[[/]]»'] + inputs = [r'^ikkje/ikkje$ ^«\[\[tf:i:a\]\]\^\$s\\\^å»/«\[\[tf:i:a\]\]\^\$s\\\^å»$'] + expectedOutputs = [r'^ikkje/ikkje$ «[[tf:i:a]]^$s\^å»']