From 57a297ecc2c461b8e35a23765172b854ef3846ef Mon Sep 17 00:00:00 2001
From: Kevin Brubeck Unhammer <unhammer@fsfe.org>
Date: Mon, 25 Nov 2024 16:24:07 +0100
Subject: [PATCH 1/4] New `lt-merge` command to merge LU's from BEG to END tag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    $ echo '^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^veldig/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$' | lt-merge
    ^ikke/ikke<adv>$ ^«så veldig»/«så veldig»<MERGED>$ ^bra/bra<adj>$

Mostly simple, but escaping can look a bit messy. If any of the LUs
have word-bound blanks, the [] need escaping:

    $ echo '^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^veldig/veldig<adv>$[[/]]^»/»<rquot><MERGE_END>$' | lttoolbox/lt-merge
    ^«\[\[tf:i:a\]\]veldig\[\[\/\]\]»/«\[\[tf:i:a\]\]veldig\[\[\/\]\]»<MERGED>$

to ensure we have legal stream format.

If any of the forms contain already escaped chars, these now need
double-escaping. Why? Because we need to run an "unmerge" step towards
the end of the pipeline, while still outputting Apertium Stream
Format, and need to know the difference between a \[ meaning
word-blank or \\[ meaning literal [.

    $ echo '^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^til/til<pr>$ ^x\@y.com/x\@y.com<email>$^»/»<rquot><MERGE_END>$ ^da/da<adv>$' | lttoolbox/lt-merge
    ^ikke/ikke<adv>$ ^«til x\\\@y.com»/«til x\\\@y.com»<MERGED>$ ^da/da<adv>$

If we run lt-merge between analysis and wblank-attach, then after the
`lt-proc -b generator.bin` step we should have e.g.

    ^ikkje<adv>/ikkje$ ^«til x\\\@y.com»<MERGED>/«til x\\\@y.com»$ ^då<adv>/då$

which after `cg-proc -1 -n -g genprefs.bin` would turn into

    ikkje «til x\@y.com» då

Note how \\\@ turned into \@ – we removed one layer of quoting, but
this is still in the apertium stream so special chars stay quoted
until the final tf-inject.

TODO:

* We need to be able to pass MERGED stuff unchanged through biltrans
  and generator, would like to `<i><w/><s n="MERGED"/></i>` but
  `ANY_CHAR` isn't supported yet in `lt-proc -b`.

* We need an `lt-merge --unmerge` to undo the merge:

    $ echo '^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$' | lt-merge --unmerge
    ^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»

  which then becomes

    $ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin
    ikkje «[[tf:i:a]]s\^å[[/]]»

  which `tf-inject` is happy to handle.
---
 .gitignore                 |  1 +
 lttoolbox/CMakeLists.txt   |  7 +++-
 lttoolbox/fst_processor.cc | 77 ++++++++++++++++++++++++++++++++++++++
 lttoolbox/fst_processor.h  | 10 +++++
 lttoolbox/lt-merge.1       | 40 ++++++++++++++++++++
 lttoolbox/lt_merge.cc      | 48 ++++++++++++++++++++++++
 tests/lt_merge/__init__.py | 42 +++++++++++++++++++++
 tests/run_tests.py         |  4 +-
 8 files changed, 226 insertions(+), 3 deletions(-)
 create mode 100644 lttoolbox/lt-merge.1
 create mode 100644 lttoolbox/lt_merge.cc
 create mode 100644 tests/lt_merge/__init__.py
diff --git a/.gitignore b/.gitignore
index e3cbc8f..d60b4ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,7 @@ lttoolbox/liblttoolbox.so.*
 /lttoolbox/lt-comp
 /lttoolbox/lt-compose
 /lttoolbox/lt-proc
+/lttoolbox/lt-merge
 /lttoolbox/lt-trim
 /lttoolbox/Makefile
 /lttoolbox/Makefile.in
diff --git a/lttoolbox/CMakeLists.txt b/lttoolbox/CMakeLists.txt
index 20b0729..b9b79ec 100644
--- a/lttoolbox/CMakeLists.txt
+++ b/lttoolbox/CMakeLists.txt
@@ -103,6 +103,9 @@ target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB})
 add_executable(lt-proc lt_proc.cc)
 target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB})
 
+add_executable(lt-merge lt_merge.cc)
+target_link_libraries(lt-merge lttoolbox ${GETOPT_LIB})
+
 add_executable(lt-expand lt_expand.cc)
 target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB})
 
@@ -144,11 +147,11 @@ install(TARGETS lttoolbox
 	ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 install(FILES ${LIBLTTOOLBOX_HEADERS}
 	DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox)
-install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
+install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-merge lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
 	RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 install(FILES dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd
 	DESTINATION ${CMAKE_INSTALL_DATADIR}/lttoolbox)
 
-install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
+install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-merge.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
 	DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index a908ea2..81446b9 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -1931,6 +1931,83 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
   return compose(result, ""_u, with_delim, mark);
 }
 
+void
+FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
+{
+  StreamReader reader(&input);
+  reader.alpha = &alphabet;
+  reader.add_unknowns = true;
+  // Join the LUs from <MERGE_BEG> through <MERGE_END> into one ^form/form<MERGED>$.
+  bool merging = false;
+  UString surface;
+  while (!reader.at_eof) {
+    reader.next();
+
+    bool end_merging = false;
+
+    for (StreamReader::Reading &it : reader.readings) {
+      // TODO: look up in it.symbols instead (but need to make an alphabet then)
+      if(it.content.find(u"<MERGE_BEG>") != std::string::npos) {
+        merging = true;
+      }
+      if(it.content.find(u"<MERGE_END>") != std::string::npos) {
+        end_merging = true;
+      }
+    }
+    if(merging) {
+      if(surface.size() > 0) {
+        surface += reader.blank;
+        appendEscaped(surface, reader.wblank);
+      }
+      else {
+        // The initial blank should just be output before the merged LU:
+        write(reader.blank, output);
+        write(reader.wblank, output);
+      }
+      if(reader.readings.size() > 0) {
+        // Keep only readings[0] (the surface form), without any unknown mark.
+        // Double-escape the form since we'll unescape during lt-merge --unmerge:
+        appendEscaped(surface, reader.readings[0].content);
+      }
+    }
+    else {
+      write(reader.blank, output);
+      write(reader.wblank, output);
+      if(reader.readings.size() > 0) {
+        // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0.
+        // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$)
+        u_fputc('^', output);
+        bool seen_reading = false;
+        for (StreamReader::Reading &it : reader.readings) {
+          if (seen_reading) {
+            u_fputc('/', output);
+          }
+          if(it.mark != '\0') { u_fputc(it.mark, output); }
+          write(it.content, output);
+          seen_reading = true;
+        }
+        u_fputc('$', output);
+      }
+    }
+    // Also flush at EOF, otherwise an unterminated merge would be silently dropped:
+    if(end_merging || reader.at_null || reader.at_eof) {
+      if (merging) {
+        u_fputc('^', output);
+        write(surface, output);
+        u_fputc('/', output);
+        write(surface, output);
+        write("<MERGED>$"_u, output);
+        merging = false;
+      }
+      surface.clear();
+      if(reader.at_null) {
+        u_fputc('\0', output);
+        u_fflush(output);
+      }
+    }
+  }
+}
+
 
 bool
 FSTProcessor::valid() const
diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
index a5641ee..c825594 100644
--- a/lttoolbox/fst_processor.h
+++ b/lttoolbox/fst_processor.h
@@ -462,6 +462,15 @@ class FSTProcessor
     }
   }
 
+  /** Append `from` to `to`, prefixing a backslash to every
+   *  character that is a member of escaped_chars. */
+  void appendEscaped(UString& to, const UString& from) {
+    for(const auto ch : from) {
+      if (escaped_chars.count(ch) != 0) { to += u'\\'; }
+      to += ch;
+    }
+  }
+
 public:
 
   /*
@@ -496,6 +505,7 @@ class FSTProcessor
   UString biltrans(UStringView input_word, bool with_delim = true);
   UString biltransfull(UStringView input_word, bool with_delim = true);
   void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
+  void quoteMerge(InputFile& input, UFILE *output);
   std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
   UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
   void SAO(InputFile& input, UFILE *output);
diff --git a/lttoolbox/lt-merge.1 b/lttoolbox/lt-merge.1
new file mode 100644
index 0000000..1ae5fb1
--- /dev/null
+++ b/lttoolbox/lt-merge.1
@@ -0,0 +1,40 @@
+.Dd December 10, 2024
+.Dt LT-MERGE 1
+.Os Apertium
+.Sh NAME
+.Nm lt-merge
+.Nd lexical merger for Apertium
+.Sh SYNOPSIS
+.Nm lt-merge
+.Op Fl u
+.Op Ar input_file Op Ar output_file
+.Sh DESCRIPTION
+.Nm lt-merge
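+is the application responsible for merging lexical units into one
+and for splitting previously merged units again.
+.Pp
+The units from one tagged <MERGE_BEG> through one tagged <MERGE_END> are joined into a single unit tagged <MERGED>.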
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl u , Fl Fl unmerge
+Run in reverse, this splits previously merged words.
+.It Fl v , Fl Fl version
+Display the version number.
+.It Fl h , Fl Fl help
+Display this help.
+.El
+\" .Sh FILES
+\" .Bl -tag -width Ds
+\" .It Ar input_file
+\" The input compiled dictionary.
+\" .El
+.Sh SEE ALSO
+.Xr apertium 1 ,
+.Xr lt-proc 1
+.Sh COPYRIGHT
+Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante.
+This is free software.
+You may redistribute copies of it under the terms of
+.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License .
+.Sh BUGS
+Many... lurking in the dark and waiting for you!
diff --git a/lttoolbox/lt_merge.cc b/lttoolbox/lt_merge.cc
new file mode 100644
index 0000000..9198f73
--- /dev/null
+++ b/lttoolbox/lt_merge.cc
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2024 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+#include <lttoolbox/fst_processor.h>
+#include <lttoolbox/file_utils.h>
+#include <lttoolbox/cli.h>
+#include <lttoolbox/lt_locale.h>
+#include <iostream>
+
+
+int main(int argc, char *argv[])
+{
+  LtLocale::tryToSetLocale();
+  CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION);
+  cli.add_file_arg("input_file");
+  cli.add_file_arg("output_file");
+  cli.add_bool_arg('u', "unmerge", "Undo the merge");
+  cli.add_bool_arg('z', "null-flush", "flush output on the null character");
+  cli.parse_args(argc, argv);
+
+  // Open input_file only if one was given (the tests pipe to lt-merge without one):
+  bool unmerge = cli.get_bools()["unmerge"];
+  InputFile input;
+  if (!cli.get_files()[0].empty()) {
+    input.open_or_exit(cli.get_files()[0].c_str());
+  }
+  UFILE* output = openOutTextFile(cli.get_files()[1]);
+
+  FSTProcessor fstp;
+  fstp.setNullFlush(true); // cf. description of cli["null-flush"]
+  fstp.initBiltrans();
+  fstp.quoteMerge(input, output);
+
+  return 0;
+}
diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py
new file mode 100644
index 0000000..a2c94ff
--- /dev/null
+++ b/tests/lt_merge/__init__.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+import unittest
+from basictest import ProcTest
+import unittest
+
+class MergeTest(unittest.TestCase, ProcTest):
+    inputs = ['^nochange<n>$']
+    expectedOutputs = ['^nochange<n>$']
+    procflags = []
+
+    def compileTest(self, tmpd):
+        return True             # nothing to build: lt-merge is started without a dictionary
+
+    def openProc(self, tmpd):
+        return self.openPipe('lt-merge', self.procflags+[])
+
+
+class SimpleTest(MergeTest):
+    inputs = ['^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^veldig/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$' ]
+    expectedOutputs = ['^ikke/ikke<adv>$ ^«så veldig»/«så veldig»<MERGED>$ ^bra/bra<adj>$']
+
+
+class SingleTest(MergeTest):
+    inputs = ['^not/very<useful><MERGE_BEG><MERGE_END>$' ]
+    expectedOutputs = ['^not/not<MERGED>$']
+
+
+class UnknownTest(MergeTest):
+    inputs = ['^foo/*foo$' ]
+    expectedOutputs = ['^foo/*foo$']
+
+
+class EscapeTest(MergeTest):
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^ikke/ikke<adv>$ ^«/«<lquot><MERGE_BEG>$^så/så<adv>$ ^ve\[dig/v<adv>$^»/»<rquot><MERGE_END>$ ^bra/bra<adj>$']
+    expectedOutputs = [r'^ikke/ikke<adv>$ ^«så ve\\\[dig»/«så ve\\\[dig»<MERGED>$ ^bra/bra<adj>$']
+
+
+class WordblankTest(MergeTest):
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$']
+    expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$']
diff --git a/tests/run_tests.py b/tests/run_tests.py
index e222bbb..799fa06 100755
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -11,7 +11,9 @@
 
 modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append',
            'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose',
-           'lt_tmxproc']
+           'lt_tmxproc', 'lt_merge']
+
+# modules = ['lt_merge']
 
 
 if __name__ == "__main__":

From 1925a7e2c26f42efa90309abc2d1cc7a2eae68aa Mon Sep 17 00:00:00 2001
From: Kevin Brubeck Unhammer <unhammer@fsfe.org>
Date: Wed, 18 Dec 2024 23:53:28 +0100
Subject: [PATCH 2/4] Implement `lt-merge --unmerge`

cf. HEAD^
---
 lttoolbox/fst_processor.cc | 62 ++++++++++++++++++++++++++++++++++++++
 lttoolbox/fst_processor.h  |  1 +
 lttoolbox/lt_merge.cc      |  7 ++++-
 tests/lt_merge/__init__.py | 14 +++++++++
 tests/run_tests.py         |  2 --
 5 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index 81446b9..0e5330b 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -2009,6 +2009,68 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
 }
 
 
+void
+FSTProcessor::quoteUnmerge(InputFile &input, UFILE *output)
+{
+  // Undo quoteMerge: an LU with a <MERGED> reading is printed bare (no ^...$) as its
+  // last reading minus one level of backslash-escaping; other LUs are copied through.
+  StreamReader reader(&input);
+  reader.alpha = &alphabet;
+  reader.add_unknowns = true;
+  while (!reader.at_eof) {
+    reader.next();
+    bool unmerging = false;
+    for (StreamReader::Reading &it : reader.readings) {
+      // TODO: look up in it.symbols instead (but need to make an alphabet then)
+      if(it.content.find(u"<MERGED>") != std::string::npos) {
+        unmerging = true;
+      }
+    }
+    write(reader.blank, output);
+    write(reader.wblank, output);
+    if(unmerging) {
+      // Just output the last reading (surface form), removing one level of escaping
+      StreamReader::Reading &lastReading = reader.readings.back(); // (we know there's at least one because of the above loop)
+      UString surface;
+      bool escaping = false;
+      for(auto &c : lastReading.content) {
+        if(escaping) {
+          surface += c;
+          escaping = false;
+        }
+        else if(c == u'\\') {
+          escaping = true;
+        }
+        else {
+          surface += c;
+        }
+      }
+      write(surface, output);
+    }
+    else {
+      if(reader.readings.size() > 0) {
+        // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0.
+        // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$)
+        u_fputc('^', output);
+        bool seen_reading = false;
+        for (StreamReader::Reading &it : reader.readings) {
+          if (seen_reading) {
+            u_fputc('/', output);
+          }
+          if(it.mark != '\0') { u_fputc(it.mark, output); }
+          write(it.content, output);
+          seen_reading = true;
+        }
+        u_fputc('$', output);
+      }
+    }
+    if(reader.at_null) {
+      u_fputc('\0', output);
+      u_fflush(output);
+    }
+  }
+}
+
 bool
 FSTProcessor::valid() const
 {
diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
index c825594..d53556f 100644
--- a/lttoolbox/fst_processor.h
+++ b/lttoolbox/fst_processor.h
@@ -506,6 +506,7 @@ class FSTProcessor
   UString biltransfull(UStringView input_word, bool with_delim = true);
   void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
   void quoteMerge(InputFile& input, UFILE *output);
+  void quoteUnmerge(InputFile& input, UFILE *output);
   std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
   UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
   void SAO(InputFile& input, UFILE *output);
diff --git a/lttoolbox/lt_merge.cc b/lttoolbox/lt_merge.cc
index 9198f73..bcf6a43 100644
--- a/lttoolbox/lt_merge.cc
+++ b/lttoolbox/lt_merge.cc
@@ -42,7 +42,12 @@ int main(int argc, char *argv[])
   FSTProcessor fstp;
   fstp.setNullFlush(true); // cf. description of cli["null-flush"]
   fstp.initBiltrans();
-  fstp.quoteMerge(input, output);
+  if(unmerge) {
+    fstp.quoteUnmerge(input, output);
+  }
+  else {
+    fstp.quoteMerge(input, output);
+  }
 
   return 0;
 }
diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py
index a2c94ff..38f1cbb 100644
--- a/tests/lt_merge/__init__.py
+++ b/tests/lt_merge/__init__.py
@@ -40,3 +40,17 @@ class WordblankTest(MergeTest):
     # Using r'' to avoid doubling escapes even more:
     inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$']
     expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$']
+
+
+class SimpleUnmergeTest(MergeTest):
+    procflags = ['--unmerge']
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^ikkje<adv>/ikkje$ ^«Se og Hør»<MERGED>/«Se og Hør»$ ^då<adv>/då$']
+    expectedOutputs = [r'^ikkje<adv>/ikkje$ «Se og Hør» ^då<adv>/då$']
+
+
+class EscapedUnmergeTest(MergeTest):
+    procflags = ['--unmerge']
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$']
+    expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»']
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 799fa06..d36cb73 100755
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -13,8 +13,6 @@
            'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose',
            'lt_tmxproc', 'lt_merge']
 
-# modules = ['lt_merge']
-
 
 if __name__ == "__main__":
     os.chdir(os.path.dirname(__file__))

From e7c83795044a075a75cb8c9fc62c80ea35420ad4 Mon Sep 17 00:00:00 2001
From: Kevin Brubeck Unhammer <unhammer@fsfe.org>
Date: Thu, 19 Dec 2024 15:54:47 +0100
Subject: [PATCH 3/4] Let lt-proc -b handle special ANY_CHAR tag (<w/> from
 lsx)

---
 lttoolbox/fst_processor.cc  | 11 ++++++++++-
 lttoolbox/fst_processor.h   |  5 +++++
 lttoolbox/state.h           |  6 +++---
 tests/data/pass-through.lsx | 20 ++++++++++++++++++++
 tests/lt_proc/__init__.py   | 21 +++++++++++++++++++++
 5 files changed, 59 insertions(+), 4 deletions(-)
 create mode 100644 tests/data/pass-through.lsx

diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index 0e5330b..28842ce 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -627,6 +627,8 @@ void
 FSTProcessor::load(FILE *input)
 {
   readTransducerSet(input, alphabetic_chars, alphabet, transducers);
+  alphabet.includeSymbol("<ANY_CHAR>"_u);
+  any_char = alphabet("<ANY_CHAR>"_u);
 }
 
 void
@@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
     if (reader.readings[index].mark == '#') current_state.step('#');
     for (size_t i = 0; i < symbols.size(); i++) {
       seenTags = seenTags || alphabet.isTag(symbols[i]);
-      current_state.step_case(symbols[i], beCaseSensitive(current_state));
+      UString source;
+      alphabet.getSymbol(source, symbols[i]);
+      if(beCaseSensitive(current_state)) { // allow any_char
+        current_state.step_override(symbols[i], any_char, symbols[i]);
+      }
+      else {                    // include lower alt
+        current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]);
+      }
       if (current_state.isFinal(all_finals)) {
         queue_start = i;
         current_state.filterFinalsArray(result,
diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
index d53556f..a1a1cd7 100644
--- a/lttoolbox/fst_processor.h
+++ b/lttoolbox/fst_processor.h
@@ -252,6 +252,11 @@ class FSTProcessor
    */
   int maxWeightClasses = INT_MAX;
 
+  /**
+   * The alphabet index of the tag <ANY_CHAR>
+   */
+  int any_char;
+
   /**
    * Prints an error of input stream and exits
    */
diff --git a/lttoolbox/state.h b/lttoolbox/state.h
index 56a7d34..31d2032 100644
--- a/lttoolbox/state.h
+++ b/lttoolbox/state.h
@@ -113,9 +113,9 @@ class State
 
   /**
    * Make a transition, but overriding the output symbol
-   * @param input symbol
-   * @param output symbol we expect to appear
-   * @param output symbol we want to appear
+   * @param input input symbol read from infile
+   * @param old_sym output symbol from the FST
+   * @param new_sym output symbol we want to appear in outfile
    */
   void apply_override(int const input, int const old_sym, int const new_sym);
 
diff --git a/tests/data/pass-through.lsx b/tests/data/pass-through.lsx
new file mode 100644
index 0000000..ba2c875
--- /dev/null
+++ b/tests/data/pass-through.lsx
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<dictionary type="separable">
+  <alphabet></alphabet>
+  <sdefs>
+    <sdef n="MERGED"/>
+  </sdefs>
+
+  <pardefs>
+    <pardef n="foo">
+      <e>   <i>foo<d/></i>            </e>
+    </pardef>
+  </pardefs>
+
+  <section id="main" type="standard">
+
+    <e c="pass-through MERGED words">
+      <i><w/><s n="MERGED"/></i>
+    </e>
+  </section>
+</dictionary>
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py
index cae6568..d816f06 100644
--- a/tests/lt_proc/__init__.py
+++ b/tests/lt_proc/__init__.py
@@ -479,6 +479,11 @@ class BiltransGarbage(ProcTest):
     inputs = ['^$']
     expectedOutputs = ['^$']
 
+class BiltransSimple(ProcTest):
+    procflags = ['-b', '-z']
+    inputs = ['^abc$']
+    expectedOutputs = ['^abc/ab<n><def>$']
+
 class SlashesInTags(ProcTest):
     procdix = 'data/slash-tags.dix'
     procflags = ['-b', '-z']
@@ -496,5 +501,21 @@ class SlashesInTags(ProcTest):
                        '^\\*lobwana1.1<n><1/2><a/b>/*lopwana1.1<n><1/2><a/b>$',
                        '^\\*lobwana1.1<n><3/4><a/b>/@\\*lobwana1.1<n><3/4><a/b>$']
 
+class BiltransAnyChar(ProcTest):
+    procdix = 'data/pass-through.lsx'
+    procflags = ['-b', '-z']
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^simple<MERGED>$']
+    expectedOutputs = [r'^simple<MERGED>/simple<MERGED>$']
+
+
+class BiltransAnyCharEscapes(ProcTest):
+    procdix = 'data/pass-through.lsx'
+    procflags = ['-b', '-z']
+    # Using r'' to avoid doubling escapes even more:
+    inputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>$']
+    expectedOutputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>$']
+
+
 # These fail on some systems:
 #from null_flush_invalid_stream_format import *

From 648471e4dc2a35fb46167dac57073c665a42cc14 Mon Sep 17 00:00:00 2001
From: Kevin Brubeck Unhammer <unhammer@fsfe.org>
Date: Thu, 19 Dec 2024 23:58:48 +0100
Subject: [PATCH 4/4] Put an empty ^$ after wblank
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I thought we were good because

```sh
$ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin
ikkje «[[tf:i:a]]s\^å[[/]]»
```

worked, but if there's an analysis after the unmerged word blank,
cg-proc errors out:

```sh
$ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å» ^.<sent>/.$' |cg-proc -1ng nob-nno.genprefs.rlx.bin
Error: Word-bound blank was not immediately prior to token on line 0
```

Fair enough, so lt-merge must put an empty ^$ after word blanks to
appease cg-proc. Seems like tf-inject finds the right point at which
to end it anyway?
---
 lttoolbox/fst_processor.cc | 1 +
 tests/lt_merge/__init__.py | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index 28842ce..d4ff07d 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -1967,6 +1967,7 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
       if(surface.size() > 0) {
         surface += reader.blank;
         appendEscaped(surface, reader.wblank);
+        if(!reader.wblank.empty()) { appendEscaped(surface, "^$"_u); } // otherwise cg-proc will Error wordblank not prior to token
       }
       else {
         // The initial blank should just be output before the merged LU:
diff --git a/tests/lt_merge/__init__.py b/tests/lt_merge/__init__.py
index 38f1cbb..6d956d4 100644
--- a/tests/lt_merge/__init__.py
+++ b/tests/lt_merge/__init__.py
@@ -38,8 +38,8 @@ class EscapeTest(MergeTest):
 
 class WordblankTest(MergeTest):
     # Using r'' to avoid doubling escapes even more:
-    inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$']
-    expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$']
+    inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$^»/»<rquot><MERGE_END>$']
+    expectedOutputs = [r'^«\[\[tf:i:a\]\]\^\$ve\\\/ldig»/«\[\[tf:i:a\]\]\^\$ve\\\/ldig»<MERGED>$']
 
 
 class SimpleUnmergeTest(MergeTest):
@@ -52,5 +52,5 @@ class SimpleUnmergeTest(MergeTest):
 class EscapedUnmergeTest(MergeTest):
     procflags = ['--unmerge']
     # Using r'' to avoid doubling escapes even more:
-    inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$']
-    expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»']
+    inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]\^\$s\\\^å»<MERGED>/«\[\[tf:i:a\]\]\^\$s\\\^å»$']
+    expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]^$s\^å»']