apertium · unhammer · Dec 20, 2024 · Nov 25, 2024 · Dec 18, 2024 · Dec 19, 2024
diff --git a/.gitignore b/.gitignore
@@ -47,6 +47,7 @@ lttoolbox/liblttoolbox.so.*
 /lttoolbox/lt-comp
 /lttoolbox/lt-compose
 /lttoolbox/lt-proc
+/lttoolbox/lt-merge
 /lttoolbox/lt-trim
 /lttoolbox/Makefile
 /lttoolbox/Makefile.in

diff --git a/lttoolbox/CMakeLists.txt b/lttoolbox/CMakeLists.txt
@@ -103,6 +103,9 @@ target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB})
 add_executable(lt-proc lt_proc.cc)
 target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB})
 
+add_executable(lt-merge lt_merge.cc)
+target_link_libraries(lt-merge lttoolbox ${GETOPT_LIB})
+
 add_executable(lt-expand lt_expand.cc)
 target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB})
 
@@ -144,11 +147,11 @@ install(TARGETS lttoolbox
 	ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 install(FILES ${LIBLTTOOLBOX_HEADERS}
 	DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox)
-install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
+install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-merge lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
 	RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 install(FILES dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd
 	DESTINATION ${CMAKE_INSTALL_DATADIR}/lttoolbox)
 
-install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
+install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-merge.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
 	DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
@@ -627,6 +627,8 @@ void
 FSTProcessor::load(FILE *input)
 {
   readTransducerSet(input, alphabetic_chars, alphabet, transducers);
+  alphabet.includeSymbol("<ANY_CHAR>"_u); // NOTE(review): presumably a no-op when the loaded alphabet already has the tag — confirm
+  any_char = alphabet("<ANY_CHAR>"_u); // cache the tag's alphabet index; bilingual() passes it to step_override()
 }
 
 void
@@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
     if (reader.readings[index].mark == '#') current_state.step('#');
     for (size_t i = 0; i < symbols.size(); i++) {
       seenTags = seenTags || alphabet.isTag(symbols[i]);
-      current_state.step_case(symbols[i], beCaseSensitive(current_state));
+      UString source;
+      alphabet.getSymbol(source, symbols[i]);
+      if(beCaseSensitive(current_state)) { // allow any_char
+        current_state.step_override(symbols[i], any_char, symbols[i]);
+      }
+      else {                    // include lower alt
+        current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]);
+      }
       if (current_state.isFinal(all_finals)) {
         queue_start = i;
         current_state.filterFinalsArray(result,
@@ -1931,6 +1940,146 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
   return compose(result, ""_u, with_delim, mark);
 }
 
+/**
+ * Merge each run of lexical units from one containing <MERGE_BEG> through the
+ * next one containing <MERGE_END> into a single lexical unit tagged <MERGED>.
+ * @param input stream of lexical units to read
+ * @param output stream the result is written to
+ */
+void
+FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
+{
+  StreamReader reader(&input);
+  reader.alpha = &alphabet;
+  reader.add_unknowns = true;
+
+  bool merging = false;
+  UString surface;
+  while (!reader.at_eof) {
+    reader.next();
+    bool end_merging = false;
+    for (const StreamReader::Reading &it : reader.readings) {
+      // TODO: look up in it.symbols instead (but need to make an alphabet then)
+      merging = merging || it.content.find(u"<MERGE_BEG>") != UString::npos;
+      end_merging = end_merging || it.content.find(u"<MERGE_END>") != UString::npos;
+    }
+    if(merging) {
+      if(surface.size() > 0) {
+        surface += reader.blank;
+        appendEscaped(surface, reader.wblank);
+        if(!reader.wblank.empty()) { appendEscaped(surface, "^$"_u); } // otherwise cg-proc will Error wordblank not prior to token
+      }
+      else {
+        // The initial blank should just be output before the merged LU:
+        write(reader.blank, output);
+        write(reader.wblank, output);
+      }
+      if(reader.readings.size() > 0) {
+        // Drop possible unknown marks.
+        // Double-escape the form since we'll unescape during lt-unmerge:
+        appendEscaped(surface, reader.readings[0].content);
+      }
+    }
+    else {
+      write(reader.blank, output);
+      write(reader.wblank, output);
+      if(reader.readings.size() > 0) {
+        // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0.
+        // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$)
+        u_fputc('^', output);
+        bool seen_reading = false;
+        for (StreamReader::Reading &it : reader.readings) {
+          if (seen_reading) {
+            u_fputc('/', output);
+          }
+          if(it.mark != '\0') { u_fputc(it.mark, output); }
+          write(it.content, output);
+          seen_reading = true;
+        }
+        u_fputc('$', output);
+      }
+    }
+    // Also flush at end of input, so a run that never saw <MERGE_END> is not silently dropped:
+    if(end_merging || reader.at_null || reader.at_eof) {
+      if (merging) {
+        u_fputc('^', output);
+        write(surface, output);
+        u_fputc('/', output);
+        write(surface, output);
+        write("<MERGED>$"_u, output);
+        merging = false;
+      }
+      surface.clear();
+      if(reader.at_null) {
+        u_fputc('\0', output);
+        u_fflush(output);
+      }
+    }
+  }
+}
+
+
+/**
+ * Counterpart of quoteMerge(): a lexical unit with a reading containing <MERGED>
+ * is replaced by its last reading with one level of backslash-escaping removed.
+ * @param input stream of lexical units to read
+ * @param output stream the result is written to
+ */
+void
+FSTProcessor::quoteUnmerge(InputFile &input, UFILE *output)
+{
+  StreamReader reader(&input);
+  reader.alpha = &alphabet;
+  reader.add_unknowns = true;
+
+  while (!reader.at_eof) {
+    reader.next();
+    bool unmerging = false;
+    for (const StreamReader::Reading &it : reader.readings) {
+      // TODO: look up in it.symbols instead (but need to make an alphabet then)
+      unmerging = unmerging || it.content.find(u"<MERGED>") != UString::npos;
+    }
+    write(reader.blank, output);
+    write(reader.wblank, output);
+    if(unmerging) {
+      // Just output the last reading (surface form), removing one level of escaping
+      const StreamReader::Reading &lastReading = reader.readings.back(); // (we know there's at least one because of the above loop)
+      UString surface;
+      bool escaping = false;
+      for(const auto &c : lastReading.content) {
+        if(!escaping && c == u'\\') {
+          // Swallow the escaping backslash; the character after it is kept verbatim.
+          escaping = true;
+          continue;
+        }
+        surface += c;
+        escaping = false;
+      }
+      write(surface, output);
+    }
+    else {
+      if(reader.readings.size() > 0) {
+        // NB. ^$ will produce a readings vector of length 1 where the single item is empty. EOF should give length 0.
+        // (We *want* to keep ^$ in stream, but not print extra ^$ when there was no ^$)
+        u_fputc('^', output);
+        bool seen_reading = false;
+        for (StreamReader::Reading &it : reader.readings) {
+          if (seen_reading) {
+            u_fputc('/', output);
+          }
+          if(it.mark != '\0') { u_fputc(it.mark, output); }
+          write(it.content, output);
+          seen_reading = true;
+        }
+        u_fputc('$', output);
+      }
+    }
+    if(reader.at_null) {
+      u_fputc('\0', output);
+      u_fflush(output);
+    }
+  }
+}
 
 bool
 FSTProcessor::valid() const

diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
@@ -252,6 +252,11 @@ class FSTProcessor
    */
   int maxWeightClasses = INT_MAX;
 
+  /**
+   * The alphabet index of the tag <ANY_CHAR>; assigned in load(), 0 until then
+   */
+  int any_char = 0;
+
   /**
    * Prints an error of input stream and exits
    */
@@ -462,6 +467,15 @@ class FSTProcessor
     }
   }
 
+  /** Append `from` to `to`, putting a backslash before each character found in escaped_chars. */
+  void appendEscaped(UString& to, const UString& from) {
+    for(const auto &ch : from) {
+      const bool needs_escape = escaped_chars.find(ch) != escaped_chars.end();
+      if (needs_escape) { to += u'\\'; }
+      to += ch;
+    }
+  }
+
 public:
 
   /*
@@ -496,6 +510,8 @@ class FSTProcessor
   UString biltrans(UStringView input_word, bool with_delim = true);
   UString biltransfull(UStringView input_word, bool with_delim = true);
   void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
+  void quoteMerge(InputFile& input, UFILE *output);
+  void quoteUnmerge(InputFile& input, UFILE *output);
   std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
   UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
   void SAO(InputFile& input, UFILE *output);

diff --git a/lttoolbox/lt-merge.1 b/lttoolbox/lt-merge.1
@@ -0,0 +1,40 @@
+.Dd December 10, 2024
+.Dt LT-MERGE 1
+.Os Apertium
+.Sh NAME
+.Nm lt-merge
+.Nd lexical merger for Apertium
+.Sh SYNOPSIS
+.Nm lt-merge
+.Op Fl u
+.Op Ar input_file Op Ar output_file
+.Sh DESCRIPTION
+.Nm lt-merge
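+is the application responsible for merging and unmerging
+lexical units.
+.Pp
+It merges each run of lexical units from one tagged <MERGE_BEG> through one tagged <MERGE_END> into a single unit tagged <MERGED>.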
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl u , Fl Fl unmerge
+Run in reverse; this splits previously merged words.
+.It Fl v , Fl Fl version
+Display the version number.
+.It Fl h , Fl Fl help
+Display this help.
+.El
+\" .Sh FILES
+\" .Bl -tag -width Ds
+\" .It Ar input_file
+\" The input compiled dictionary.
+\" .El
+.Sh SEE ALSO
+.Xr apertium 1 ,
+.Xr lt-proc 1
+.Sh COPYRIGHT
+Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante.
+This is free software.
+You may redistribute copies of it under the terms of
+.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License .
+.Sh BUGS
+Many... lurking in the dark and waiting for you!
diff --git a/lttoolbox/lt_merge.cc b/lttoolbox/lt_merge.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2024 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+#include <lttoolbox/fst_processor.h>
+#include <lttoolbox/file_utils.h>
+#include <lttoolbox/cli.h>
+#include <lttoolbox/lt_locale.h>
+#include <iostream>
+
+
+/**
+ * Entry point of lt-merge: runs quoteMerge(), or quoteUnmerge() when -u is
+ * given, reading via input_file and writing via output_file.
+ */
+int main(int argc, char *argv[])
+{
+  LtLocale::tryToSetLocale();
+  CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION);
+  cli.add_file_arg("input_file");
+  cli.add_file_arg("output_file");
+  cli.add_bool_arg('u', "unmerge", "Undo the merge");
+  cli.add_bool_arg('z', "null-flush", "flush output on the null character");
+  cli.parse_args(argc, argv);
+
+  const bool unmerge = cli.get_bools()["unmerge"];
+  InputFile input;
+  // Test the same slot that gets opened; testing [1] ignored input_file unless output_file was also given.
+  if (!cli.get_files()[0].empty()) {
+    input.open_or_exit(cli.get_files()[0].c_str());
+  }
+  UFILE* output = openOutTextFile(cli.get_files()[1]);
+
+  FSTProcessor fstp;
+  fstp.setNullFlush(true); // cf. description of cli["null-flush"]
+  fstp.initBiltrans();
+  if(unmerge) { fstp.quoteUnmerge(input, output); }
+  else { fstp.quoteMerge(input, output); }
+
+  return 0;
+}
diff --git a/lttoolbox/state.h b/lttoolbox/state.h
@@ -113,9 +113,9 @@ class State
 
   /**
    * Make a transition, but overriding the output symbol
-   * @param input symbol
-   * @param output symbol we expect to appear
-   * @param output symbol we want to appear
+   * @param input input symbol read from infile
+   * @param old_sym output symbol from the FST
+   * @param new_sym output symbol we want to appear in outfile
    */
   void apply_override(int const input, int const old_sym, int const new_sym);
 

diff --git a/tests/data/pass-through.lsx b/tests/data/pass-through.lsx
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<dictionary type="separable">
+  <alphabet></alphabet>
+  <sdefs>
+    <sdef n="MERGED"/>
+  </sdefs>
+
+  <pardefs>
+    <pardef n="foo">
+      <e>   <i>foo<d/></i>            </e>
+    </pardef>
+  </pardefs>
+
+  <section id="main" type="standard">
+
+    <e c="pass-through MERGED words">
+      <i><w/><s n="MERGED"/></i>
+    </e>
+  </section>
+</dictionary>