Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New lt-merge command to merge LU's from BEG to END tag #193

Merged
merged 4 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ lttoolbox/liblttoolbox.so.*
/lttoolbox/lt-comp
/lttoolbox/lt-compose
/lttoolbox/lt-proc
/lttoolbox/lt-merge
/lttoolbox/lt-trim
/lttoolbox/Makefile
/lttoolbox/Makefile.in
Expand Down
7 changes: 5 additions & 2 deletions lttoolbox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB})
add_executable(lt-proc lt_proc.cc)
target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB})

add_executable(lt-merge lt_merge.cc)
target_link_libraries(lt-merge lttoolbox ${GETOPT_LIB})

add_executable(lt-expand lt_expand.cc)
target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB})

Expand Down Expand Up @@ -144,11 +147,11 @@ install(TARGETS lttoolbox
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(FILES ${LIBLTTOOLBOX_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox)
install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
install(TARGETS lt-append lt-print lt-trim lt-compose lt-comp lt-proc lt-merge lt-expand lt-paradigm lt-tmxcomp lt-tmxproc lt-invert lt-restrict lt-apply-acx
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

install(FILES dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd
DESTINATION ${CMAKE_INSTALL_DATADIR}/lttoolbox)

install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
install(FILES lt-append.1 lt-comp.1 lt-expand.1 lt-paradigm.1 lt-proc.1 lt-merge.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-compose.1
DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
151 changes: 150 additions & 1 deletion lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,8 @@ void
FSTProcessor::load(FILE *input)
{
// Read the compiled data from `input`; alphabetic_chars, alphabet and
// transducers are handed to readTransducerSet (presumably to be filled in --
// confirm against its definition).
readTransducerSet(input, alphabetic_chars, alphabet, transducers);
// Register the <ANY_CHAR> tag with the alphabet and cache its alphabet index
// in any_char, which bilingual() passes to step_override().
alphabet.includeSymbol("<ANY_CHAR>"_u);
any_char = alphabet("<ANY_CHAR>"_u);
}

void
Expand Down Expand Up @@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
if (reader.readings[index].mark == '#') current_state.step('#');
for (size_t i = 0; i < symbols.size(); i++) {
seenTags = seenTags || alphabet.isTag(symbols[i]);
current_state.step_case(symbols[i], beCaseSensitive(current_state));
UString source;
alphabet.getSymbol(source, symbols[i]);
if(beCaseSensitive(current_state)) { // allow any_char
current_state.step_override(symbols[i], any_char, symbols[i]);
}
else { // include lower alt
current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]);
}
if (current_state.isFinal(all_finals)) {
queue_start = i;
current_state.filterFinalsArray(result,
Expand Down Expand Up @@ -1931,6 +1940,146 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
return compose(result, ""_u, with_delim, mark);
}

/**
 * Copy the stream from `input` to `output`, collapsing every run of lexical
 * units (LUs) that starts at an LU with a reading containing <MERGE_BEG> and
 * ends at an LU with a reading containing <MERGE_END> into one LU of the form
 * ^text/text<MERGED>$.
 *
 * `text` is built from the first reading of each LU in the run, joined by the
 * blanks that stood between them, with one extra level of escaping applied.
 * LUs outside such a run are written back out with all their readings.
 *
 * @param input  stream to read LUs from
 * @param output stream the (partly merged) LUs are written to
 */
void
FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
{
  StreamReader reader(&input);
  reader.alpha = &alphabet;
  reader.add_unknowns = true;

  UString merged;          // escaped text collected for the LU being built
  bool in_merge = false;   // set by <MERGE_BEG>, cleared once the merged LU is written

  while (!reader.at_eof) {
    reader.next();

    bool saw_end = false;
    // TODO: look up in reading.symbols instead (but need to make an alphabet then)
    for (const StreamReader::Reading &reading : reader.readings) {
      in_merge = in_merge || reading.content.find(u"<MERGE_BEG>") != UString::npos;
      saw_end = saw_end || reading.content.find(u"<MERGE_END>") != UString::npos;
    }

    if (!in_merge) {
      // Outside a merge run: echo blank, wordblank and the LU unchanged.
      write(reader.blank, output);
      write(reader.wblank, output);
      // NB. ^$ yields a readings vector of length 1 whose single item is empty,
      // while EOF should yield length 0. (We *want* to keep ^$ in the stream,
      // but must not print an extra ^$ when there was none.)
      if (!reader.readings.empty()) {
        u_fputc('^', output);
        for (size_t i = 0; i < reader.readings.size(); i++) {
          const StreamReader::Reading &reading = reader.readings[i];
          if (i > 0) {
            u_fputc('/', output);
          }
          if (reading.mark != '\0') {
            u_fputc(reading.mark, output);
          }
          write(reading.content, output);
        }
        u_fputc('$', output);
      }
    }
    else {
      if (merged.empty()) {
        // The initial blank should just be output before the merged LU.
        write(reader.blank, output);
        write(reader.wblank, output);
      }
      else {
        // Later blanks become part of the merged text.
        merged += reader.blank;
        appendEscaped(merged, reader.wblank);
        if (!reader.wblank.empty()) {
          // Otherwise cg-proc will complain that the wordblank is not prior to a token.
          appendEscaped(merged, "^$"_u);
        }
      }
      if (!reader.readings.empty()) {
        // Only the content of the first reading is kept, which drops possible
        // unknown marks. The form is double-escaped since lt-unmerge unescapes it.
        appendEscaped(merged, reader.readings[0].content);
      }
    }

    if (!saw_end && !reader.at_null) {
      continue;
    }

    // End of a merge run (or a null flush): emit whatever was collected.
    if (in_merge) {
      u_fputc('^', output);
      write(merged, output);
      u_fputc('/', output);
      write(merged, output);
      write("<MERGED>$"_u, output);
      in_merge = false;
    }
    merged.clear();
    if (reader.at_null) {
      u_fputc('\0', output);
      u_fflush(output);
    }
  }
}


/**
 * Copy the stream from `input` to `output`, replacing every lexical unit (LU)
 * that has a reading containing <MERGED> by the content of its last reading
 * with one level of backslash escaping removed (written without the
 * surrounding ^...$). All other LUs are written back out with all their
 * readings.
 *
 * @param input  stream to read LUs from
 * @param output stream the result is written to
 */
void
FSTProcessor::quoteUnmerge(InputFile &input, UFILE *output)
{
  StreamReader reader(&input);
  reader.alpha = &alphabet;
  reader.add_unknowns = true;

  while (!reader.at_eof) {
    reader.next();

    bool unmerging = false;
    // TODO: look up in it.symbols instead (but need to make an alphabet then)
    for (const StreamReader::Reading &it : reader.readings) {
      if (it.content.find(u"<MERGED>") != UString::npos) {
        unmerging = true;
        break;
      }
    }

    write(reader.blank, output);
    write(reader.wblank, output);

    if (unmerging) {
      // Just output the last reading (surface form), removing one level of
      // escaping. There is at least one reading, or the loop above could not
      // have set `unmerging`.
      const StreamReader::Reading &lastReading = reader.readings.back();
      UString surface;
      surface.reserve(lastReading.content.size());
      bool escaping = false;
      for (const auto c : lastReading.content) {
        if (!escaping && c == u'\\') {
          // Drop this backslash; the character after it is copied verbatim.
          escaping = true;
          continue;
        }
        surface += c;
        escaping = false;
      }
      write(surface, output);
    }
    else if (!reader.readings.empty()) {
      // NB. ^$ yields a readings vector of length 1 whose single item is empty,
      // while EOF should yield length 0. (We *want* to keep ^$ in the stream,
      // but must not print an extra ^$ when there was none.)
      u_fputc('^', output);
      bool seen_reading = false;
      for (const StreamReader::Reading &it : reader.readings) {
        if (seen_reading) {
          u_fputc('/', output);
        }
        if (it.mark != '\0') {
          u_fputc(it.mark, output);
        }
        write(it.content, output);
        seen_reading = true;
      }
      u_fputc('$', output);
    }

    if (reader.at_null) {
      u_fputc('\0', output);
      u_fflush(output);
    }
  }
}

bool
FSTProcessor::valid() const
Expand Down
16 changes: 16 additions & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,11 @@ class FSTProcessor
*/
int maxWeightClasses = INT_MAX;

/**
 * The alphabet index of the tag <ANY_CHAR>.
 * Assigned in load(); defaults to 0 so that it never holds an indeterminate
 * value in a processor on which load() has not been called.
 */
int any_char = 0;

/**
* Prints an error of input stream and exits
*/
Expand Down Expand Up @@ -462,6 +467,15 @@ class FSTProcessor
}
}

/**
 * Append `from` to `to`, putting a backslash in front of every character
 * that is a member of escaped_chars.
 * @param to   string that is extended in place
 * @param from text to append
 */
void appendEscaped(UString& to, const UString& from) {
  for (const auto ch : from) {
    const bool needs_escape = escaped_chars.find(ch) != escaped_chars.end();
    if (needs_escape) {
      to.push_back(u'\\');
    }
    to.push_back(ch);
  }
}

public:

/*
Expand Down Expand Up @@ -496,6 +510,8 @@ class FSTProcessor
UString biltrans(UStringView input_word, bool with_delim = true);
UString biltransfull(UStringView input_word, bool with_delim = true);
void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown);
void quoteMerge(InputFile& input, UFILE *output);
void quoteUnmerge(InputFile& input, UFILE *output);
std::pair<UString, int> biltransWithQueue(UStringView input_word, bool with_delim = true);
UString biltransWithoutQueue(UStringView input_word, bool with_delim = true);
void SAO(InputFile& input, UFILE *output);
Expand Down
40 changes: 40 additions & 0 deletions lttoolbox/lt-merge.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
.Dd December 10, 2024
.Dt LT-MERGE 1
.Os Apertium
.Sh NAME
.Nm lt-merge
.Nd lexical merger for Apertium
.Sh SYNOPSIS
.Nm lt-merge
.Op Fl u
.Op Ar input_file Op Ar output_file
.Sh DESCRIPTION
.Nm lt-merge
is the application responsible for merging and unmerging
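lexical units.
.Pp
It reads an Apertium stream and joins every run of lexical units, from the one tagged <MERGE_BEG> through the one tagged <MERGE_END>, into a single lexical unit tagged <MERGED>.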
.Sh OPTIONS
.Bl -tag -width Ds
.It Fl u , Fl Fl unmerge
Run in reverse: split previously merged words apart again.
.It Fl v , Fl Fl version
Display the version number.
.It Fl h , Fl Fl help
Display this help.
.El
\" .Sh FILES
\" .Bl -tag -width Ds
\" .It Ar input_file
\" The input compiled dictionary.
\" .El
.Sh SEE ALSO
.Xr apertium 1 ,
.Xr lt-proc 1
.Sh COPYRIGHT
Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante.
This is free software.
You may redistribute copies of it under the terms of
.Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License .
.Sh BUGS
Many... lurking in the dark and waiting for you!
53 changes: 53 additions & 0 deletions lttoolbox/lt_merge.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (C) 2024 Universitat d'Alacant / Universidad de Alicante
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/
#include <lttoolbox/fst_processor.h>
#include <lttoolbox/file_utils.h>
#include <lttoolbox/cli.h>
#include <lttoolbox/lt_locale.h>
#include <iostream>


/**
 * Entry point of lt-merge.
 *
 * Usage: lt-merge [-u] [input_file [output_file]]
 * Without -u, runs FSTProcessor::quoteMerge on the input; with -u, runs
 * FSTProcessor::quoteUnmerge. When no input_file is given, the
 * default-constructed InputFile is used as-is.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();
  CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION);
  cli.add_file_arg("input_file");
  cli.add_file_arg("output_file");
  cli.add_bool_arg('u', "unmerge", "Undo the merge");
  cli.add_bool_arg('z', "null-flush", "flush output on the null character");
  cli.parse_args(argc, argv);

  const bool unmerge = cli.get_bools()["unmerge"];
  const auto& files = cli.get_files();

  InputFile input;
  // The guard must look at the *input* path (index 0). Testing the output
  // path (index 1) made `lt-merge input_file` ignore the named file.
  if (!files[0].empty()) {
    input.open_or_exit(files[0].c_str());
  }
  UFILE* output = openOutTextFile(files[1]);

  FSTProcessor fstp;
  // Null flush is switched on unconditionally; the -z flag is accepted but
  // its value is not consulted (cf. description of cli["null-flush"]).
  fstp.setNullFlush(true);
  fstp.initBiltrans();
  if (unmerge) {
    fstp.quoteUnmerge(input, output);
  }
  else {
    fstp.quoteMerge(input, output);
  }

  return 0;
}
6 changes: 3 additions & 3 deletions lttoolbox/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ class State

/**
* Make a transition, but overriding the output symbol
* @param input symbol
* @param output symbol we expect to appear
* @param output symbol we want to appear
* @param input symbol read from infile
* @param output symbol from the FST
* @param output symbol we want to appear in outfile
*/
void apply_override(int const input, int const old_sym, int const new_sym);

Expand Down
20 changes: 20 additions & 0 deletions tests/data/pass-through.lsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<dictionary type="separable">
<alphabet></alphabet>
<sdefs>
<sdef n="MERGED"/>
</sdefs>

<pardefs>
<pardef n="foo">
<e> <i>foo<d/></i> </e>
</pardef>
</pardefs>

<section id="main" type="standard">

<e c="pass-through MERGED words">
<i><w/><s n="MERGED"/></i>
</e>
</section>
</dictionary>
Loading
Loading