Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hardcoded sanity-max state size for case-insensitive matching #168

Merged
merged 1 commit into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ FSTProcessor::compoundAnalysis(UString input_word)
{
UChar val=input_word[i];

current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));

if(current_state.size() > MAX_COMBINATIONS)
{
Expand Down Expand Up @@ -1068,7 +1068,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
{
rcx_map_ptr = rcx_map.find(val);
std::set<int> tmpset = rcx_map_ptr->second;
if(!u_isupper(val) || caseSensitive)
if(!u_isupper(val) || beCaseSensitive(current_state))
{
current_state.step(val, tmpset);
}
Expand All @@ -1087,7 +1087,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
}
else
{
current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));
}

if(current_state.size() != 0)
Expand Down Expand Up @@ -1580,7 +1580,7 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
alphabet.getSymbol(sf,val);
if(current_state.size() > 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !(beCaseSensitive(current_state)))
{
if(mode == gm_carefulcase)
{
Expand Down Expand Up @@ -1621,7 +1621,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
size_t cur_word = 0;
size_t cur_pos = 0;
size_t match_pos = 0;
current_state = initial_state;
State current_state = initial_state;
UString last_match;
int space_diff = 0;

Expand Down Expand Up @@ -1712,7 +1712,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
}
}

current_state.step_case_override(sym, caseSensitive);
current_state.step_case_override(sym, beCaseSensitive(current_state));

if (current_state.size() == 0 || is_end) {
if (last_match.empty()) {
Expand Down Expand Up @@ -1866,7 +1866,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2019,7 +2019,7 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2277,7 +2277,7 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
}
if(current_state.size() != 0)
{
current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));
}
if(current_state.isFinal(all_finals))
{
Expand Down Expand Up @@ -2376,7 +2376,7 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2541,7 +2541,7 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
}
if(current_state.size() != 0)
{
if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive)
if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state))
{
current_state.step(val, u_tolower(val));
}
Expand Down Expand Up @@ -2744,7 +2744,7 @@ FSTProcessor::SAO(InputFile& input, UFILE *output)
last = input_buffer.getPos();
}

current_state.step_case(val, caseSensitive);
current_state.step_case(val, beCaseSensitive(current_state));

if(current_state.size() != 0)
{
Expand Down
18 changes: 13 additions & 5 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ class FSTProcessor
*/
std::map<UString, TransExe> transducers;

/**
* Current state of lexical analysis
*/
State current_state;

/**
* Initial state of every token
*/
Expand Down Expand Up @@ -443,6 +438,19 @@ class FSTProcessor
bool isLastBlankTM = false;

xmlTextReaderPtr reader;

static constexpr size_t max_case_insensitive_state_size = 65536;
/**
 * Decide whether the next step should be matched case-sensitively.
 *
 * Including lowercased versions for every character can potentially create
 * very large states (see https://github.com/apertium/lttoolbox/issues/167 ).
 * As a sanity check, case-insensitive matching is given up once the state
 * size has reached max_case_insensitive_state_size.
 *
 * @param state the state whose size is compared against the limit
 * @return true when running with --case-sensitive (caseSensitive is set),
 *         or when state.size() is at least max_case_insensitive_state_size
 */
bool beCaseSensitive(const State& state) {
  if(caseSensitive)
  {
    // Explicitly requested: the size of the state does not matter.
    return true;
  }
  // Otherwise fall back to case-sensitive matching only for oversized states.
  return state.size() >= max_case_insensitive_state_size;
}

public:

/*
Expand Down
2 changes: 1 addition & 1 deletion lttoolbox/state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ State::copy(State const &s)
}
}

int
size_t
State::size() const
{
return state.size();
Expand Down
2 changes: 1 addition & 1 deletion lttoolbox/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class State
* Number of alive transductions
* @return the size
*/
int size() const;
size_t size() const;

/**
* step = apply + epsilonClosure
Expand Down