Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nl] new hybrid disambiguator #10111

Merged
merged 6 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
import org.languagetool.synthesis.nl.DutchSynthesizer;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tagging.disambiguation.rules.XmlRuleDisambiguator;
import org.languagetool.tagging.nl.DutchTagger;
import org.languagetool.tagging.nl.DutchHybridDisambiguator;
import org.languagetool.tokenizers.*;
import org.languagetool.tokenizers.nl.DutchWordTokenizer;

Expand Down Expand Up @@ -91,7 +91,7 @@ public Tokenizer createDefaultWordTokenizer() {

/**
 * Creates the default disambiguator, built from the default language variant.
 *
 * NOTE(review): this excerpt appears to be a flattened diff hunk (no +/- markers),
 * which is why two return statements follow each other. Confirm against the real file.
 */
@Override
public Disambiguator createDefaultDisambiguator() {
// NOTE(review): presumably the pre-change line (removed by this change) - confirm.
return new XmlRuleDisambiguator(getDefaultLanguageVariant());
// NOTE(review): presumably the replacement line (added by this change) - confirm.
return new DutchHybridDisambiguator(getDefaultLanguageVariant());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2024 Jaume Ortolà
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/

package org.languagetool.tagging.nl;

import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.tagging.disambiguation.AbstractDisambiguator;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tagging.disambiguation.MultiWordChunker;
import org.languagetool.tagging.disambiguation.rules.XmlRuleDisambiguator;

import java.io.IOException;

/**
* Hybrid chunker-disambiguator for Dutch
*
* @author Jaume Ortolà
*/
public class DutchHybridDisambiguator extends AbstractDisambiguator {

  /** Multiword chunker backed by the Dutch resource {@code /nl/multiwords.txt}. */
  private final MultiWordChunker chunker;
  /** Multiword chunker backed by the shared resource {@code /spelling_global.txt}. */
  private final MultiWordChunker chunkerGlobal;
  /** Rule-based disambiguator that runs after both chunkers. */
  private final Disambiguator disambiguator;

  /**
   * Builds the three stages of the pipeline: the Dutch multiword chunker, the
   * global multiword chunker and the XML rule disambiguator for {@code lang}.
   * Both chunkers are switched to {@code setIgnoreSpelling(true)}.
   *
   * @param lang language handed to the {@link XmlRuleDisambiguator}
   */
  public DutchHybridDisambiguator(Language lang) {
    chunker = new MultiWordChunker("/nl/multiwords.txt", true, true, false, MultiWordChunker.tagForNotAddingTags);
    chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false, MultiWordChunker.tagForNotAddingTags);
    disambiguator = new XmlRuleDisambiguator(lang, true);
    chunker.setIgnoreSpelling(true);
    chunkerGlobal.setIgnoreSpelling(true);
  }

  /**
   * Disambiguates {@code input} without a cancellation callback; delegates to
   * {@link #disambiguate(AnalyzedSentence, JLanguageTool.CheckCancelledCallback)}
   * with {@code null}.
   */
  @Override
  public AnalyzedSentence disambiguate(AnalyzedSentence input) throws IOException {
    return disambiguate(input, null);
  }

  /**
   * Runs the sentence through three stages in order: (1) the global multiword
   * chunker, (2) the Dutch multiword chunker, (3) the rule-based disambiguator.
   * Each stage receives the output of the previous one and the same callback.
   *
   * @param input sentence to disambiguate
   * @param checkCanceled callback forwarded unchanged to every stage; may be {@code null}
   * @return the sentence as returned by the rule-based disambiguator
   * @throws IOException if any of the stages throws it
   */
  @Override
  public final AnalyzedSentence disambiguate(AnalyzedSentence input, @Nullable JLanguageTool.CheckCancelledCallback checkCanceled) throws IOException {
    AnalyzedSentence afterGlobalChunker = chunkerGlobal.disambiguate(input, checkCanceled);
    AnalyzedSentence afterDutchChunker = chunker.disambiguate(afterGlobalChunker, checkCanceled);
    return disambiguator.disambiguate(afterDutchChunker, checkCanceled);
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -1303,7 +1303,7 @@ Copyright (C) 2008-2024 Ruud Baars
<rule name="PETER_X" id="PETER_X">
<!-- meneer Teach -->
<pattern>
<token regexp="yes" case_sensitive="yes" postag="ENM:PER:FST">[A-Z][a-z].*</token>
<token regexp="yes" case_sensitive="yes" postag="ENM:PER:FST">[A-Z][a-z].*<exception regexp="yes">Batman|Disney</exception></token>
<marker>
<token regexp="yes" case_sensitive="yes" postag="UNKNOWN">[A-Z][a-z].*</token>
</marker>
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
Dilan Yeşilgöz
Dilan Yeşilgöz-Zegerius
Sri Lankaan
Sri Lankaans
Sri Lankaanse
Sierra Leoner
Sierra Leoons
Sierra Leoonse
tot ziens
Anna Boleyn
Original file line number Diff line number Diff line change
Expand Up @@ -6587,7 +6587,7 @@ Google Analytics
Ten opzichte
ter harte
Ter harte
tot ziens
#tot ziens
Tot ziens
Amnesty International
Tel Aviv
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ public void testSpeller() throws IOException {
assertEquals(0, rule.match(lt.getAnalyzedSentence("déúr")).length);
assertEquals(1, rule.match(lt.getAnalyzedSentence("déur")).length);
assertEquals(0, rule.match(lt.getAnalyzedSentence("deur-knop")).length);

assertEquals(0, rule.match(lt.getAnalyzedSentence("Hartelijke groet en hopelijk tot ziens!")).length);

//unknown followed by EN, should be accepted as it's not in EN dict
assertEquals(1, rule.match(lt.getAnalyzedSentence("Deze duifkuiker was vlak onder de oever aan het jagen.")).length);
// unknown followed by EN, should get detected as it's in disambig entity
Expand Down