diff --git a/languagetool-language-modules/nl/src/main/java/org/languagetool/language/Dutch.java b/languagetool-language-modules/nl/src/main/java/org/languagetool/language/Dutch.java index c6e5c3f37631..cfb71e8e0754 100644 --- a/languagetool-language-modules/nl/src/main/java/org/languagetool/language/Dutch.java +++ b/languagetool-language-modules/nl/src/main/java/org/languagetool/language/Dutch.java @@ -30,8 +30,8 @@ import org.languagetool.synthesis.nl.DutchSynthesizer; import org.languagetool.tagging.Tagger; import org.languagetool.tagging.disambiguation.Disambiguator; -import org.languagetool.tagging.disambiguation.rules.XmlRuleDisambiguator; import org.languagetool.tagging.nl.DutchTagger; +import org.languagetool.tagging.nl.DutchHybridDisambiguator; import org.languagetool.tokenizers.*; import org.languagetool.tokenizers.nl.DutchWordTokenizer; @@ -91,7 +91,7 @@ public Tokenizer createDefaultWordTokenizer() { @Override public Disambiguator createDefaultDisambiguator() { - return new XmlRuleDisambiguator(getDefaultLanguageVariant()); + return new DutchHybridDisambiguator(getDefaultLanguageVariant()); } @Override diff --git a/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchHybridDisambiguator.java b/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchHybridDisambiguator.java new file mode 100644 index 000000000000..93a246d17ef5 --- /dev/null +++ b/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchHybridDisambiguator.java @@ -0,0 +1,67 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2024 Jaume Ortolà + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package org.languagetool.tagging.nl; + +import org.jetbrains.annotations.Nullable; +import org.languagetool.AnalyzedSentence; +import org.languagetool.JLanguageTool; +import org.languagetool.Language; +import org.languagetool.tagging.disambiguation.AbstractDisambiguator; +import org.languagetool.tagging.disambiguation.Disambiguator; +import org.languagetool.tagging.disambiguation.MultiWordChunker; +import org.languagetool.tagging.disambiguation.rules.XmlRuleDisambiguator; + +import java.io.IOException; + +/** + * Hybrid chunker-disambiguator for Dutch + * + * @author Jaume Ortolà + */ +public class DutchHybridDisambiguator extends AbstractDisambiguator { + + private final MultiWordChunker chunker = new MultiWordChunker("/nl/multiwords.txt", true, true, false, MultiWordChunker.tagForNotAddingTags); + private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false, MultiWordChunker.tagForNotAddingTags); + private final Disambiguator disambiguator; + + @Override + public AnalyzedSentence disambiguate(AnalyzedSentence input) throws IOException { + return disambiguate(input, null); + } + + /** + * Chains three disambiguation steps: the two multiword chunkers, then the XML rule-based 
+ */ + + public DutchHybridDisambiguator(Language lang) { + disambiguator = new XmlRuleDisambiguator(lang, true); + chunker.setIgnoreSpelling(true); + chunkerGlobal.setIgnoreSpelling(true); + } + + @Override + public final AnalyzedSentence disambiguate(AnalyzedSentence input, @Nullable JLanguageTool.CheckCancelledCallback checkCanceled) throws IOException { + return disambiguator.disambiguate(chunker.disambiguate(chunkerGlobal.disambiguate(input, checkCanceled), checkCanceled), checkCanceled); + } + + + +} diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/disambiguation.xml b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/disambiguation.xml index 2cecaceddb97..8dbe7026b286 100644 --- a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/disambiguation.xml +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/disambiguation.xml @@ -1303,7 +1303,7 @@ Copyright (C) 2008-2024 Ruud Baars - [A-Z][a-z].* + [A-Z][a-z].*Batman|Disney [A-Z][a-z].* diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/multiwords.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/multiwords.txt index 4bf1ac36ae6a..b6cb39a96826 100644 --- a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/multiwords.txt +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/multiwords.txt @@ -1 +1,10 @@ +Dilan Yeşilgöz +Dilan Yeşilgöz-Zegerius +Sri Lankaan +Sri Lankaans +Sri Lankaanse +Sierra Leoner +Sierra Leoons +Sierra Leoonse +tot ziens Anna Boleyn \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/spelling/spelling.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/spelling/spelling.txt index 499d600ddf2f..4c8806487908 100644 
--- a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/spelling/spelling.txt +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/spelling/spelling.txt @@ -6587,7 +6587,7 @@ Google Analytics Ten opzichte ter harte Ter harte -tot ziens +#tot ziens Tot ziens Amnesty International Tel Aviv diff --git a/languagetool-language-modules/nl/src/test/java/org/languagetool/rules/nl/MorfologikDutchSpellerRuleTest.java b/languagetool-language-modules/nl/src/test/java/org/languagetool/rules/nl/MorfologikDutchSpellerRuleTest.java index e95bfc6b3f90..d5ae4bfd5507 100644 --- a/languagetool-language-modules/nl/src/test/java/org/languagetool/rules/nl/MorfologikDutchSpellerRuleTest.java +++ b/languagetool-language-modules/nl/src/test/java/org/languagetool/rules/nl/MorfologikDutchSpellerRuleTest.java @@ -55,6 +55,9 @@ public void testSpeller() throws IOException { assertEquals(0, rule.match(lt.getAnalyzedSentence("déúr")).length); assertEquals(1, rule.match(lt.getAnalyzedSentence("déur")).length); assertEquals(0, rule.match(lt.getAnalyzedSentence("deur-knop")).length); + + assertEquals(0, rule.match(lt.getAnalyzedSentence("Hartelijke groet en hopelijk tot ziens!")).length); + //unknown followed by EN, should be accepted as it's not in EN dict assertEquals(1, rule.match(lt.getAnalyzedSentence("Deze duifkuiker was vlak onder de oever aan het jagen.")).length); // unknown followed by EN, should get detected as it's in disambig entity