diff --git a/warc-indexer/pom.xml b/warc-indexer/pom.xml index aca9b32e..f15e92d7 100644 --- a/warc-indexer/pom.xml +++ b/warc-indexer/pom.xml @@ -315,10 +315,16 @@ 1.1.0 - - org.apache.lucene - lucene-core - 8.7.0 - + + org.apache.lucene + lucene-core + 8.7.0 + + + + com.carrotsearch + langid-java + 1.0.0 + diff --git a/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java b/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java index d81ef3f6..bd77280e 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java +++ b/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java @@ -3,11 +3,11 @@ */ package uk.bl.wa.analyser.text; -/* +/*- * #%L * warc-indexer * %% - * Copyright (C) 2013 - 2023 The webarchive-discovery project contributors + * Copyright (C) 2013 - 2024 The webarchive-discovery project contributors * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as @@ -25,12 +25,12 @@ * #L% */ + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.language.detect.LanguageDetector; -import org.apache.tika.language.detect.LanguageResult; -import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector; +import com.carrotsearch.labs.langid.DetectedLanguage; +import com.carrotsearch.labs.langid.LangIdV3; import com.typesafe.config.Config; import uk.bl.wa.solr.SolrFields; @@ -38,43 +38,50 @@ import uk.bl.wa.util.Instrument; /** - * @author anj + * @author Toth * */ -public class LanguageAnalyser extends AbstractTextAnalyser { +public class LanguageAnalyser extends AbstractTextAnalyser +{ private Logger log = LoggerFactory.getLogger(LanguageAnalyser.class); - /** */ - private LanguageDetector ld; + // The language detection model + private LangIdV3 langid; /** * @param conf */ - public void configure(Config conf) { + public void configure(Config conf) + { setEnabled(!conf.hasPath("warc.index.extract.content.language.enabled") - || conf.getBoolean( - "warc.index.extract.content.language.enabled")); - ld = new OptimaizeLangDetector().loadModels(); - log.info( - "Constructed language analyzer with enabled = " + isEnabled()); + || conf.getBoolean("warc.index.extract.content.language.enabled")); + + this.langid = new LangIdV3(); + + log.debug("Constructed language analyzer with enabled = " + isEnabled()); } - /* (non-Javadoc) - * @see uk.bl.wa.analyser.text.TextAnalyser#analyse(java.lang.String, uk.bl.wa.util.solr.SolrRecord) - */ @Override - public void analyse(String text, SolrRecord solr) { + public void analyse(String text, SolrRecord solr) + { final long start = System.nanoTime(); - try { - LanguageResult li = ld.detect(text); - if (li != null) { - solr.addField(SolrFields.CONTENT_LANGUAGE, li.getLanguage()); + + try + { + DetectedLanguage result = langid.classify(text, true); + + if (result != null) + { + solr.addField(SolrFields.CONTENT_LANGUAGE, result.getLangCode()); } - } catch (IllegalArgumentException e) { - log.error("Exception when determining language of this item: " - + e.getMessage(), e); + } + catch (IllegalArgumentException e) + { + log.error("Exception when determining language of this item: " + e.getMessage(), e); solr.addParseException(e); } + Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", start); } + }