diff --git a/warc-indexer/pom.xml b/warc-indexer/pom.xml
index aca9b32e..f15e92d7 100644
--- a/warc-indexer/pom.xml
+++ b/warc-indexer/pom.xml
@@ -315,10 +315,16 @@
1.1.0
-
- org.apache.lucene
- lucene-core
- 8.7.0
-
+
+ org.apache.lucene
+ lucene-core
+ 8.7.0
+
+
+
+ com.carrotsearch
+ langid-java
+ 1.0.0
+
diff --git a/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java b/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java
index d81ef3f6..bd77280e 100644
--- a/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java
+++ b/warc-indexer/src/main/java/uk/bl/wa/analyser/text/LanguageAnalyser.java
@@ -3,11 +3,11 @@
*/
package uk.bl.wa.analyser.text;
-/*
+/*-
* #%L
* warc-indexer
* %%
- * Copyright (C) 2013 - 2023 The webarchive-discovery project contributors
+ * Copyright (C) 2013 - 2024 The webarchive-discovery project contributors
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
@@ -25,12 +25,12 @@
* #L%
*/
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.language.detect.LanguageDetector;
-import org.apache.tika.language.detect.LanguageResult;
-import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
+import com.carrotsearch.labs.langid.DetectedLanguage;
+import com.carrotsearch.labs.langid.LangIdV3;
import com.typesafe.config.Config;
import uk.bl.wa.solr.SolrFields;
@@ -38,43 +38,50 @@
import uk.bl.wa.util.Instrument;
/**
- * @author anj
+ * @author Toth
*
*/
-public class LanguageAnalyser extends AbstractTextAnalyser {
+public class LanguageAnalyser extends AbstractTextAnalyser
+{
private Logger log = LoggerFactory.getLogger(LanguageAnalyser.class);
- /** */
- private LanguageDetector ld;
+ // langid-java classifier used by analyse(); instantiated in configure().
+ private LangIdV3 langid;
/**
* @param conf
*/
- public void configure(Config conf) {
+ public void configure(Config conf)
+ {
setEnabled(!conf.hasPath("warc.index.extract.content.language.enabled")
- || conf.getBoolean(
- "warc.index.extract.content.language.enabled"));
- ld = new OptimaizeLangDetector().loadModels();
- log.info(
- "Constructed language analyzer with enabled = " + isEnabled());
+ || conf.getBoolean("warc.index.extract.content.language.enabled"));
+ // Created unconditionally (also when disabled). NOTE(review): confirm LangIdV3 is safe if analyse() runs concurrently.
+ this.langid = new LangIdV3();
+
+ log.debug("Constructed language analyzer with enabled = " + isEnabled());
}
- /* (non-Javadoc)
- * @see uk.bl.wa.analyser.text.TextAnalyser#analyse(java.lang.String, uk.bl.wa.util.solr.SolrRecord)
- */
@Override
- public void analyse(String text, SolrRecord solr) {
+ public void analyse(String text, SolrRecord solr)
+ {
final long start = System.nanoTime();
- try {
- LanguageResult li = ld.detect(text);
- if (li != null) {
- solr.addField(SolrFields.CONTENT_LANGUAGE, li.getLanguage());
+ // Classify the text and, when a result is returned, add its language code to the Solr record.
+ try
+ {
+ DetectedLanguage result = langid.classify(text, true); // 'true' presumably = normalizeConfidence in langid-java — TODO confirm
+
+ if (result != null)
+ {
+ solr.addField(SolrFields.CONTENT_LANGUAGE, result.getLangCode());
}
- } catch (IllegalArgumentException e) {
- log.error("Exception when determining language of this item: "
- + e.getMessage(), e);
+ }
+ catch (IllegalArgumentException e)
+ {
+ log.error("Exception when determining language of this item: " + e.getMessage(), e);
solr.addParseException(e);
}
+ // Reached after both normal completion and a caught IllegalArgumentException.
Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", start);
}
+
}