Commit
update language detection dependency
jiemakel committed May 3, 2016
1 parent 52029d1 commit 87dc01d
Showing 3 changed files with 30 additions and 45 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -3,7 +3,7 @@ Lexical Analysis Command-Line Tool for lemmatizing, lexical analysis and languag

Program help:
```
-las 1.3.0
+las 1.4.1
Usage: las [lemmatize|analyze|inflect|recognize|identify] [options] [<file>...]
Command: lemmatize
@@ -15,7 +15,7 @@ Command: inflect
Command: recognize
report recognition rate (locales: de, en, fi, fr, it, liv, mdf, mhr, mrj, myv, sme, sv, tr, udm, la
Command: identify
-identify language (locales: hy, fi, no, lb, hr, ta, ka, ar, fr, is, ug, lv, eu, am, mt, bn, uz, dk, uk, si, ky, pa, ga, tt, so, pt, cs, fr, gn, sr, mrj, el, it, ca, os, vi, yo, dv, tl, nl, bg, ko, liv, tk, it, mk, et, af, de, ru, yi, cy, en, udm, ur, ln, mdf, jv, myv, sme, ru, ml, th, id, pnb, sq, sv, de, sv, tr, da, my, zh-tw, en, gu, he, es, kn, sk, az, lij, es, fo, hi, te, mr, sw, be, qu, pt, nl, mi, ja, zh-cn, fi, bo, ro, mhr, ne, lt, no, km, kk, fa, mn, hu, pl, la, tr)
+identify language (locales: zh-TW, fi, no, hr, ta, ar, fr, is, lv, eu, mt, bn, dk, uk, pa, ga, br, so, pt, cs, fr, gl, sr, zh-CN, mrj, el, it, ca, vi, tl, nl, bg, ko, liv, it, mk, oc, et, af, de, ru, yi, cy, en, udm, ur, mdf, myv, sme, ru, ht, ml, th, id, sq, sv, de, sv, tr, da, en, gu, he, es, kn, sk, es, hi, te, mr, an, sw, be, pt, nl, ja, ast, fi, ro, mhr, ne, lt, no, km, sl, fa, ms, hu, pl, la, tr)
--locale <value>
possible locales
--forms <value>
@@ -26,11 +26,12 @@ identify language (locales: hy, fi, no, lb, hr, ta, ka, ar, fr, is, ug, lv, eu,
Don't guess baseforms for unknown words?
--no-segment-guessed
Don't guess segmentation information for guessed words (speeds up processing significantly)?
+--max-edit-distance <value>
+Maximum edit distance for error-correcting unidentified words (default 0)?
--no-pretty
-Don't pretty print analysis json in file output?
+Don't pretty print json?
<file>...
files to process (stdin if not given)
--help
prints this usage text
```
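For orientation (not part of this commit): the identify command documented above reads the given files, or standard input when no file is given, and reports the identified language. A hypothetical invocation against a text file would be:

```
las identify input.txt
```

The new --max-edit-distance option defaults to 0, so no error correction of unidentified words is attempted unless it is raised.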
21 changes: 7 additions & 14 deletions build.sbt
@@ -1,23 +1,16 @@
name := """las-cl"""

version := "1.3.0"
version := "1.4.1"

scalaVersion := "2.11.5"

// Change this to another test framework if you prefer
libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.4" % "test"

// Uncomment to use Akka
//libraryDependencies += "com.typesafe.akka" % "akka-actor_2.11" % "2.3.9"
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
"fi.seco" % "lexicalanalysis" % "1.4.3",
"com.cybozu.labs" % "langdetect" % "1.2.2" exclude("net.arnx.jsonic", "jsonic"),
"net.arnx" % "jsonic" % "1.3.0", //langdetect pulls in ancient unavailable version
"com.github.scopt" %% "scopt" % "3.3.0",
"com.typesafe.play" %% "play-json" % "2.3.4",
"com.typesafe.scala-logging" %% "scala-logging" % "3.1.0",
"ch.qos.logback" % "logback-classic" % "1.1.2" % "runtime"
"com.optimaize.languagedetector" % "language-detector" % "0.5",
"com.github.scopt" %% "scopt" % "3.4.0",
"com.typesafe.play" %% "play-json" % "2.5.3",
"com.typesafe.scala-logging" %% "scala-logging" % "3.4.0",
"ch.qos.logback" % "logback-classic" % "1.1.7" % "runtime"
)
resolvers ++= Seq(
"Local Maven Repository" at Path.userHome.asFile.toURI.toURL + ".m2/repository")
45 changes: 18 additions & 27 deletions src/main/scala/LASCommandLineTool.scala
@@ -18,15 +18,26 @@ import play.api.libs.json.Json
import fi.seco.lexical.hfst.HFSTLexicalAnalysisService.WordToResults
import play.api.libs.json.Writes
import java.util.Collections
-import com.cybozu.labs.langdetect.LangDetectException
+import com.optimaize.langdetect.LanguageDetectorBuilder
+import com.optimaize.langdetect.ngram.NgramExtractors
+import com.optimaize.langdetect.profiles.LanguageProfileReader
+import com.optimaize.langdetect.text.CommonTextObjectFactories

object LASCommandLineTool {

lazy val hfstlas = new HFSTLexicalAnalysisService
lazy val combinedlas = new CombinedLexicalAnalysisService
lazy val snowballlas = new SnowballLexicalAnalysisService
lazy val compoundlas = new CompoundLexicalAnalysisService(combinedlas, snowballlas)


+object LanguageDetector extends LazyLogging {
+val languageProfiles = new LanguageProfileReader().readAllBuiltIn()
+val supportedLanguages = languageProfiles.map(_.getLocale.toString())
+val detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles).build()
+val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText()
+def apply(text: String) = detector.getProbabilities(textObjectFactory.forText(text))
+}

object Action extends Enumeration {
type Action = Value
val Inflect, Lemmatize, Analyze, Detect, Recognize = Value
@@ -44,7 +55,7 @@ object LASCommandLineTool {

def main(args: Array[String]) = {
val parser = new scopt.OptionParser[Config]("las") {
head("las", "1.4.0")
head("las", "1.4.1")
cmd("lemmatize") action { (_, c) =>
c.copy(action = Action.Lemmatize)
} text (s"(locales: ${compoundlas.getSupportedBaseformLocales.mkString(", ")})")
@@ -236,20 +247,15 @@ object LASCommandLineTool {
def getBestLang(text: String, locales: Seq[String]): Option[String] = {
if (locales.isEmpty) {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text)).map(r => Map(r.getLang() -> r.getIndex))
-val detector = LanguageDetector()
-detector.append(text)
-val ldResult = detector.getProbabilities().map(l => Map(l.lang -> l.prob))
+val ldResult = Try(LanguageDetector(text).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = hfstlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
val hfstResult = hfstResultTmp.map(p => Map(p._1 -> p._2 / tc))
Try(Some((ldResult ++ hfstResult ++ lrResult).groupBy(_.keysIterator.next).mapValues(_.foldRight(0.0) { (p, r) => r + p.valuesIterator.next } / 3.0).maxBy(_._2)._1)).getOrElse(None)
} else {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text, locales: _*)).map(r => Map(r.getLang() -> r.getIndex))
-val detector = LanguageDetector()
-detector.setPriorMap(new HashMap(mapAsJavaMap(locales.map((_, new java.lang.Double(1.0))).toMap)))
-detector.append(text)
-val ldResult = detector.getProbabilities().map(l => Map(l.lang -> l.prob))
+val ldResult = Try(LanguageDetector(text).filter(d => locales.contains(d.getLocale.toString)).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = locales.map(new Locale(_)).intersect(hfstlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
@@ -261,10 +267,7 @@
def identify(text: String, locales: Seq[String]): Option[String] = {
if (!locales.isEmpty) {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text, locales: _*)).map(r => Map(r.getLang() -> r.getIndex))
-val detector = LanguageDetector()
-detector.setPriorMap(new HashMap(mapAsJavaMap(locales.map((_, new java.lang.Double(1.0))).toMap)))
-detector.append(text)
-val ldResult = Try(detector.getProbabilities().map(l => Map(l.lang -> l.prob))).getOrElse(Seq.empty)
+val ldResult = Try(LanguageDetector(text).filter(d => locales.contains(d.getLocale.toString)).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = locales.map(new Locale(_)).intersect(hfstlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
@@ -276,9 +279,7 @@
}
} else {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text)).map(r => Map(r.getLang() -> r.getIndex))
-val detector = LanguageDetector()
-detector.append(text)
-val ldResult = Try(detector.getProbabilities().map(l => Map(l.lang -> l.prob))).getOrElse(Seq.empty)
+val ldResult = Try(LanguageDetector(text).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = hfstlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
@@ -292,13 +293,3 @@
}

}

-object LanguageDetector extends LazyLogging {
-def apply() = com.cybozu.labs.langdetect.DetectorFactory.create()
-val supportedLanguages = Array("af", "am", "ar", "az", "be", "bg", "bn", "bo", "ca", "cs", "cy", "da", "de", "dv", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "ga", "gn", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "ky", "lb", "lij", "ln", "lt", "lv", "mi", "mk", "ml", "mn", "mr", "mt", "my", "ne", "nl", "no", "os", "pa", "pl", "pnb", "pt", "qu", "ro", "si", "sk", "so", "sq", "sr", "sv", "sw", "ta", "te", "th", "tk", "tl", "tr", "tt", "ug", "uk", "ur", "uz", "vi", "yi", "yo", "zh-cn", "zh-tw")
-try {
-com.cybozu.labs.langdetect.DetectorFactory.loadProfiles(supportedLanguages: _*)
-} catch {
-case e: Exception => logger.warn("Couldn't load language profiles", e)
-}
-}
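Taken together, the new wiring builds one detector from the bundled profiles, wraps each input in a TextObject, and asks for per-language probabilities. A self-contained sketch of that pattern (only the library calls are taken from the diff above; the surrounding object, method and value names are illustrative):

```scala
import com.optimaize.langdetect.LanguageDetectorBuilder
import com.optimaize.langdetect.ngram.NgramExtractors
import com.optimaize.langdetect.profiles.LanguageProfileReader
import com.optimaize.langdetect.text.CommonTextObjectFactories
import scala.collection.JavaConverters._

object DetectSketch {
  // Build the detector once and reuse it, as the new LanguageDetector object above does.
  private val profiles = new LanguageProfileReader().readAllBuiltIn()
  private val detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
    .withProfiles(profiles)
    .build()
  private val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText()

  /** (locale, probability) pairs as reported by the detector. */
  def probabilities(text: String): Seq[(String, Double)] =
    detector.getProbabilities(textObjectFactory.forText(text)).asScala
      .map(d => d.getLocale.toString -> d.getProbability)
      .toSeq

  def main(args: Array[String]): Unit =
    probabilities("Tämä on lyhyt suomenkielinen esimerkkilause.")
      .foreach { case (locale, p) => println(f"$locale%-6s $p%.3f") }
}
```

In getBestLang and identify above, this call is additionally wrapped in Try(...) and the resulting probabilities are averaged with the HFST recognition rates and the LanguageRecognizer score before the best language is picked.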
